1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008, 2009 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * Copyright (c) 2011-2019 Matthew Dillon 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. All advertising materials mentioning features or use of this software 25 * must display the following acknowledgement: 26 * This product includes software developed by the University of 27 * California, Berkeley and its contributors. 28 * 4. Neither the name of the University nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 /* 45 * Manage physical address maps for x86-64 systems. 46 * 47 * Some notes: 48 * - The 'M'odified bit is only applicable to terminal PTEs. 49 * 50 * - The 'U'ser access bit can be set for higher-level PTEs as 51 * long as it isn't set for terminal PTEs for pages we don't 52 * want user access to. 
53 */ 54 55 #if 0 /* JG */ 56 #include "opt_pmap.h" 57 #endif 58 #include "opt_msgbuf.h" 59 60 #include <sys/param.h> 61 #include <sys/kernel.h> 62 #include <sys/proc.h> 63 #include <sys/msgbuf.h> 64 #include <sys/vmmeter.h> 65 #include <sys/mman.h> 66 #include <sys/systm.h> 67 68 #include <vm/vm.h> 69 #include <vm/vm_param.h> 70 #include <sys/sysctl.h> 71 #include <sys/lock.h> 72 #include <vm/vm_kern.h> 73 #include <vm/vm_page.h> 74 #include <vm/vm_map.h> 75 #include <vm/vm_object.h> 76 #include <vm/vm_extern.h> 77 #include <vm/vm_pageout.h> 78 #include <vm/vm_pager.h> 79 #include <vm/vm_zone.h> 80 81 #include <sys/thread2.h> 82 #include <sys/spinlock2.h> 83 #include <vm/vm_page2.h> 84 85 #include <machine/cputypes.h> 86 #include <machine/cpu.h> 87 #include <machine/md_var.h> 88 #include <machine/specialreg.h> 89 #include <machine/smp.h> 90 #include <machine_base/apic/apicreg.h> 91 #include <machine/globaldata.h> 92 #include <machine/pmap.h> 93 #include <machine/pmap_inval.h> 94 95 #include <ddb/ddb.h> 96 97 #define PMAP_KEEP_PDIRS 98 99 #if defined(DIAGNOSTIC) 100 #define PMAP_DIAGNOSTIC 101 #endif 102 103 #define MINPV 2048 104 105 /* 106 * pmap debugging will report who owns a pv lock when blocking. 107 */ 108 #ifdef PMAP_DEBUG 109 110 #define PMAP_DEBUG_DECL ,const char *func, int lineno 111 #define PMAP_DEBUG_ARGS , __func__, __LINE__ 112 #define PMAP_DEBUG_COPY , func, lineno 113 114 #define pv_get(pmap, pindex, pmarkp) _pv_get(pmap, pindex, pmarkp \ 115 PMAP_DEBUG_ARGS) 116 #define pv_lock(pv) _pv_lock(pv \ 117 PMAP_DEBUG_ARGS) 118 #define pv_hold_try(pv) _pv_hold_try(pv \ 119 PMAP_DEBUG_ARGS) 120 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp \ 121 PMAP_DEBUG_ARGS) 122 123 #define pv_free(pv, pvp) _pv_free(pv, pvp PMAP_DEBUG_ARGS) 124 125 #else 126 127 #define PMAP_DEBUG_DECL 128 #define PMAP_DEBUG_ARGS 129 #define PMAP_DEBUG_COPY 130 131 #define pv_get(pmap, pindex, pmarkp) _pv_get(pmap, pindex, pmarkp) 132 #define pv_lock(pv) _pv_lock(pv) 133 #define pv_hold_try(pv) _pv_hold_try(pv) 134 #define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp) 135 #define pv_free(pv, pvp) _pv_free(pv, pvp) 136 137 #endif 138 139 /* 140 * Get PDEs and PTEs for user/kernel address space 141 */ 142 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 143 144 #define pmap_pde_v(pmap, pte) \ 145 ((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0) 146 #define pmap_pte_w(pmap, pte) \ 147 ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0) 148 #define pmap_pte_m(pmap, pte) \ 149 ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0) 150 #define pmap_pte_u(pmap, pte) \ 151 ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0) 152 #define pmap_pte_v(pmap, pte) \ 153 ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0) 154 155 /* 156 * Given a map and a machine independent protection code, 157 * convert to a vax protection code. 
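 *
 * (The 'vax' wording is historical boilerplate carried over from the
 * original BSD pmap code; on x86-64 this presumably resolves to the
 * PG_RW/PG_NX style PTE bits that x86_64_protection_init() stores in
 * the protection_codes[] array below.)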
158 */ 159 #define pte_prot(m, p) \ 160 (m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 161 static uint64_t protection_codes[PROTECTION_CODES_SIZE]; 162 163 #define PMAP_PAGE_BACKING_SCAN(m, match_pmap, ipmap, iptep, ipte, iva) \ 164 if (m->object) { \ 165 vm_object_t iobj = m->object; \ 166 vm_map_backing_t iba, next_ba; \ 167 struct pmap *ipmap; \ 168 pt_entry_t ipte; \ 169 pt_entry_t *iptep; \ 170 vm_offset_t iva; \ 171 vm_pindex_t ipindex_start; \ 172 vm_pindex_t ipindex_end; \ 173 \ 174 lockmgr(&iobj->backing_lk, LK_SHARED); \ 175 next_ba = TAILQ_FIRST(&iobj->backing_list); \ 176 while ((iba = next_ba) != NULL) { \ 177 next_ba = TAILQ_NEXT(iba, entry); \ 178 ipmap = iba->pmap; \ 179 if (match_pmap && ipmap != match_pmap) \ 180 continue; \ 181 ipindex_start = iba->offset >> PAGE_SHIFT; \ 182 ipindex_end = ipindex_start + \ 183 ((iba->end - iba->start) >> PAGE_SHIFT); \ 184 if (m->pindex < ipindex_start || \ 185 m->pindex >= ipindex_end) { \ 186 continue; \ 187 } \ 188 iva = iba->start + \ 189 ((m->pindex - ipindex_start) << PAGE_SHIFT); \ 190 iptep = pmap_pte(ipmap, iva); \ 191 if (iptep == NULL) \ 192 continue; \ 193 ipte = *iptep; \ 194 if (m->phys_addr != (ipte & PG_FRAME)) \ 195 continue; \ 196 197 #define PMAP_PAGE_BACKING_RETRY \ 198 { \ 199 next_ba = iba; \ 200 continue; \ 201 } \ 202 203 #define PMAP_PAGE_BACKING_DONE \ 204 } \ 205 lockmgr(&iobj->backing_lk, LK_RELEASE); \ 206 } \ 207 208 struct pmap kernel_pmap; 209 struct pmap iso_pmap; 210 211 vm_paddr_t avail_start; /* PA of first available physical page */ 212 vm_paddr_t avail_end; /* PA of last available physical page */ 213 vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ 214 vm_offset_t virtual2_end; 215 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */ 216 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 217 vm_offset_t KvaStart; /* VA start of KVA space */ 218 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */ 219 vm_offset_t KvaSize; /* max size of kernel virtual address space */ 220 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ 221 //static int pgeflag; /* PG_G or-in */ 222 uint64_t PatMsr; 223 224 static int ndmpdp; 225 static vm_paddr_t dmaplimit; 226 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 227 228 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE]; /* PAT -> PG_ bits */ 229 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/ /* PAT -> PG_ bits */ 230 231 static uint64_t KPTbase; 232 static uint64_t KPTphys; 233 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 234 static uint64_t KPDbase; /* phys addr of kernel level 2 @ KERNBASE */ 235 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 236 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 237 238 static uint64_t DMPDphys; /* phys addr of direct mapped level 2 */ 239 static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 240 241 /* 242 * Data for the pv entry allocation mechanism 243 */ 244 __read_mostly static vm_zone_t pvzone; 245 __read_mostly static int pmap_pagedaemon_waken = 0; 246 static struct vm_zone pvzone_store; 247 static struct pv_entry *pvinit; 248 249 /* 250 * All those kernel PT submaps that BSD is so fond of 251 */ 252 pt_entry_t *CMAP1 = NULL, *ptmmap; 253 caddr_t CADDR1 = NULL, ptvmmap = NULL; 254 static pt_entry_t *msgbufmap; 255 struct msgbuf *msgbufp=NULL; 256 257 /* 258 * PMAP default PG_* bits. 
Needed to be able to add 259 * EPT/NPT pagetable pmap_bits for the VMM module 260 */ 261 uint64_t pmap_bits_default[] = { 262 REGULAR_PMAP, /* TYPE_IDX 0 */ 263 X86_PG_V, /* PG_V_IDX 1 */ 264 X86_PG_RW, /* PG_RW_IDX 2 */ 265 X86_PG_U, /* PG_U_IDX 3 */ 266 X86_PG_A, /* PG_A_IDX 4 */ 267 X86_PG_M, /* PG_M_IDX 5 */ 268 X86_PG_PS, /* PG_PS_IDX3 6 */ 269 X86_PG_G, /* PG_G_IDX 7 */ 270 X86_PG_AVAIL1, /* PG_AVAIL1_IDX 8 */ 271 X86_PG_AVAIL2, /* PG_AVAIL2_IDX 9 */ 272 X86_PG_AVAIL3, /* PG_AVAIL3_IDX 10 */ 273 X86_PG_NC_PWT | X86_PG_NC_PCD, /* PG_N_IDX 11 */ 274 X86_PG_NX, /* PG_NX_IDX 12 */ 275 }; 276 /* 277 * Crashdump maps. 278 */ 279 static pt_entry_t *pt_crashdumpmap; 280 static caddr_t crashdumpmap; 281 282 static int pmap_debug = 0; 283 SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW, 284 &pmap_debug, 0, "Debug pmap's"); 285 #ifdef PMAP_DEBUG2 286 static int pmap_enter_debug = 0; 287 SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW, 288 &pmap_enter_debug, 0, "Debug pmap_enter's"); 289 #endif 290 static int pmap_yield_count = 64; 291 SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW, 292 &pmap_yield_count, 0, "Yield during init_pt/release"); 293 int pmap_fast_kernel_cpusync = 0; 294 SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW, 295 &pmap_fast_kernel_cpusync, 0, "Share page table pages when possible"); 296 int pmap_dynamic_delete = 0; 297 SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW, 298 &pmap_dynamic_delete, 0, "Dynamically delete PT/PD/PDPs"); 299 int pmap_lock_delay = 100; 300 SYSCTL_INT(_machdep, OID_AUTO, pmap_lock_delay, CTLFLAG_RW, 301 &pmap_lock_delay, 0, "Spin loops"); 302 static int meltdown_mitigation = -1; 303 TUNABLE_INT("machdep.meltdown_mitigation", &meltdown_mitigation); 304 SYSCTL_INT(_machdep, OID_AUTO, meltdown_mitigation, CTLFLAG_RW, 305 &meltdown_mitigation, 0, "Userland pmap isolation"); 306 307 static int pmap_nx_enable = -1; /* -1 = auto */ 308 /* needs manual TUNABLE in early probe, see below */ 309 SYSCTL_INT(_machdep, OID_AUTO, pmap_nx_enable, CTLFLAG_RD, 310 &pmap_nx_enable, 0, 311 "no-execute support (0=disabled, 1=w/READ, 2=w/READ & WRITE)"); 312 313 static int pmap_pv_debug = 50; 314 SYSCTL_INT(_machdep, OID_AUTO, pmap_pv_debug, CTLFLAG_RW, 315 &pmap_pv_debug, 0, ""); 316 317 static long vm_pmap_pv_entries; 318 SYSCTL_LONG(_vm, OID_AUTO, pmap_pv_entries, CTLFLAG_RD, 319 &vm_pmap_pv_entries, 0, ""); 320 321 /* Standard user access funtions */ 322 extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len, 323 size_t *lencopied); 324 extern int std_copyin (const void *udaddr, void *kaddr, size_t len); 325 extern int std_copyout (const void *kaddr, void *udaddr, size_t len); 326 extern int std_fubyte (const uint8_t *base); 327 extern int std_subyte (uint8_t *base, uint8_t byte); 328 extern int32_t std_fuword32 (const uint32_t *base); 329 extern int64_t std_fuword64 (const uint64_t *base); 330 extern int std_suword64 (uint64_t *base, uint64_t word); 331 extern int std_suword32 (uint32_t *base, int word); 332 extern uint32_t std_swapu32 (volatile uint32_t *base, uint32_t v); 333 extern uint64_t std_swapu64 (volatile uint64_t *base, uint64_t v); 334 extern uint32_t std_fuwordadd32 (volatile uint32_t *base, uint32_t v); 335 extern uint64_t std_fuwordadd64 (volatile uint64_t *base, uint64_t v); 336 337 #if 0 338 static void pv_hold(pv_entry_t pv); 339 #endif 340 static int _pv_hold_try(pv_entry_t pv 341 PMAP_DEBUG_DECL); 342 static void pv_drop(pv_entry_t pv); 343 static void 
_pv_lock(pv_entry_t pv 344 PMAP_DEBUG_DECL); 345 static void pv_unlock(pv_entry_t pv); 346 static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew 347 PMAP_DEBUG_DECL); 348 static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp 349 PMAP_DEBUG_DECL); 350 static void _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL); 351 static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, 352 vm_pindex_t **pmarkp, int *errorp); 353 static void pv_put(pv_entry_t pv); 354 static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex); 355 static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, 356 pv_entry_t *pvpp); 357 static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, 358 pmap_inval_bulk_t *bulk, int destroy); 359 static vm_page_t pmap_remove_pv_page(pv_entry_t pv); 360 static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, 361 pmap_inval_bulk_t *bulk); 362 363 struct pmap_scan_info; 364 static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 365 vm_pindex_t *pte_placemark, pv_entry_t pt_pv, 366 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 367 static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 368 vm_pindex_t *pte_placemark, pv_entry_t pt_pv, 369 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 370 371 static void x86_64_protection_init (void); 372 static void create_pagetables(vm_paddr_t *firstaddr); 373 static void pmap_remove_all (vm_page_t m); 374 static boolean_t pmap_testbit (vm_page_t m, int bit); 375 376 static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va); 377 static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 378 379 static void pmap_pinit_defaults(struct pmap *pmap); 380 static void pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark); 381 static void pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark); 382 383 static int 384 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 385 { 386 if (pv1->pv_pindex < pv2->pv_pindex) 387 return(-1); 388 if (pv1->pv_pindex > pv2->pv_pindex) 389 return(1); 390 return(0); 391 } 392 393 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 394 pv_entry_compare, vm_pindex_t, pv_pindex); 395 396 /* 397 * Keep track of pages in the pmap. The procedure is handed 398 * the vm_page->md.pmap_count value prior to an increment or 399 * decrement. 400 * 401 * t_arm - Active real memory 402 * t_avm - Active virtual memory 403 * t_armshr - Active real memory that is also shared 404 * t_avmshr - Active virtual memory that is also shared 405 * 406 * NOTE: At the moment t_avm is effectively just the same as t_arm. 407 */ 408 static __inline 409 void 410 pmap_page_stats_adding(long prev_count) 411 { 412 globaldata_t gd = mycpu; 413 414 if (prev_count == 0) { 415 ++gd->gd_vmtotal.t_arm; 416 ++gd->gd_vmtotal.t_avm; 417 } else if (prev_count == 1) { 418 ++gd->gd_vmtotal.t_armshr; 419 ++gd->gd_vmtotal.t_avmshr; 420 } else { 421 ++gd->gd_vmtotal.t_avmshr; 422 } 423 } 424 425 static __inline 426 void 427 pmap_page_stats_deleting(long prev_count) 428 { 429 globaldata_t gd = mycpu; 430 431 if (prev_count == 1) { 432 --gd->gd_vmtotal.t_arm; 433 --gd->gd_vmtotal.t_avm; 434 } else if (prev_count == 2) { 435 --gd->gd_vmtotal.t_armshr; 436 --gd->gd_vmtotal.t_avmshr; 437 } else { 438 --gd->gd_vmtotal.t_avmshr; 439 } 440 } 441 442 /* 443 * Move the kernel virtual free pointer to the next 444 * 2MB. 
This is used to help improve performance 445 * by using a large (2MB) page for much of the kernel 446 * (.text, .data, .bss) 447 */ 448 static 449 vm_offset_t 450 pmap_kmem_choose(vm_offset_t addr) 451 { 452 vm_offset_t newaddr = addr; 453 454 newaddr = roundup2(addr, NBPDR); 455 return newaddr; 456 } 457 458 /* 459 * Returns the pindex of a page table entry (representing a terminal page). 460 * There are NUPTE_TOTAL page table entries possible (a huge number) 461 * 462 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out. 463 * We want to properly translate negative KVAs. 464 */ 465 static __inline 466 vm_pindex_t 467 pmap_pte_pindex(vm_offset_t va) 468 { 469 return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1)); 470 } 471 472 /* 473 * Returns the pindex of a page table. 474 */ 475 static __inline 476 vm_pindex_t 477 pmap_pt_pindex(vm_offset_t va) 478 { 479 return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1))); 480 } 481 482 /* 483 * Returns the pindex of a page directory. 484 */ 485 static __inline 486 vm_pindex_t 487 pmap_pd_pindex(vm_offset_t va) 488 { 489 return (NUPTE_TOTAL + NUPT_TOTAL + 490 ((va >> PDPSHIFT) & (NUPD_TOTAL - 1))); 491 } 492 493 static __inline 494 vm_pindex_t 495 pmap_pdp_pindex(vm_offset_t va) 496 { 497 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 498 ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1))); 499 } 500 501 static __inline 502 vm_pindex_t 503 pmap_pml4_pindex(void) 504 { 505 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 506 } 507 508 /* 509 * Return various clipped indexes for a given VA 510 * 511 * Returns the index of a pt in a page directory, representing a page 512 * table. 513 */ 514 static __inline 515 vm_pindex_t 516 pmap_pt_index(vm_offset_t va) 517 { 518 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 519 } 520 521 /* 522 * Returns the index of a pd in a page directory page, representing a page 523 * directory. 524 */ 525 static __inline 526 vm_pindex_t 527 pmap_pd_index(vm_offset_t va) 528 { 529 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 530 } 531 532 /* 533 * Returns the index of a pdp in the pml4 table, representing a page 534 * directory page. 535 */ 536 static __inline 537 vm_pindex_t 538 pmap_pdp_index(vm_offset_t va) 539 { 540 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 541 } 542 543 /* 544 * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is 545 * the PT layer. This will speed up core pmap operations considerably. 546 * We also cache the PTE layer to (hopefully) improve relative lookup 547 * speeds. 548 * 549 * NOTE: The pmap spinlock does not need to be held but the passed-in pv 550 * must be in a known associated state (typically by being locked when 551 * the pmap spinlock isn't held). We allow the race for that case. 552 * 553 * NOTE: pm_pvhint* is only accessed (read) with the spin-lock held, using 554 * cpu_ccfence() to prevent compiler optimizations from reloading the 555 * field. 
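 *
 * The pv pindex space is laid out in order of increasing page table
 * depth: [0, NUPTE_TOTAL) covers terminal PTEs, followed by NUPT_TOTAL
 * page table (PT) indices, NUPD_TOTAL page directory (PD) indices,
 * NUPDP_TOTAL PDP indices, and finally the single PML4 pindex.  The
 * comparisons against pmap_pt_pindex(0) and pmap_pd_pindex(0) below
 * rely on this ordering.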
 */
static __inline
void
pv_cache(pmap_t pmap, pv_entry_t pv, vm_pindex_t pindex)
{
	if (pindex < pmap_pt_pindex(0)) {
		;
	} else if (pindex < pmap_pd_pindex(0)) {
		pmap->pm_pvhint_pt = pv;
	}
}

/*
 * Locate the requested pt_entry
 */
static __inline
pv_entry_t
pv_entry_lookup(pmap_t pmap, vm_pindex_t pindex)
{
	pv_entry_t pv;

	if (pindex < pmap_pt_pindex(0))
		return NULL;
#if 1
	if (pindex < pmap_pd_pindex(0))
		pv = pmap->pm_pvhint_pt;
	else
		pv = NULL;
	cpu_ccfence();
	if (pv == NULL || pv->pv_pmap != pmap) {
		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
		if (pv)
			pv_cache(pmap, pv, pindex);
	} else if (pv->pv_pindex != pindex) {
		pv = pv_entry_rb_tree_RB_LOOKUP_REL(&pmap->pm_pvroot,
						    pindex, pv);
		if (pv)
			pv_cache(pmap, pv, pindex);
	}
#else
	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
#endif
	return pv;
}

/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many
 *	of the pv list scans are across different pmaps and it is very
 *	wasteful to do an entire invltlb when checking a single mapping.
 */
static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);

static
pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	return pmap_pte(pmap, va);
}

/*
 * The placemarker hash must be broken up into four zones so lock
 * ordering semantics continue to work (e.g. pte, pt, pd, then pdp).
 *
 * Placemarkers are used to 'lock' page table indices that do not have
 * a pv_entry.  This allows the pmap to support managed and unmanaged
 * pages and shared page tables.
 */
#define PM_PLACE_BASE	(PM_PLACEMARKS >> 2)

static __inline
vm_pindex_t *
pmap_placemarker_hash(pmap_t pmap, vm_pindex_t pindex)
{
	int hi;

	if (pindex < pmap_pt_pindex(0))		/* zone 0 - PTE */
		hi = 0;
	else if (pindex < pmap_pd_pindex(0))	/* zone 1 - PT */
		hi = PM_PLACE_BASE;
	else if (pindex < pmap_pdp_pindex(0))	/* zone 2 - PD */
		hi = PM_PLACE_BASE << 1;
	else					/* zone 3 - PDP (and PML4E) */
		hi = PM_PLACE_BASE | (PM_PLACE_BASE << 1);
	hi += pindex & (PM_PLACE_BASE - 1);

	return (&pmap->pm_placemarks[hi]);
}

/*
 * Generic procedure to index a pte from a pt, pd, or pdp.
 *
 * NOTE: Normally passed pindex as pmap_xx_index().  pmap_xx_pindex() is NOT
 *	 a page table page index but is instead a PV lookup index.
 */
static
void *
pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
	return(&pte[pindex]);
}

/*
 * Return pointer to PDP slot in the PML4
 */
static __inline
pml4_entry_t *
pmap_pdp(pmap_t pmap, vm_offset_t va)
{
	return (&pmap->pm_pml4[pmap_pdp_index(va)]);
}

/*
 * Return pointer to PD slot in the PDP given a pointer to the PDP
 */
static __inline
pdp_entry_t *
pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va)
{
	pdp_entry_t *pd;

	pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME);
	return (&pd[pmap_pd_index(va)]);
}

/*
 * Return pointer to PD slot in the PDP.
689 */ 690 static __inline 691 pdp_entry_t * 692 pmap_pd(pmap_t pmap, vm_offset_t va) 693 { 694 pml4_entry_t *pdp; 695 696 pdp = pmap_pdp(pmap, va); 697 if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0) 698 return NULL; 699 return (pmap_pdp_to_pd(*pdp, va)); 700 } 701 702 /* 703 * Return pointer to PT slot in the PD given a pointer to the PD 704 */ 705 static __inline 706 pd_entry_t * 707 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va) 708 { 709 pd_entry_t *pt; 710 711 pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME); 712 return (&pt[pmap_pt_index(va)]); 713 } 714 715 /* 716 * Return pointer to PT slot in the PD 717 * 718 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs, 719 * so we cannot lookup the PD via the PDP. Instead we 720 * must look it up via the pmap. 721 */ 722 static __inline 723 pd_entry_t * 724 pmap_pt(pmap_t pmap, vm_offset_t va) 725 { 726 pdp_entry_t *pd; 727 pv_entry_t pv; 728 vm_pindex_t pd_pindex; 729 vm_paddr_t phys; 730 731 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 732 pd_pindex = pmap_pd_pindex(va); 733 spin_lock_shared(&pmap->pm_spin); 734 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex); 735 if (pv == NULL || pv->pv_m == NULL) { 736 spin_unlock_shared(&pmap->pm_spin); 737 return NULL; 738 } 739 phys = VM_PAGE_TO_PHYS(pv->pv_m); 740 spin_unlock_shared(&pmap->pm_spin); 741 return (pmap_pd_to_pt(phys, va)); 742 } else { 743 pd = pmap_pd(pmap, va); 744 if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0) 745 return NULL; 746 return (pmap_pd_to_pt(*pd, va)); 747 } 748 } 749 750 /* 751 * Return pointer to PTE slot in the PT given a pointer to the PT 752 */ 753 static __inline 754 pt_entry_t * 755 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va) 756 { 757 pt_entry_t *pte; 758 759 pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME); 760 return (&pte[pmap_pte_index(va)]); 761 } 762 763 /* 764 * Return pointer to PTE slot in the PT 765 */ 766 static __inline 767 pt_entry_t * 768 pmap_pte(pmap_t pmap, vm_offset_t va) 769 { 770 pd_entry_t *pt; 771 772 pt = pmap_pt(pmap, va); 773 if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0) 774 return NULL; 775 if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0) 776 return ((pt_entry_t *)pt); 777 return (pmap_pt_to_pte(*pt, va)); 778 } 779 780 /* 781 * Return address of PT slot in PD (KVM only) 782 * 783 * Cannot be used for user page tables because it might interfere with 784 * the shared page-table-page optimization (pmap_mmu_optimize). 785 */ 786 static __inline 787 pd_entry_t * 788 vtopt(vm_offset_t va) 789 { 790 uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 791 NPML4EPGSHIFT)) - 1); 792 793 return (PDmap + ((va >> PDRSHIFT) & mask)); 794 } 795 796 /* 797 * KVM - return address of PTE slot in PT 798 */ 799 static __inline 800 pt_entry_t * 801 vtopte(vm_offset_t va) 802 { 803 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 804 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 805 806 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 807 } 808 809 /* 810 * Returns the physical address translation from va for a user address. 811 * (vm_paddr_t)-1 is returned on failure. 
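 *
 * Hypothetical usage sketch (caller-side names are illustrative only):
 *
 *	vm_paddr_t pa = uservtophys((vm_offset_t)uptr);
 *	if (pa == (vm_paddr_t)-1)
 *		... not currently mapped, fall back to the fault path ...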
 */
vm_paddr_t
uservtophys(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
	vm_paddr_t pa;
	pt_entry_t pte;
	pmap_t pmap;

	pmap = vmspace_pmap(mycpu->gd_curthread->td_lwp->lwp_vmspace);
	pa = (vm_paddr_t)-1;
	if (va < VM_MAX_USER_ADDRESS) {
		pte = kreadmem64(PTmap + ((va >> PAGE_SHIFT) & mask));
		if (pte & pmap->pmap_bits[PG_V_IDX])
			pa = (pte & PG_FRAME) | (va & PAGE_MASK);
	}
	return pa;
}

static uint64_t
allocpages(vm_paddr_t *firstaddr, long n)
{
	uint64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

static
void
create_pagetables(vm_paddr_t *firstaddr)
{
	long i;		/* must be 64 bits */
	long nkpt_base;
	long nkpt_phys;
	long nkpd_phys;
	int j;

	/*
	 * We are running (mostly) V=P at this point
	 *
	 * Calculate how many 1GB PD entries in our PDP pages are needed
	 * for the DMAP.  This is only allocated if the system does not
	 * support 1GB pages.  Otherwise ndmpdp is simply a count of
	 * the number of 1GB terminal entries needed in our PDP pages.
	 *
	 * NOTE: Maxmem is in pages
	 */
	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	KKASSERT(ndmpdp <= NDMPML4E * NPML4EPG);

	/*
	 * Starting at KERNBASE - map all 2G worth of page table pages.
	 * KERNBASE is offset -2G from the end of kvm.  This will accommodate
	 * all KVM allocations above KERNBASE, including the SYSMAPs below.
	 *
	 * We do this by allocating 2*512 PT pages.  Each PT page can map
	 * 2MB, for 2GB total.
	 */
	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */

	/*
	 * Starting at the beginning of kvm (VM_MIN_KERNEL_ADDRESS),
	 * calculate how many page table pages we need to preallocate
	 * for early vm_map allocations.
	 *
	 * A few extra won't hurt, they will get used up in the running
	 * system.
	 *
	 * vm_page array
	 * initial pventry's
	 */
	nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
	nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
	nkpt_phys += 128;	/* a few extra */

	/*
	 * The highest value nkpd_phys can be set to is
	 * NKPDPE - (NPDPEPG - KPDPI) (i.e. NKPDPE - 2).
	 *
	 * Doing so would cause all PD pages to be pre-populated for
	 * a maximal KVM space (approximately 16*512 pages, or 32MB).
	 * We can save memory by not doing this.
	 */
	nkpd_phys = (nkpt_phys + NPDPEPG - 1) / NPDPEPG;

	/*
	 * Allocate pages
	 *
	 * Normally NKPML4E=1-16 (1-16 kernel PDP pages)
	 * Normally NKPDPE= NKPML4E*512-1 (511 min kernel PD pages)
	 *
	 * Only allocate enough PD pages
	 * NOTE: We allocate all kernel PD pages up-front, typically
	 *	 ~511G of KVM, requiring 511 PD pages.
	 */
	KPTbase = allocpages(firstaddr, nkpt_base);	/* KERNBASE to end */
	KPTphys = allocpages(firstaddr, nkpt_phys);	/* KVA start */
	KPML4phys = allocpages(firstaddr, 1);		/* recursive PML4 map */
	KPDPphys = allocpages(firstaddr, NKPML4E);	/* kernel PDP pages */
	KPDphys = allocpages(firstaddr, nkpd_phys);	/* kernel PD pages */

	/*
	 * Alloc PD pages for the area starting at KERNBASE.
921 */ 922 KPDbase = allocpages(firstaddr, NPDPEPG - KPDPI); 923 924 /* 925 * Stuff for our DMAP 926 */ 927 DMPDPphys = allocpages(firstaddr, NDMPML4E); 928 if ((amd_feature & AMDID_PAGE1GB) == 0) 929 DMPDphys = allocpages(firstaddr, ndmpdp); 930 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 931 932 /* 933 * Fill in the underlying page table pages for the area around 934 * KERNBASE. This remaps low physical memory to KERNBASE. 935 * 936 * Read-only from zero to physfree 937 * XXX not fully used, underneath 2M pages 938 */ 939 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 940 ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT; 941 ((pt_entry_t *)KPTbase)[i] |= 942 pmap_bits_default[PG_RW_IDX] | 943 pmap_bits_default[PG_V_IDX] | 944 pmap_bits_default[PG_G_IDX]; 945 } 946 947 /* 948 * Now map the initial kernel page tables. One block of page 949 * tables is placed at the beginning of kernel virtual memory, 950 * and another block is placed at KERNBASE to map the kernel binary, 951 * data, bss, and initial pre-allocations. 952 */ 953 for (i = 0; i < nkpt_base; i++) { 954 ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT); 955 ((pd_entry_t *)KPDbase)[i] |= 956 pmap_bits_default[PG_RW_IDX] | 957 pmap_bits_default[PG_V_IDX]; 958 } 959 for (i = 0; i < nkpt_phys; i++) { 960 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 961 ((pd_entry_t *)KPDphys)[i] |= 962 pmap_bits_default[PG_RW_IDX] | 963 pmap_bits_default[PG_V_IDX]; 964 } 965 966 /* 967 * Map from zero to end of allocations using 2M pages as an 968 * optimization. This will bypass some of the KPTBase pages 969 * above in the KERNBASE area. 970 */ 971 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 972 ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT; 973 ((pd_entry_t *)KPDbase)[i] |= 974 pmap_bits_default[PG_RW_IDX] | 975 pmap_bits_default[PG_V_IDX] | 976 pmap_bits_default[PG_PS_IDX] | 977 pmap_bits_default[PG_G_IDX]; 978 } 979 980 /* 981 * Load PD addresses into the PDP pages for primary KVA space to 982 * cover existing page tables. PD's for KERNBASE are handled in 983 * the next loop. 984 * 985 * expected to pre-populate all of its PDs. See NKPDPE in vmparam.h. 986 */ 987 for (i = 0; i < nkpd_phys; i++) { 988 ((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] = 989 KPDphys + (i << PAGE_SHIFT); 990 ((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] |= 991 pmap_bits_default[PG_RW_IDX] | 992 pmap_bits_default[PG_V_IDX] | 993 pmap_bits_default[PG_A_IDX]; 994 } 995 996 /* 997 * Load PDs for KERNBASE to the end 998 */ 999 i = (NKPML4E - 1) * NPDPEPG + KPDPI; 1000 for (j = 0; j < NPDPEPG - KPDPI; ++j) { 1001 ((pdp_entry_t *)KPDPphys)[i + j] = 1002 KPDbase + (j << PAGE_SHIFT); 1003 ((pdp_entry_t *)KPDPphys)[i + j] |= 1004 pmap_bits_default[PG_RW_IDX] | 1005 pmap_bits_default[PG_V_IDX] | 1006 pmap_bits_default[PG_A_IDX]; 1007 } 1008 1009 /* 1010 * Now set up the direct map space using either 2MB or 1GB pages 1011 * Preset PG_M and PG_A because demotion expects it. 
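 *
 * For example (illustrative numbers only): with 16GB of physical
 * memory, ndmpdp was computed above as 16 (one 1GB PDP entry, or one
 * PD page worth of 2MB entries, per GB) and dmaplimit is 16GB; the
 * 4GB minimum keeps small machines from undersizing the direct map.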
1012 * 1013 * When filling in entries in the PD pages make sure any excess 1014 * entries are set to zero as we allocated enough PD pages 1015 */ 1016 if ((amd_feature & AMDID_PAGE1GB) == 0) { 1017 /* 1018 * Use 2MB pages 1019 */ 1020 for (i = 0; i < NPDEPG * ndmpdp; i++) { 1021 ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT; 1022 ((pd_entry_t *)DMPDphys)[i] |= 1023 pmap_bits_default[PG_RW_IDX] | 1024 pmap_bits_default[PG_V_IDX] | 1025 pmap_bits_default[PG_PS_IDX] | 1026 pmap_bits_default[PG_G_IDX] | 1027 pmap_bits_default[PG_M_IDX] | 1028 pmap_bits_default[PG_A_IDX]; 1029 } 1030 1031 /* 1032 * And the direct map space's PDP 1033 */ 1034 for (i = 0; i < ndmpdp; i++) { 1035 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 1036 (i << PAGE_SHIFT); 1037 ((pdp_entry_t *)DMPDPphys)[i] |= 1038 pmap_bits_default[PG_RW_IDX] | 1039 pmap_bits_default[PG_V_IDX]; 1040 } 1041 } else { 1042 /* 1043 * 1GB pages 1044 */ 1045 for (i = 0; i < ndmpdp; i++) { 1046 ((pdp_entry_t *)DMPDPphys)[i] = 1047 (vm_paddr_t)i << PDPSHIFT; 1048 ((pdp_entry_t *)DMPDPphys)[i] |= 1049 pmap_bits_default[PG_RW_IDX] | 1050 pmap_bits_default[PG_V_IDX] | 1051 pmap_bits_default[PG_PS_IDX] | 1052 pmap_bits_default[PG_G_IDX] | 1053 pmap_bits_default[PG_M_IDX] | 1054 pmap_bits_default[PG_A_IDX]; 1055 } 1056 } 1057 1058 /* And recursively map PML4 to itself in order to get PTmap */ 1059 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 1060 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= 1061 pmap_bits_default[PG_RW_IDX] | 1062 pmap_bits_default[PG_V_IDX] | 1063 pmap_bits_default[PG_A_IDX]; 1064 1065 /* 1066 * Connect the Direct Map slots up to the PML4 1067 */ 1068 for (j = 0; j < NDMPML4E; ++j) { 1069 ((pdp_entry_t *)KPML4phys)[DMPML4I + j] = 1070 (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 1071 pmap_bits_default[PG_RW_IDX] | 1072 pmap_bits_default[PG_V_IDX] | 1073 pmap_bits_default[PG_A_IDX]; 1074 } 1075 1076 /* 1077 * Connect the KVA slot up to the PML4 1078 */ 1079 for (j = 0; j < NKPML4E; ++j) { 1080 ((pdp_entry_t *)KPML4phys)[KPML4I + j] = 1081 KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT); 1082 ((pdp_entry_t *)KPML4phys)[KPML4I + j] |= 1083 pmap_bits_default[PG_RW_IDX] | 1084 pmap_bits_default[PG_V_IDX] | 1085 pmap_bits_default[PG_A_IDX]; 1086 } 1087 cpu_mfence(); 1088 cpu_invltlb(); 1089 } 1090 1091 /* 1092 * Bootstrap the system enough to run with virtual memory. 1093 * 1094 * On x86_64 this is called after mapping has already been enabled 1095 * and just syncs the pmap module with what has already been done. 1096 * [We can't call it easily with mapping off since the kernel is not 1097 * mapped with PA == VA, hence we would have to relocate every address 1098 * from the linked base (virtual) address "KERNBASE" to the actual 1099 * (physical) address starting relative to 0] 1100 */ 1101 void 1102 pmap_bootstrap(vm_paddr_t *firstaddr) 1103 { 1104 vm_offset_t va; 1105 pt_entry_t *pte; 1106 int i; 1107 1108 KvaStart = VM_MIN_KERNEL_ADDRESS; 1109 KvaEnd = VM_MAX_KERNEL_ADDRESS; 1110 KvaSize = KvaEnd - KvaStart; 1111 1112 avail_start = *firstaddr; 1113 1114 /* 1115 * Create an initial set of page tables to run the kernel in. 
1116 */ 1117 create_pagetables(firstaddr); 1118 1119 virtual2_start = KvaStart; 1120 virtual2_end = PTOV_OFFSET; 1121 1122 virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr; 1123 virtual_start = pmap_kmem_choose(virtual_start); 1124 1125 virtual_end = VM_MAX_KERNEL_ADDRESS; 1126 1127 /* XXX do %cr0 as well */ 1128 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 1129 load_cr3(KPML4phys); 1130 1131 /* 1132 * Initialize protection array. 1133 */ 1134 x86_64_protection_init(); 1135 1136 /* 1137 * The kernel's pmap is statically allocated so we don't have to use 1138 * pmap_create, which is unlikely to work correctly at this part of 1139 * the boot sequence (XXX and which no longer exists). 1140 */ 1141 kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); 1142 kernel_pmap.pm_count = 1; 1143 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 1144 RB_INIT(&kernel_pmap.pm_pvroot); 1145 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 1146 for (i = 0; i < PM_PLACEMARKS; ++i) 1147 kernel_pmap.pm_placemarks[i] = PM_NOPLACEMARK; 1148 1149 /* 1150 * Reserve some special page table entries/VA space for temporary 1151 * mapping of pages. 1152 */ 1153 #define SYSMAP(c, p, v, n) \ 1154 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 1155 1156 va = virtual_start; 1157 pte = vtopte(va); 1158 1159 /* 1160 * CMAP1/CMAP2 are used for zeroing and copying pages. 1161 */ 1162 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 1163 1164 /* 1165 * Crashdump maps. 1166 */ 1167 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 1168 1169 /* 1170 * ptvmmap is used for reading arbitrary physical pages via 1171 * /dev/mem. 1172 */ 1173 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 1174 1175 /* 1176 * msgbufp is used to map the system message buffer. 1177 * XXX msgbufmap is not used. 1178 */ 1179 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1180 atop(round_page(MSGBUF_SIZE))) 1181 1182 virtual_start = va; 1183 virtual_start = pmap_kmem_choose(virtual_start); 1184 1185 *CMAP1 = 0; 1186 1187 /* 1188 * PG_G is terribly broken on SMP because we IPI invltlb's in some 1189 * cases rather then invl1pg. Actually, I don't even know why it 1190 * works under UP because self-referential page table mappings 1191 */ 1192 // pgeflag = 0; 1193 1194 cpu_invltlb(); 1195 1196 /* Initialize the PAT MSR */ 1197 pmap_init_pat(); 1198 pmap_pinit_defaults(&kernel_pmap); 1199 1200 TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync", 1201 &pmap_fast_kernel_cpusync); 1202 1203 } 1204 1205 /* 1206 * Setup the PAT MSR. 1207 */ 1208 void 1209 pmap_init_pat(void) 1210 { 1211 uint64_t pat_msr; 1212 u_long cr0, cr4; 1213 1214 /* 1215 * Default values mapping PATi,PCD,PWT bits at system reset. 1216 * The default values effectively ignore the PATi bit by 1217 * repeating the encodings for 0-3 in 4-7, and map the PCD 1218 * and PWT bit combinations to the expected PAT types. 
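 *
 * Summary of the layout programmed below:
 *
 *	PAT index	default		with CPUID_PAT
 *	    0		WB		WB
 *	    1		WT		WT
 *	    2		UC-		UC-
 *	    3		UC		UC
 *	    4		WB		WB
 *	    5		WT		WP
 *	    6		UC-		WC
 *	    7		UC		UC
 *
 * A PTE selects an index via the PWT (bit 3), PCD (bit 4) and, for
 * 4KB pages, PAT (bit 7) bits; pat_pte_index[] below caches the bit
 * pattern used for each caching mode.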
1219 */ 1220 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | /* 000 */ 1221 PAT_VALUE(1, PAT_WRITE_THROUGH) | /* 001 */ 1222 PAT_VALUE(2, PAT_UNCACHED) | /* 010 */ 1223 PAT_VALUE(3, PAT_UNCACHEABLE) | /* 011 */ 1224 PAT_VALUE(4, PAT_WRITE_BACK) | /* 100 */ 1225 PAT_VALUE(5, PAT_WRITE_THROUGH) | /* 101 */ 1226 PAT_VALUE(6, PAT_UNCACHED) | /* 110 */ 1227 PAT_VALUE(7, PAT_UNCACHEABLE); /* 111 */ 1228 pat_pte_index[PAT_WRITE_BACK] = 0; 1229 pat_pte_index[PAT_WRITE_THROUGH]= 0 | X86_PG_NC_PWT; 1230 pat_pte_index[PAT_UNCACHED] = X86_PG_NC_PCD; 1231 pat_pte_index[PAT_UNCACHEABLE] = X86_PG_NC_PCD | X86_PG_NC_PWT; 1232 pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE]; 1233 pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE]; 1234 1235 if (cpu_feature & CPUID_PAT) { 1236 /* 1237 * If we support the PAT then set-up entries for 1238 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns 1239 * 5 and 6. 1240 */ 1241 pat_msr = (pat_msr & ~PAT_MASK(5)) | 1242 PAT_VALUE(5, PAT_WRITE_PROTECTED); 1243 pat_msr = (pat_msr & ~PAT_MASK(6)) | 1244 PAT_VALUE(6, PAT_WRITE_COMBINING); 1245 pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | X86_PG_NC_PWT; 1246 pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PCD; 1247 1248 /* 1249 * Then enable the PAT 1250 */ 1251 1252 /* Disable PGE. */ 1253 cr4 = rcr4(); 1254 load_cr4(cr4 & ~CR4_PGE); 1255 1256 /* Disable caches (CD = 1, NW = 0). */ 1257 cr0 = rcr0(); 1258 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 1259 1260 /* Flushes caches and TLBs. */ 1261 wbinvd(); 1262 cpu_invltlb(); 1263 1264 /* Update PAT and index table. */ 1265 wrmsr(MSR_PAT, pat_msr); 1266 1267 /* Flush caches and TLBs again. */ 1268 wbinvd(); 1269 cpu_invltlb(); 1270 1271 /* Restore caches and PGE. */ 1272 load_cr0(cr0); 1273 load_cr4(cr4); 1274 PatMsr = pat_msr; 1275 } 1276 } 1277 1278 /* 1279 * Set 4mb pdir for mp startup 1280 */ 1281 void 1282 pmap_set_opt(void) 1283 { 1284 if (cpu_feature & CPUID_PSE) { 1285 load_cr4(rcr4() | CR4_PSE); 1286 if (mycpu->gd_cpuid == 0) /* only on BSP */ 1287 cpu_invltlb(); 1288 } 1289 1290 /* 1291 * Check for SMAP support and enable if available. Must be done 1292 * after cr3 is loaded, and on all cores. 1293 */ 1294 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) { 1295 load_cr4(rcr4() | CR4_SMAP); 1296 } 1297 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) { 1298 load_cr4(rcr4() | CR4_SMEP); 1299 } 1300 } 1301 1302 /* 1303 * Early initialization of the pmap module. 1304 * 1305 * Called by vm_init, to initialize any structures that the pmap 1306 * system needs to map virtual memory. pmap_init has been enhanced to 1307 * support in a fairly consistant way, discontiguous physical memory. 1308 */ 1309 void 1310 pmap_init(void) 1311 { 1312 vm_pindex_t initial_pvs; 1313 vm_pindex_t i; 1314 1315 /* 1316 * Allocate memory for random pmap data structures. Includes the 1317 * pv_head_table. 1318 */ 1319 for (i = 0; i < vm_page_array_size; i++) { 1320 vm_page_t m; 1321 1322 m = &vm_page_array[i]; 1323 m->md.pmap_count = 0; 1324 m->md.writeable_count = 0; 1325 } 1326 1327 /* 1328 * init the pv free list 1329 */ 1330 initial_pvs = vm_page_array_size; 1331 if (initial_pvs < MINPV) 1332 initial_pvs = MINPV; 1333 pvzone = &pvzone_store; 1334 pvinit = (void *)kmem_alloc(&kernel_map, 1335 initial_pvs * sizeof (struct pv_entry), 1336 VM_SUBSYS_PVENTRY); 1337 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), 1338 pvinit, initial_pvs); 1339 1340 /* 1341 * Now it is safe to enable pv_table recording. 
 */
	pmap_initialized = TRUE;
}

/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 *
 * Also create the kernel page table template for isolated user
 * pmaps.
 */
static void pmap_init_iso_range(vm_offset_t base, size_t bytes);
static void pmap_init2_iso_pmap(void);
#if 0
static void dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base);
#endif

void
pmap_init2(void)
{
	vm_pindex_t entry_max;

	/*
	 * We can significantly reduce pv_entry_max from historical
	 * levels because pv_entry's are no longer used for PTEs at the
	 * leaves.  This prevents excessive pcpu caching on many-core
	 * boxes (even with the further '/ 16' done in zinitna()).
	 *
	 * Remember, however, that processes can share physical pages
	 * with each process still needing the pdp/pd/pt infrastructure
	 * (which still use pv_entry's).  And don't just assume that
	 * every PT will be completely filled up.  So don't make it
	 * too small.
	 */
	entry_max = maxproc * 32 + vm_page_array_size / 16;
	TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &entry_max);
	vm_pmap_pv_entries = entry_max;

	/*
	 * Subtract out pages already installed in the zone (hack)
	 */
	if (entry_max <= MINPV)
		entry_max = MINPV;

	zinitna(pvzone, NULL, 0, entry_max, ZONE_INTERRUPT);

	/*
	 * Enable dynamic deletion of empty higher-level page table pages
	 * by default only if system memory is < 8GB (use 7GB for slop).
	 * This can save a little memory, but imposes significant
	 * performance overhead for things like bulk builds, and for programs
	 * which do a lot of memory mapping and memory unmapping.
	 */
#if 0
	if (pmap_dynamic_delete < 0) {
		if (vmstats.v_page_count < 7LL * 1024 * 1024 * 1024 / PAGE_SIZE)
			pmap_dynamic_delete = 1;
		else
			pmap_dynamic_delete = 0;
	}
#endif
	/*
	 * Disable so vm_map_backing iterations do not race
	 */
	pmap_dynamic_delete = 0;

	/*
	 * Automatic detection of Intel meltdown bug requiring user/kernel
	 * mmap isolation.
	 *
	 * Currently there are so many Intel cpu's impacted that it's better
	 * to whitelist future Intel CPUs.  Most? AMD cpus are not impacted
	 * so the default is off for AMD.
	 */
	if (meltdown_mitigation < 0) {
		if (cpu_vendor_id == CPU_VENDOR_INTEL)
			meltdown_mitigation = 1;
		else
			meltdown_mitigation = 0;
	}
	if (meltdown_mitigation) {
		kprintf("machdep.meltdown_mitigation enabled to "
			"protect against (mostly Intel) meltdown bug\n");
		kprintf("system call performance will be impacted\n");
	}

	pmap_init2_iso_pmap();
}

/*
 * Create the isolation pmap template.  Once created, the template
 * is static and its PML4e entries are used to populate the
 * kernel portion of any isolated user pmaps.
 *
 * Our isolation pmap must contain:
 * (1) trampoline area for all cpus
 * (2) common_tss area for all cpus (it's part of the trampoline area now)
 * (3) IDT for all cpus
 * (4) GDT for all cpus
 */
static void
pmap_init2_iso_pmap(void)
{
	int n;

	if (bootverbose)
		kprintf("Initialize isolation pmap\n");

	/*
	 * Try to use our normal API calls to make this easier.
We have 1453 * to scrap the shadowed kernel PDPs pmap_pinit() creates for our 1454 * iso_pmap. 1455 */ 1456 pmap_pinit(&iso_pmap); 1457 bzero(iso_pmap.pm_pml4, PAGE_SIZE); 1458 1459 /* 1460 * Install areas needed by the cpu and trampoline. 1461 */ 1462 for (n = 0; n < ncpus; ++n) { 1463 struct privatespace *ps; 1464 1465 ps = CPU_prvspace[n]; 1466 pmap_init_iso_range((vm_offset_t)&ps->trampoline, 1467 sizeof(ps->trampoline)); 1468 pmap_init_iso_range((vm_offset_t)&ps->dblstack, 1469 sizeof(ps->dblstack)); 1470 pmap_init_iso_range((vm_offset_t)&ps->dbgstack, 1471 sizeof(ps->dbgstack)); 1472 pmap_init_iso_range((vm_offset_t)&ps->common_tss, 1473 sizeof(ps->common_tss)); 1474 pmap_init_iso_range(r_idt_arr[n].rd_base, 1475 r_idt_arr[n].rd_limit + 1); 1476 } 1477 pmap_init_iso_range((register_t)gdt, sizeof(gdt)); 1478 pmap_init_iso_range((vm_offset_t)(int *)btext, 1479 (vm_offset_t)(int *)etext - 1480 (vm_offset_t)(int *)btext); 1481 1482 #if 0 1483 kprintf("Dump iso_pmap:\n"); 1484 dump_pmap(&iso_pmap, vtophys(iso_pmap.pm_pml4), 0, 0); 1485 kprintf("\nDump kernel_pmap:\n"); 1486 dump_pmap(&kernel_pmap, vtophys(kernel_pmap.pm_pml4), 0, 0); 1487 #endif 1488 } 1489 1490 /* 1491 * This adds a kernel virtual address range to the isolation pmap. 1492 */ 1493 static void 1494 pmap_init_iso_range(vm_offset_t base, size_t bytes) 1495 { 1496 pv_entry_t pv; 1497 pv_entry_t pvp; 1498 pt_entry_t *ptep; 1499 pt_entry_t pte; 1500 vm_offset_t va; 1501 1502 if (bootverbose) { 1503 kprintf("isolate %016jx-%016jx (%zd)\n", 1504 base, base + bytes, bytes); 1505 } 1506 va = base & ~(vm_offset_t)PAGE_MASK; 1507 while (va < base + bytes) { 1508 if ((va & PDRMASK) == 0 && va + NBPDR <= base + bytes && 1509 (ptep = pmap_pt(&kernel_pmap, va)) != NULL && 1510 (*ptep & kernel_pmap.pmap_bits[PG_V_IDX]) && 1511 (*ptep & kernel_pmap.pmap_bits[PG_PS_IDX])) { 1512 /* 1513 * Use 2MB pages if possible 1514 */ 1515 pte = *ptep; 1516 pv = pmap_allocpte(&iso_pmap, pmap_pd_pindex(va), &pvp); 1517 ptep = pv_pte_lookup(pv, (va >> PDRSHIFT) & 511); 1518 *ptep = pte; 1519 va += NBPDR; 1520 } else { 1521 /* 1522 * Otherwise use 4KB pages 1523 */ 1524 pv = pmap_allocpte(&iso_pmap, pmap_pt_pindex(va), &pvp); 1525 ptep = pv_pte_lookup(pv, (va >> PAGE_SHIFT) & 511); 1526 *ptep = vtophys(va) | kernel_pmap.pmap_bits[PG_RW_IDX] | 1527 kernel_pmap.pmap_bits[PG_V_IDX] | 1528 kernel_pmap.pmap_bits[PG_A_IDX] | 1529 kernel_pmap.pmap_bits[PG_M_IDX]; 1530 1531 va += PAGE_SIZE; 1532 } 1533 pv_put(pv); 1534 pv_put(pvp); 1535 } 1536 } 1537 1538 #if 0 1539 /* 1540 * Useful debugging pmap dumper, do not remove (#if 0 when not in use) 1541 */ 1542 static 1543 void 1544 dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base) 1545 { 1546 pt_entry_t *ptp; 1547 vm_offset_t incr; 1548 int i; 1549 1550 switch(level) { 1551 case 0: /* PML4e page, 512G entries */ 1552 incr = (1LL << 48) / 512; 1553 break; 1554 case 1: /* PDP page, 1G entries */ 1555 incr = (1LL << 39) / 512; 1556 break; 1557 case 2: /* PD page, 2MB entries */ 1558 incr = (1LL << 30) / 512; 1559 break; 1560 case 3: /* PT page, 4KB entries */ 1561 incr = (1LL << 21) / 512; 1562 break; 1563 default: 1564 incr = 0; 1565 break; 1566 } 1567 1568 if (level == 0) 1569 kprintf("cr3 %016jx @ va=%016jx\n", pte, base); 1570 ptp = (void *)PHYS_TO_DMAP(pte & ~(pt_entry_t)PAGE_MASK); 1571 for (i = 0; i < 512; ++i) { 1572 if (level == 0 && i == 128) 1573 base += 0xFFFF000000000000LLU; 1574 if (ptp[i]) { 1575 kprintf("%*.*s ", level * 4, level * 4, ""); 1576 if (level == 1 && (ptp[i] & 0x180) == 0x180) 
{ 1577 kprintf("va=%016jx %3d term %016jx (1GB)\n", 1578 base, i, ptp[i]); 1579 } else if (level == 2 && (ptp[i] & 0x180) == 0x180) { 1580 kprintf("va=%016jx %3d term %016jx (2MB)\n", 1581 base, i, ptp[i]); 1582 } else if (level == 3) { 1583 kprintf("va=%016jx %3d term %016jx\n", 1584 base, i, ptp[i]); 1585 } else { 1586 kprintf("va=%016jx %3d deep %016jx\n", 1587 base, i, ptp[i]); 1588 dump_pmap(pmap, ptp[i], level + 1, base); 1589 } 1590 } 1591 base += incr; 1592 } 1593 } 1594 1595 #endif 1596 1597 /* 1598 * Typically used to initialize a fictitious page by vm/device_pager.c 1599 */ 1600 void 1601 pmap_page_init(struct vm_page *m) 1602 { 1603 vm_page_init(m); 1604 m->md.pmap_count = 0; 1605 m->md.writeable_count = 0; 1606 } 1607 1608 /*************************************************** 1609 * Low level helper routines..... 1610 ***************************************************/ 1611 1612 /* 1613 * Extract the physical page address associated with the map/VA pair. 1614 * The page must be wired for this to work reliably. 1615 */ 1616 vm_paddr_t 1617 pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep) 1618 { 1619 vm_paddr_t rtval; 1620 pv_entry_t pt_pv; 1621 pt_entry_t *ptep; 1622 1623 rtval = 0; 1624 if (va >= VM_MAX_USER_ADDRESS) { 1625 /* 1626 * Kernel page directories might be direct-mapped and 1627 * there is typically no PV tracking of pte's 1628 */ 1629 pd_entry_t *pt; 1630 1631 pt = pmap_pt(pmap, va); 1632 if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) { 1633 if (*pt & pmap->pmap_bits[PG_PS_IDX]) { 1634 rtval = *pt & PG_PS_FRAME; 1635 rtval |= va & PDRMASK; 1636 } else { 1637 ptep = pmap_pt_to_pte(*pt, va); 1638 if (*pt & pmap->pmap_bits[PG_V_IDX]) { 1639 rtval = *ptep & PG_FRAME; 1640 rtval |= va & PAGE_MASK; 1641 } 1642 } 1643 } 1644 if (handlep) 1645 *handlep = NULL; 1646 } else { 1647 /* 1648 * User pages currently do not direct-map the page directory 1649 * and some pages might not used managed PVs. But all PT's 1650 * will have a PV. 1651 */ 1652 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 1653 if (pt_pv) { 1654 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1655 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 1656 rtval = *ptep & PG_FRAME; 1657 rtval |= va & PAGE_MASK; 1658 } 1659 if (handlep) 1660 *handlep = pt_pv; /* locked until done */ 1661 else 1662 pv_put (pt_pv); 1663 } else if (handlep) { 1664 *handlep = NULL; 1665 } 1666 } 1667 return rtval; 1668 } 1669 1670 void 1671 pmap_extract_done(void *handle) 1672 { 1673 if (handle) 1674 pv_put((pv_entry_t)handle); 1675 } 1676 1677 /* 1678 * Similar to extract but checks protections, SMP-friendly short-cut for 1679 * vm_fault_page[_quick](). Can return NULL to cause the caller to 1680 * fall-through to the real fault code. Does not work with HVM page 1681 * tables. 1682 * 1683 * if busyp is NULL the returned page, if not NULL, is held (and not busied). 1684 * 1685 * If busyp is not NULL and this function sets *busyp non-zero, the returned 1686 * page is busied (and not held). 1687 * 1688 * If busyp is not NULL and this function sets *busyp to zero, the returned 1689 * page is held (and not busied). 1690 * 1691 * If VM_PROT_WRITE is set in prot, and the pte is already writable, the 1692 * returned page will be dirtied. If the pte is not already writable NULL 1693 * is returned. In otherwords, if the bit is set and a vm_page_t is returned, 1694 * any COW will already have happened and that page can be written by the 1695 * caller. 1696 * 1697 * WARNING! 
THE RETURNED PAGE IS ONLY HELD AND NOT SUITABLE FOR READING 1698 * OR WRITING AS-IS. 1699 */ 1700 vm_page_t 1701 pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot, int *busyp) 1702 { 1703 if (pmap && 1704 va < VM_MAX_USER_ADDRESS && 1705 (pmap->pm_flags & PMAP_HVM) == 0) { 1706 pv_entry_t pt_pv; 1707 pv_entry_t pte_pv; 1708 pt_entry_t *ptep; 1709 pt_entry_t req; 1710 vm_page_t m; 1711 int error; 1712 1713 req = pmap->pmap_bits[PG_V_IDX] | 1714 pmap->pmap_bits[PG_U_IDX]; 1715 if (prot & VM_PROT_WRITE) 1716 req |= pmap->pmap_bits[PG_RW_IDX]; 1717 1718 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 1719 if (pt_pv == NULL) 1720 return (NULL); 1721 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1722 if ((*ptep & req) != req) { 1723 pv_put(pt_pv); 1724 return (NULL); 1725 } 1726 pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), NULL, &error); 1727 if (pte_pv && error == 0) { 1728 m = pte_pv->pv_m; 1729 if (prot & VM_PROT_WRITE) { 1730 /* interlocked by presence of pv_entry */ 1731 vm_page_dirty(m); 1732 } 1733 if (busyp) { 1734 if (prot & VM_PROT_WRITE) { 1735 if (vm_page_busy_try(m, TRUE)) 1736 m = NULL; 1737 *busyp = 1; 1738 } else { 1739 vm_page_hold(m); 1740 *busyp = 0; 1741 } 1742 } else { 1743 vm_page_hold(m); 1744 } 1745 pv_put(pte_pv); 1746 } else if (pte_pv) { 1747 pv_drop(pte_pv); 1748 m = NULL; 1749 } else { 1750 /* error, since we didn't request a placemarker */ 1751 m = NULL; 1752 } 1753 pv_put(pt_pv); 1754 return(m); 1755 } else { 1756 return(NULL); 1757 } 1758 } 1759 1760 /* 1761 * Extract the physical page address associated kernel virtual address. 1762 */ 1763 vm_paddr_t 1764 pmap_kextract(vm_offset_t va) 1765 { 1766 pd_entry_t pt; /* pt entry in pd */ 1767 vm_paddr_t pa; 1768 1769 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1770 pa = DMAP_TO_PHYS(va); 1771 } else { 1772 pt = *vtopt(va); 1773 if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) { 1774 pa = (pt & PG_PS_FRAME) | (va & PDRMASK); 1775 } else { 1776 /* 1777 * Beware of a concurrent promotion that changes the 1778 * PDE at this point! For example, vtopte() must not 1779 * be used to access the PTE because it would use the 1780 * new PDE. It is, however, safe to use the old PDE 1781 * because the page table page is preserved by the 1782 * promotion. 1783 */ 1784 pa = *pmap_pt_to_pte(pt, va); 1785 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1786 } 1787 } 1788 return pa; 1789 } 1790 1791 /*************************************************** 1792 * Low level mapping routines..... 1793 ***************************************************/ 1794 1795 /* 1796 * Routine: pmap_kenter 1797 * Function: 1798 * Add a wired page to the KVA 1799 * NOTE! note that in order for the mapping to take effect -- you 1800 * should do an invltlb after doing the pmap_kenter(). 1801 */ 1802 void 1803 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1804 { 1805 pt_entry_t *ptep; 1806 pt_entry_t npte; 1807 1808 npte = pa | 1809 kernel_pmap.pmap_bits[PG_RW_IDX] | 1810 kernel_pmap.pmap_bits[PG_V_IDX]; 1811 // pgeflag; 1812 ptep = vtopte(va); 1813 #if 1 1814 pmap_inval_smp(&kernel_pmap, va, 1, ptep, npte); 1815 #else 1816 /* FUTURE */ 1817 if (*ptep) 1818 pmap_inval_smp(&kernel_pmap, va, ptep, npte); 1819 else 1820 *ptep = npte; 1821 #endif 1822 } 1823 1824 /* 1825 * Similar to pmap_kenter(), except we only invalidate the mapping on the 1826 * current CPU. Returns 0 if the previous pte was 0, 1 if it wasn't 1827 * (caller can conditionalize calling smp_invltlb()). 
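 *
 * Sketch of the intended caller pattern (illustrative only; note that
 * the current #if 1 path below always returns 1):
 *
 *	if (pmap_kenter_quick(va, pa))
 *		smp_invltlb();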
1828 */ 1829 int 1830 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 1831 { 1832 pt_entry_t *ptep; 1833 pt_entry_t npte; 1834 int res; 1835 1836 npte = pa | kernel_pmap.pmap_bits[PG_RW_IDX] | 1837 kernel_pmap.pmap_bits[PG_V_IDX]; 1838 // npte |= pgeflag; 1839 ptep = vtopte(va); 1840 #if 1 1841 res = 1; 1842 #else 1843 /* FUTURE */ 1844 res = (*ptep != 0); 1845 #endif 1846 atomic_swap_long(ptep, npte); 1847 cpu_invlpg((void *)va); 1848 1849 return res; 1850 } 1851 1852 /* 1853 * Enter addresses into the kernel pmap but don't bother 1854 * doing any tlb invalidations. Caller will do a rollup 1855 * invalidation via pmap_rollup_inval(). 1856 */ 1857 int 1858 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 1859 { 1860 pt_entry_t *ptep; 1861 pt_entry_t npte; 1862 int res; 1863 1864 npte = pa | 1865 kernel_pmap.pmap_bits[PG_RW_IDX] | 1866 kernel_pmap.pmap_bits[PG_V_IDX]; 1867 // pgeflag; 1868 ptep = vtopte(va); 1869 #if 1 1870 res = 1; 1871 #else 1872 /* FUTURE */ 1873 res = (*ptep != 0); 1874 #endif 1875 atomic_swap_long(ptep, npte); 1876 cpu_invlpg((void *)va); 1877 1878 return res; 1879 } 1880 1881 /* 1882 * remove a page from the kernel pagetables 1883 */ 1884 void 1885 pmap_kremove(vm_offset_t va) 1886 { 1887 pt_entry_t *ptep; 1888 1889 ptep = vtopte(va); 1890 pmap_inval_smp(&kernel_pmap, va, 1, ptep, 0); 1891 } 1892 1893 void 1894 pmap_kremove_quick(vm_offset_t va) 1895 { 1896 pt_entry_t *ptep; 1897 1898 ptep = vtopte(va); 1899 (void)pte_load_clear(ptep); 1900 cpu_invlpg((void *)va); 1901 } 1902 1903 /* 1904 * Remove addresses from the kernel pmap but don't bother 1905 * doing any tlb invalidations. Caller will do a rollup 1906 * invalidation via pmap_rollup_inval(). 1907 */ 1908 void 1909 pmap_kremove_noinval(vm_offset_t va) 1910 { 1911 pt_entry_t *ptep; 1912 1913 ptep = vtopte(va); 1914 (void)pte_load_clear(ptep); 1915 } 1916 1917 /* 1918 * XXX these need to be recoded. They are not used in any critical path. 1919 */ 1920 void 1921 pmap_kmodify_rw(vm_offset_t va) 1922 { 1923 atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]); 1924 cpu_invlpg((void *)va); 1925 } 1926 1927 /* NOT USED 1928 void 1929 pmap_kmodify_nc(vm_offset_t va) 1930 { 1931 atomic_set_long(vtopte(va), PG_N); 1932 cpu_invlpg((void *)va); 1933 } 1934 */ 1935 1936 /* 1937 * Used to map a range of physical addresses into kernel virtual 1938 * address space during the low level boot, typically to map the 1939 * dump bitmap, message buffer, and vm_page_array. 1940 * 1941 * These mappings are typically made at some pointer after the end of the 1942 * kernel text+data. 1943 * 1944 * We could return PHYS_TO_DMAP(start) here and not allocate any 1945 * via (*virtp), but then kmem from userland and kernel dumps won't 1946 * have access to the related pointers. 1947 */ 1948 vm_offset_t 1949 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 1950 { 1951 vm_offset_t va; 1952 vm_offset_t va_start; 1953 1954 /*return PHYS_TO_DMAP(start);*/ 1955 1956 va_start = *virtp; 1957 va = va_start; 1958 1959 while (start < end) { 1960 pmap_kenter_quick(va, start); 1961 va += PAGE_SIZE; 1962 start += PAGE_SIZE; 1963 } 1964 *virtp = va; 1965 return va_start; 1966 } 1967 1968 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1969 1970 /* 1971 * Remove the specified set of pages from the data and instruction caches. 
1972 * 1973 * In contrast to pmap_invalidate_cache_range(), this function does not 1974 * rely on the CPU's self-snoop feature, because it is intended for use 1975 * when moving pages into a different cache domain. 1976 */ 1977 void 1978 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1979 { 1980 vm_offset_t daddr, eva; 1981 int i; 1982 1983 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1984 (cpu_feature & CPUID_CLFSH) == 0) 1985 wbinvd(); 1986 else { 1987 cpu_mfence(); 1988 for (i = 0; i < count; i++) { 1989 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1990 eva = daddr + PAGE_SIZE; 1991 for (; daddr < eva; daddr += cpu_clflush_line_size) 1992 clflush(daddr); 1993 } 1994 cpu_mfence(); 1995 } 1996 } 1997 1998 void 1999 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 2000 { 2001 KASSERT((sva & PAGE_MASK) == 0, 2002 ("pmap_invalidate_cache_range: sva not page-aligned")); 2003 KASSERT((eva & PAGE_MASK) == 0, 2004 ("pmap_invalidate_cache_range: eva not page-aligned")); 2005 2006 if (cpu_feature & CPUID_SS) { 2007 ; /* If "Self Snoop" is supported, do nothing. */ 2008 } else { 2009 /* Globally invalidate caches */ 2010 cpu_wbinvd_on_all_cpus(); 2011 } 2012 } 2013 2014 /* 2015 * Invalidate the specified range of virtual memory on all cpus associated 2016 * with the pmap. 2017 */ 2018 void 2019 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2020 { 2021 pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0); 2022 } 2023 2024 /* 2025 * Add a list of wired pages to the kva. This routine is used for temporary 2026 * kernel mappings such as those found in buffer cache buffer. Page 2027 * modifications and accesses are not tracked or recorded. 2028 * 2029 * NOTE! Old mappings are simply overwritten, and we cannot assume relaxed 2030 * semantics as previous mappings may have been zerod without any 2031 * invalidation. 2032 * 2033 * The page *must* be wired. 2034 */ 2035 static __inline void 2036 _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval) 2037 { 2038 vm_offset_t end_va; 2039 vm_offset_t va; 2040 2041 end_va = beg_va + count * PAGE_SIZE; 2042 2043 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2044 pt_entry_t pte; 2045 pt_entry_t *ptep; 2046 2047 ptep = vtopte(va); 2048 pte = VM_PAGE_TO_PHYS(*m) | 2049 kernel_pmap.pmap_bits[PG_RW_IDX] | 2050 kernel_pmap.pmap_bits[PG_V_IDX] | 2051 kernel_pmap.pmap_cache_bits[(*m)->pat_mode]; 2052 // pgeflag; 2053 atomic_swap_long(ptep, pte); 2054 m++; 2055 } 2056 if (doinval) 2057 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 2058 } 2059 2060 void 2061 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 2062 { 2063 _pmap_qenter(beg_va, m, count, 1); 2064 } 2065 2066 void 2067 pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count) 2068 { 2069 _pmap_qenter(beg_va, m, count, 0); 2070 } 2071 2072 /* 2073 * This routine jerks page mappings from the kernel -- it is meant only 2074 * for temporary mappings such as those found in buffer cache buffers. 2075 * No recording modified or access status occurs. 
2076 * 2077 * MPSAFE, INTERRUPT SAFE (cluster callback) 2078 */ 2079 void 2080 pmap_qremove(vm_offset_t beg_va, int count) 2081 { 2082 vm_offset_t end_va; 2083 vm_offset_t va; 2084 2085 end_va = beg_va + count * PAGE_SIZE; 2086 2087 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2088 pt_entry_t *pte; 2089 2090 pte = vtopte(va); 2091 (void)pte_load_clear(pte); 2092 cpu_invlpg((void *)va); 2093 } 2094 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 2095 } 2096 2097 /* 2098 * This routine removes temporary kernel mappings, only invalidating them 2099 * on the current cpu. It should only be used under carefully controlled 2100 * conditions. 2101 */ 2102 void 2103 pmap_qremove_quick(vm_offset_t beg_va, int count) 2104 { 2105 vm_offset_t end_va; 2106 vm_offset_t va; 2107 2108 end_va = beg_va + count * PAGE_SIZE; 2109 2110 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2111 pt_entry_t *pte; 2112 2113 pte = vtopte(va); 2114 (void)pte_load_clear(pte); 2115 cpu_invlpg((void *)va); 2116 } 2117 } 2118 2119 /* 2120 * This routine removes temporary kernel mappings *without* invalidating 2121 * the TLB. It can only be used on permanent kva reservations such as those 2122 * found in buffer cache buffers, under carefully controlled circumstances. 2123 * 2124 * NOTE: Repopulating these KVAs requires unconditional invalidation. 2125 * (pmap_qenter() does unconditional invalidation). 2126 */ 2127 void 2128 pmap_qremove_noinval(vm_offset_t beg_va, int count) 2129 { 2130 vm_offset_t end_va; 2131 vm_offset_t va; 2132 2133 end_va = beg_va + count * PAGE_SIZE; 2134 2135 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2136 pt_entry_t *pte; 2137 2138 pte = vtopte(va); 2139 (void)pte_load_clear(pte); 2140 } 2141 } 2142 2143 /* 2144 * Create a new thread and optionally associate it with a (new) process. 2145 * NOTE! the new thread's cpu may not equal the current cpu. 2146 */ 2147 void 2148 pmap_init_thread(thread_t td) 2149 { 2150 /* enforce pcb placement & alignment */ 2151 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 2152 td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF); 2153 td->td_savefpu = &td->td_pcb->pcb_save; 2154 td->td_sp = (char *)td->td_pcb; /* no -16 */ 2155 } 2156 2157 /* 2158 * This routine directly affects the fork perf for a process. 2159 */ 2160 void 2161 pmap_init_proc(struct proc *p) 2162 { 2163 } 2164 2165 static void 2166 pmap_pinit_defaults(struct pmap *pmap) 2167 { 2168 bcopy(pmap_bits_default, pmap->pmap_bits, 2169 sizeof(pmap_bits_default)); 2170 bcopy(protection_codes, pmap->protection_codes, 2171 sizeof(protection_codes)); 2172 bcopy(pat_pte_index, pmap->pmap_cache_bits, 2173 sizeof(pat_pte_index)); 2174 pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT; 2175 pmap->copyinstr = std_copyinstr; 2176 pmap->copyin = std_copyin; 2177 pmap->copyout = std_copyout; 2178 pmap->fubyte = std_fubyte; 2179 pmap->subyte = std_subyte; 2180 pmap->fuword32 = std_fuword32; 2181 pmap->fuword64 = std_fuword64; 2182 pmap->suword32 = std_suword32; 2183 pmap->suword64 = std_suword64; 2184 pmap->swapu32 = std_swapu32; 2185 pmap->swapu64 = std_swapu64; 2186 pmap->fuwordadd32 = std_fuwordadd32; 2187 pmap->fuwordadd64 = std_fuwordadd64; 2188 } 2189 /* 2190 * Initialize pmap0/vmspace0. 2191 * 2192 * On architectures where the kernel pmap is not integrated into the user 2193 * process pmap, this pmap represents the process pmap, not the kernel pmap. 2194 * kernel_pmap should be used to directly access the kernel_pmap. 
2195 */ 2196 void 2197 pmap_pinit0(struct pmap *pmap) 2198 { 2199 int i; 2200 2201 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 2202 pmap->pm_count = 1; 2203 CPUMASK_ASSZERO(pmap->pm_active); 2204 pmap->pm_pvhint_pt = NULL; 2205 pmap->pm_pvhint_unused = NULL; 2206 RB_INIT(&pmap->pm_pvroot); 2207 spin_init(&pmap->pm_spin, "pmapinit0"); 2208 for (i = 0; i < PM_PLACEMARKS; ++i) 2209 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 2210 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2211 pmap_pinit_defaults(pmap); 2212 } 2213 2214 /* 2215 * Initialize a preallocated and zeroed pmap structure, 2216 * such as one in a vmspace structure. 2217 */ 2218 static void 2219 pmap_pinit_simple(struct pmap *pmap) 2220 { 2221 int i; 2222 2223 /* 2224 * Misc initialization 2225 */ 2226 pmap->pm_count = 1; 2227 CPUMASK_ASSZERO(pmap->pm_active); 2228 pmap->pm_pvhint_pt = NULL; 2229 pmap->pm_pvhint_unused = NULL; 2230 pmap->pm_flags = PMAP_FLAG_SIMPLE; 2231 2232 pmap_pinit_defaults(pmap); 2233 2234 /* 2235 * Don't blow up locks/tokens on re-use (XXX fix/use drop code 2236 * for this). 2237 */ 2238 if (pmap->pm_pmlpv == NULL) { 2239 RB_INIT(&pmap->pm_pvroot); 2240 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2241 spin_init(&pmap->pm_spin, "pmapinitsimple"); 2242 for (i = 0; i < PM_PLACEMARKS; ++i) 2243 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 2244 } 2245 } 2246 2247 void 2248 pmap_pinit(struct pmap *pmap) 2249 { 2250 pv_entry_t pv; 2251 int j; 2252 2253 if (pmap->pm_pmlpv) { 2254 if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) { 2255 pmap_puninit(pmap); 2256 } 2257 } 2258 2259 pmap_pinit_simple(pmap); 2260 pmap->pm_flags &= ~PMAP_FLAG_SIMPLE; 2261 2262 /* 2263 * No need to allocate page table space yet but we do need a valid 2264 * page directory table. 2265 */ 2266 if (pmap->pm_pml4 == NULL) { 2267 pmap->pm_pml4 = 2268 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, 2269 PAGE_SIZE * 2, 2270 VM_SUBSYS_PML4); 2271 pmap->pm_pml4_iso = (void *)((char *)pmap->pm_pml4 + PAGE_SIZE); 2272 } 2273 2274 /* 2275 * Allocate the PML4e table, which wires it even though it isn't 2276 * being entered into some higher level page table (it being the 2277 * highest level). If one is already cached we don't have to do 2278 * anything. 2279 */ 2280 if ((pv = pmap->pm_pmlpv) == NULL) { 2281 pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2282 pmap->pm_pmlpv = pv; 2283 pmap_kenter((vm_offset_t)pmap->pm_pml4, 2284 VM_PAGE_TO_PHYS(pv->pv_m)); 2285 pv_put(pv); 2286 2287 /* 2288 * Install DMAP and KMAP. 2289 */ 2290 for (j = 0; j < NDMPML4E; ++j) { 2291 pmap->pm_pml4[DMPML4I + j] = 2292 (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 2293 pmap->pmap_bits[PG_RW_IDX] | 2294 pmap->pmap_bits[PG_V_IDX] | 2295 pmap->pmap_bits[PG_A_IDX]; 2296 } 2297 for (j = 0; j < NKPML4E; ++j) { 2298 pmap->pm_pml4[KPML4I + j] = 2299 (KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 2300 pmap->pmap_bits[PG_RW_IDX] | 2301 pmap->pmap_bits[PG_V_IDX] | 2302 pmap->pmap_bits[PG_A_IDX]; 2303 } 2304 2305 /* 2306 * install self-referential address mapping entry 2307 */ 2308 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) | 2309 pmap->pmap_bits[PG_V_IDX] | 2310 pmap->pmap_bits[PG_RW_IDX] | 2311 pmap->pmap_bits[PG_A_IDX]; 2312 } else { 2313 KKASSERT(pv->pv_m->flags & PG_MAPPED); 2314 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 2315 } 2316 KKASSERT(pmap->pm_pml4[255] == 0); 2317 2318 /* 2319 * When implementing an isolated userland pmap, a second PML4e table 2320 * is needed. 
We use pmap_pml4_pindex() + 1 for convenience, but 2321 * note that we do not operate on this table using our API functions 2322 * so handling of the + 1 case is mostly just to prevent implosions. 2323 * 2324 * We install an isolated version of the kernel PDPs into this 2325 * second PML4e table. The pmap code will mirror all user PDPs 2326 * between the primary and secondary PML4e table. 2327 */ 2328 if ((pv = pmap->pm_pmlpv_iso) == NULL && meltdown_mitigation && 2329 pmap != &iso_pmap) { 2330 pv = pmap_allocpte(pmap, pmap_pml4_pindex() + 1, NULL); 2331 pmap->pm_pmlpv_iso = pv; 2332 pmap_kenter((vm_offset_t)pmap->pm_pml4_iso, 2333 VM_PAGE_TO_PHYS(pv->pv_m)); 2334 pv_put(pv); 2335 2336 /* 2337 * Install an isolated version of the kernel pmap for 2338 * user consumption, using PDPs constructed in iso_pmap. 2339 */ 2340 for (j = 0; j < NKPML4E; ++j) { 2341 pmap->pm_pml4_iso[KPML4I + j] = 2342 iso_pmap.pm_pml4[KPML4I + j]; 2343 } 2344 } else if (pv) { 2345 KKASSERT(pv->pv_m->flags & PG_MAPPED); 2346 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 2347 } 2348 } 2349 2350 /* 2351 * Clean up a pmap structure so it can be physically freed. This routine 2352 * is called by the vmspace dtor function. A great deal of pmap data is 2353 * left passively mapped to improve vmspace management so we have a bit 2354 * of cleanup work to do here. 2355 */ 2356 void 2357 pmap_puninit(pmap_t pmap) 2358 { 2359 pv_entry_t pv; 2360 vm_page_t p; 2361 2362 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 2363 if ((pv = pmap->pm_pmlpv) != NULL) { 2364 if (pv_hold_try(pv) == 0) 2365 pv_lock(pv); 2366 KKASSERT(pv == pmap->pm_pmlpv); 2367 p = pmap_remove_pv_page(pv); 2368 pv_free(pv, NULL); 2369 pv = NULL; /* safety */ 2370 pmap_kremove((vm_offset_t)pmap->pm_pml4); 2371 vm_page_busy_wait(p, FALSE, "pgpun"); 2372 KKASSERT(p->flags & PG_UNQUEUED); 2373 vm_page_unwire(p, 0); 2374 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2375 vm_page_free(p); 2376 pmap->pm_pmlpv = NULL; 2377 } 2378 if ((pv = pmap->pm_pmlpv_iso) != NULL) { 2379 if (pv_hold_try(pv) == 0) 2380 pv_lock(pv); 2381 KKASSERT(pv == pmap->pm_pmlpv_iso); 2382 p = pmap_remove_pv_page(pv); 2383 pv_free(pv, NULL); 2384 pv = NULL; /* safety */ 2385 pmap_kremove((vm_offset_t)pmap->pm_pml4_iso); 2386 vm_page_busy_wait(p, FALSE, "pgpun"); 2387 KKASSERT(p->flags & PG_UNQUEUED); 2388 vm_page_unwire(p, 0); 2389 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2390 vm_page_free(p); 2391 pmap->pm_pmlpv_iso = NULL; 2392 } 2393 if (pmap->pm_pml4) { 2394 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 2395 kmem_free(&kernel_map, 2396 (vm_offset_t)pmap->pm_pml4, PAGE_SIZE * 2); 2397 pmap->pm_pml4 = NULL; 2398 pmap->pm_pml4_iso = NULL; 2399 } 2400 KKASSERT(pmap->pm_stats.resident_count == 0); 2401 KKASSERT(pmap->pm_stats.wired_count == 0); 2402 } 2403 2404 /* 2405 * This function is now unused (used to add the pmap to the pmap_list) 2406 */ 2407 void 2408 pmap_pinit2(struct pmap *pmap) 2409 { 2410 } 2411 2412 /* 2413 * This routine is called when various levels in the page table need to 2414 * be populated. This routine cannot fail. 2415 * 2416 * This function returns two locked pv_entry's, one representing the 2417 * requested pv and one representing the requested pv's parent pv. If 2418 * an intermediate page table does not exist it will be created, mapped, 2419 * wired, and the parent page table will be given an additional hold 2420 * count representing the presence of the child pv_entry. 
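 *
 * The pv pindex space is laid out linearly by page table level: indices
 * below pmap_pt_pindex(0) name terminal PTEs (used only for
 * placemarkers these days), followed by the PT pages starting at
 * NUPTE_TOTAL, the PD pages at NUPTE_TOTAL + NUPT_TOTAL, the PDP pages
 * at NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL, and finally the PML4 at
 * pmap_pml4_pindex().  The arithmetic below converts a pv's pindex
 * into the pindex of its parent one level up.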
2421 */ 2422 static 2423 pv_entry_t 2424 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) 2425 { 2426 pt_entry_t *ptep; 2427 pt_entry_t *ptep_iso; 2428 pv_entry_t pv; 2429 pv_entry_t pvp; 2430 pt_entry_t v; 2431 vm_page_t m; 2432 int isnew; 2433 int ispt; 2434 2435 /* 2436 * If the pv already exists and we aren't being asked for the 2437 * parent page table page we can just return it. A locked+held pv 2438 * is returned. The pv will also have a second hold related to the 2439 * pmap association that we don't have to worry about. 2440 */ 2441 ispt = 0; 2442 pv = pv_alloc(pmap, ptepindex, &isnew); 2443 if (isnew == 0 && pvpp == NULL) 2444 return(pv); 2445 2446 /* 2447 * DragonFly doesn't use PV's to represent terminal PTEs any more. 2448 * The index range is still used for placemarkers, but not for 2449 * actual pv_entry's. 2450 */ 2451 KKASSERT(ptepindex >= pmap_pt_pindex(0)); 2452 2453 /* 2454 * Note that pt_pv's are only returned for user VAs. We assert that 2455 * a pt_pv is not being requested for kernel VAs. The kernel 2456 * pre-wires all higher-level page tables so don't overload managed 2457 * higher-level page tables on top of it! 2458 * 2459 * However, its convenient for us to allow the case when creating 2460 * iso_pmap. This is a bit of a hack but it simplifies iso_pmap 2461 * a lot. 2462 */ 2463 2464 /* 2465 * The kernel never uses managed PT/PD/PDP pages. 2466 */ 2467 KKASSERT(pmap != &kernel_pmap); 2468 2469 /* 2470 * Non-terminal PVs allocate a VM page to represent the page table, 2471 * so we have to resolve pvp and calculate ptepindex for the pvp 2472 * and then for the page table entry index in the pvp for 2473 * fall-through. 2474 */ 2475 if (ptepindex < pmap_pd_pindex(0)) { 2476 /* 2477 * pv is PT, pvp is PD 2478 */ 2479 ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT; 2480 ptepindex += NUPTE_TOTAL + NUPT_TOTAL; 2481 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2482 2483 /* 2484 * PT index in PD 2485 */ 2486 ptepindex = pv->pv_pindex - pmap_pt_pindex(0); 2487 ptepindex &= ((1ul << NPDEPGSHIFT) - 1); 2488 ispt = 1; 2489 } else if (ptepindex < pmap_pdp_pindex(0)) { 2490 /* 2491 * pv is PD, pvp is PDP 2492 * 2493 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above 2494 * the PD. 2495 */ 2496 ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; 2497 ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2498 2499 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 2500 KKASSERT(pvpp == NULL); 2501 pvp = NULL; 2502 } else { 2503 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2504 } 2505 2506 /* 2507 * PD index in PDP 2508 */ 2509 ptepindex = pv->pv_pindex - pmap_pd_pindex(0); 2510 ptepindex &= ((1ul << NPDPEPGSHIFT) - 1); 2511 } else if (ptepindex < pmap_pml4_pindex()) { 2512 /* 2513 * pv is PDP, pvp is the root pml4 table 2514 */ 2515 pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2516 2517 /* 2518 * PDP index in PML4 2519 */ 2520 ptepindex = pv->pv_pindex - pmap_pdp_pindex(0); 2521 ptepindex &= ((1ul << NPML4EPGSHIFT) - 1); 2522 } else { 2523 /* 2524 * pv represents the top-level PML4, there is no parent. 2525 */ 2526 pvp = NULL; 2527 } 2528 2529 if (isnew == 0) 2530 goto notnew; 2531 2532 /* 2533 * (isnew) is TRUE, pv is not terminal. 2534 * 2535 * (1) Add a wire count to the parent page table (pvp). 2536 * (2) Allocate a VM page for the page table. 2537 * (3) Enter the VM page into the parent page table. 2538 * 2539 * page table pages are marked PG_WRITEABLE and PG_MAPPED. 
2540 */ 2541 if (pvp) 2542 vm_page_wire_quick(pvp->pv_m); 2543 2544 for (;;) { 2545 m = vm_page_alloc(NULL, pv->pv_pindex, 2546 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 2547 VM_ALLOC_INTERRUPT); 2548 if (m) 2549 break; 2550 vm_wait(0); 2551 } 2552 vm_page_wire(m); /* wire for mapping in parent */ 2553 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 2554 m->valid = VM_PAGE_BITS_ALL; 2555 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE | PG_UNQUEUED); 2556 KKASSERT(m->queue == PQ_NONE); 2557 2558 pv->pv_m = m; 2559 2560 /* 2561 * (isnew) is TRUE, pv is not terminal. 2562 * 2563 * Wire the page into pvp. Bump the resident_count for the pmap. 2564 * There is no pvp for the top level, address the pm_pml4[] array 2565 * directly. 2566 * 2567 * If the caller wants the parent we return it, otherwise 2568 * we just put it away. 2569 * 2570 * No interlock is needed for pte 0 -> non-zero. 2571 * 2572 * In the situation where *ptep is valid we might have an unmanaged 2573 * page table page shared from another page table which we need to 2574 * unshare before installing our private page table page. 2575 */ 2576 if (pvp) { 2577 v = VM_PAGE_TO_PHYS(m) | 2578 (pmap->pmap_bits[PG_RW_IDX] | 2579 pmap->pmap_bits[PG_V_IDX] | 2580 pmap->pmap_bits[PG_A_IDX]); 2581 if (ptepindex < NUPTE_USER) 2582 v |= pmap->pmap_bits[PG_U_IDX]; 2583 if (ptepindex < pmap_pt_pindex(0)) 2584 v |= pmap->pmap_bits[PG_M_IDX]; 2585 2586 ptep = pv_pte_lookup(pvp, ptepindex); 2587 if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) 2588 ptep_iso = pv_pte_lookup(pmap->pm_pmlpv_iso, ptepindex); 2589 else 2590 ptep_iso = NULL; 2591 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 2592 panic("pmap_allocpte: ptpte present without pv_entry!"); 2593 } else { 2594 pt_entry_t pte; 2595 2596 pte = atomic_swap_long(ptep, v); 2597 if (ptep_iso) 2598 atomic_swap_long(ptep_iso, v); 2599 if (pte != 0) { 2600 kprintf("install pgtbl mixup 0x%016jx " 2601 "old/new 0x%016jx/0x%016jx\n", 2602 (intmax_t)ptepindex, pte, v); 2603 } 2604 } 2605 } 2606 vm_page_wakeup(m); 2607 2608 /* 2609 * (isnew) may be TRUE or FALSE, pv may or may not be terminal. 2610 */ 2611 notnew: 2612 if (pvp) { 2613 KKASSERT(pvp->pv_m != NULL); 2614 ptep = pv_pte_lookup(pvp, ptepindex); 2615 v = VM_PAGE_TO_PHYS(pv->pv_m) | 2616 (pmap->pmap_bits[PG_RW_IDX] | 2617 pmap->pmap_bits[PG_V_IDX] | 2618 pmap->pmap_bits[PG_A_IDX]); 2619 if (ptepindex < NUPTE_USER) 2620 v |= pmap->pmap_bits[PG_U_IDX]; 2621 if (ptepindex < pmap_pt_pindex(0)) 2622 v |= pmap->pmap_bits[PG_M_IDX]; 2623 if (*ptep != v) { 2624 kprintf("mismatched upper level pt %016jx/%016jx\n", 2625 *ptep, v); 2626 } 2627 } 2628 if (pvpp) 2629 *pvpp = pvp; 2630 else if (pvp) 2631 pv_put(pvp); 2632 return (pv); 2633 } 2634 2635 /* 2636 * Release any resources held by the given physical map. 2637 * 2638 * Called when a pmap initialized by pmap_pinit is being released. Should 2639 * only be called if the map contains no valid mappings. 2640 */ 2641 struct pmap_release_info { 2642 pmap_t pmap; 2643 int retry; 2644 pv_entry_t pvp; 2645 }; 2646 2647 static int pmap_release_callback(pv_entry_t pv, void *data); 2648 2649 void 2650 pmap_release(struct pmap *pmap) 2651 { 2652 struct pmap_release_info info; 2653 2654 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 2655 ("pmap still active! %016jx", 2656 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 2657 2658 /* 2659 * There is no longer a pmap_list, if there were we would remove the 2660 * pmap from it here. 2661 */ 2662 2663 /* 2664 * Pull pv's off the RB tree in order from low to high and release 2665 * each page. 
2666 */ 2667 info.pmap = pmap; 2668 do { 2669 info.retry = 0; 2670 info.pvp = NULL; 2671 2672 spin_lock(&pmap->pm_spin); 2673 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, 2674 pmap_release_callback, &info); 2675 spin_unlock(&pmap->pm_spin); 2676 2677 if (info.pvp) 2678 pv_put(info.pvp); 2679 } while (info.retry); 2680 2681 2682 /* 2683 * One resident page (the pml4 page) should remain. Two if 2684 * the pmap has implemented an isolated userland PML4E table. 2685 * No wired pages should remain. 2686 */ 2687 int expected_res = 0; 2688 2689 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0) 2690 ++expected_res; 2691 if (pmap->pm_pmlpv_iso) 2692 ++expected_res; 2693 2694 #if 1 2695 if (pmap->pm_stats.resident_count != expected_res || 2696 pmap->pm_stats.wired_count != 0) { 2697 kprintf("fatal pmap problem - pmap %p flags %08x " 2698 "rescnt=%jd wirecnt=%jd\n", 2699 pmap, 2700 pmap->pm_flags, 2701 pmap->pm_stats.resident_count, 2702 pmap->pm_stats.wired_count); 2703 tsleep(pmap, 0, "DEAD", 0); 2704 } 2705 #else 2706 KKASSERT(pmap->pm_stats.resident_count == expected_res); 2707 KKASSERT(pmap->pm_stats.wired_count == 0); 2708 #endif 2709 } 2710 2711 /* 2712 * Called from low to high. We must cache the proper parent pv so we 2713 * can adjust its wired count. 2714 */ 2715 static int 2716 pmap_release_callback(pv_entry_t pv, void *data) 2717 { 2718 struct pmap_release_info *info = data; 2719 pmap_t pmap = info->pmap; 2720 vm_pindex_t pindex; 2721 int r; 2722 2723 /* 2724 * Acquire a held and locked pv, check for release race 2725 */ 2726 pindex = pv->pv_pindex; 2727 if (info->pvp == pv) { 2728 spin_unlock(&pmap->pm_spin); 2729 info->pvp = NULL; 2730 } else if (pv_hold_try(pv)) { 2731 spin_unlock(&pmap->pm_spin); 2732 } else { 2733 spin_unlock(&pmap->pm_spin); 2734 pv_lock(pv); 2735 pv_put(pv); 2736 info->retry = 1; 2737 spin_lock(&pmap->pm_spin); 2738 2739 return -1; 2740 } 2741 KKASSERT(pv->pv_pmap == pmap && pindex == pv->pv_pindex); 2742 2743 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2744 /* 2745 * I am PTE, parent is PT 2746 */ 2747 pindex = pv->pv_pindex >> NPTEPGSHIFT; 2748 pindex += NUPTE_TOTAL; 2749 } else if (pv->pv_pindex < pmap_pd_pindex(0)) { 2750 /* 2751 * I am PT, parent is PD 2752 */ 2753 pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT; 2754 pindex += NUPTE_TOTAL + NUPT_TOTAL; 2755 } else if (pv->pv_pindex < pmap_pdp_pindex(0)) { 2756 /* 2757 * I am PD, parent is PDP 2758 */ 2759 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >> 2760 NPDPEPGSHIFT; 2761 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2762 } else if (pv->pv_pindex < pmap_pml4_pindex()) { 2763 /* 2764 * I am PDP, parent is PML4. We always calculate the 2765 * normal PML4 here, not the isolated PML4. 2766 */ 2767 pindex = pmap_pml4_pindex(); 2768 } else { 2769 /* 2770 * parent is NULL 2771 */ 2772 if (info->pvp) { 2773 pv_put(info->pvp); 2774 info->pvp = NULL; 2775 } 2776 pindex = 0; 2777 } 2778 if (pindex) { 2779 if (info->pvp && info->pvp->pv_pindex != pindex) { 2780 pv_put(info->pvp); 2781 info->pvp = NULL; 2782 } 2783 if (info->pvp == NULL) 2784 info->pvp = pv_get(pmap, pindex, NULL); 2785 } else { 2786 if (info->pvp) { 2787 pv_put(info->pvp); 2788 info->pvp = NULL; 2789 } 2790 } 2791 r = pmap_release_pv(pv, info->pvp, NULL); 2792 spin_lock(&pmap->pm_spin); 2793 2794 return(r); 2795 } 2796 2797 /* 2798 * Called with held (i.e. also locked) pv. This function will dispose of 2799 * the lock along with the pv. 
2800 * 2801 * If the caller already holds the locked parent page table for pv it 2802 * must pass it as pvp, allowing us to avoid a deadlock, else it can 2803 * pass NULL for pvp. 2804 */ 2805 static int 2806 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) 2807 { 2808 vm_page_t p; 2809 2810 /* 2811 * The pmap is currently not spinlocked, pv is held+locked. 2812 * Remove the pv's page from its parent's page table. The 2813 * parent's page table page's wire_count will be decremented. 2814 * 2815 * This will clean out the pte at any level of the page table. 2816 * If smp != 0 all cpus are affected. 2817 * 2818 * Do not tear-down recursively, its faster to just let the 2819 * release run its course. 2820 */ 2821 pmap_remove_pv_pte(pv, pvp, bulk, 0); 2822 2823 /* 2824 * Terminal pvs are unhooked from their vm_pages. Because 2825 * terminal pages aren't page table pages they aren't wired 2826 * by us, so we have to be sure not to unwire them either. 2827 */ 2828 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2829 pmap_remove_pv_page(pv); 2830 goto skip; 2831 } 2832 2833 /* 2834 * We leave the top-level page table page cached, wired, and 2835 * mapped in the pmap until the dtor function (pmap_puninit()) 2836 * gets called. 2837 * 2838 * Since we are leaving the top-level pv intact we need 2839 * to break out of what would otherwise be an infinite loop. 2840 * 2841 * This covers both the normal and the isolated PML4 page. 2842 */ 2843 if (pv->pv_pindex >= pmap_pml4_pindex()) { 2844 pv_put(pv); 2845 return(-1); 2846 } 2847 2848 /* 2849 * For page table pages (other than the top-level page), 2850 * remove and free the vm_page. The representitive mapping 2851 * removed above by pmap_remove_pv_pte() did not undo the 2852 * last wire_count so we have to do that as well. 2853 */ 2854 p = pmap_remove_pv_page(pv); 2855 vm_page_busy_wait(p, FALSE, "pmaprl"); 2856 if (p->wire_count != 1) { 2857 const char *tstr; 2858 2859 if (pv->pv_pindex >= pmap_pdp_pindex(0)) 2860 tstr = "PDP"; 2861 else if (pv->pv_pindex >= pmap_pd_pindex(0)) 2862 tstr = "PD"; 2863 else if (pv->pv_pindex >= pmap_pt_pindex(0)) 2864 tstr = "PT"; 2865 else 2866 tstr = "PTE"; 2867 2868 kprintf("p(%s) p->wire_count was %016lx %d\n", 2869 tstr, pv->pv_pindex, p->wire_count); 2870 } 2871 KKASSERT(p->wire_count == 1); 2872 KKASSERT(p->flags & PG_UNQUEUED); 2873 2874 vm_page_unwire(p, 0); 2875 KKASSERT(p->wire_count == 0); 2876 2877 vm_page_free(p); 2878 skip: 2879 pv_free(pv, pvp); 2880 2881 return 0; 2882 } 2883 2884 /* 2885 * This function will remove the pte associated with a pv from its parent. 2886 * Terminal pv's are supported. All cpus specified by (bulk) are properly 2887 * invalidated. 2888 * 2889 * The wire count will be dropped on the parent page table. The wire 2890 * count on the page being removed (pv->pv_m) from the parent page table 2891 * is NOT touched. Note that terminal pages will not have any additional 2892 * wire counts while page table pages will have at least one representing 2893 * the mapping, plus others representing sub-mappings. 2894 * 2895 * NOTE: Cannot be called on kernel page table pages, only KVM terminal 2896 * pages and user page table and terminal pages. 2897 * 2898 * NOTE: The pte being removed might be unmanaged, and the pv supplied might 2899 * be freshly allocated and not imply that the pte is managed. In this 2900 * case pv->pv_m should be NULL. 2901 * 2902 * The pv must be locked. The pvp, if supplied, must be locked. All 2903 * supplied pv's will remain locked on return. 
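 *
 * The destroy argument selects the cleanup performed here: 0 leaves the
 * pv and its page to the caller, 1 frees a page table page along with
 * its pv, and 2 disposes of the pv but leaves the underlying (normal)
 * page untouched.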
2904 * 2905 * XXX must lock parent pv's if they exist to remove pte XXX 2906 */ 2907 static 2908 void 2909 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk, 2910 int destroy) 2911 { 2912 vm_pindex_t ptepindex = pv->pv_pindex; 2913 pmap_t pmap = pv->pv_pmap; 2914 vm_page_t p; 2915 int gotpvp = 0; 2916 2917 KKASSERT(pmap); 2918 2919 if (ptepindex >= pmap_pml4_pindex()) { 2920 /* 2921 * We are the top level PML4E table, there is no parent. 2922 * 2923 * This is either the normal or isolated PML4E table. 2924 * Only the normal is used in regular operation, the isolated 2925 * is only passed in when breaking down the whole pmap. 2926 */ 2927 p = pmap->pm_pmlpv->pv_m; 2928 KKASSERT(pv->pv_m == p); /* debugging */ 2929 } else if (ptepindex >= pmap_pdp_pindex(0)) { 2930 /* 2931 * Remove a PDP page from the PML4E. This can only occur 2932 * with user page tables. We do not have to lock the 2933 * pml4 PV so just ignore pvp. 2934 */ 2935 vm_pindex_t pml4_pindex; 2936 vm_pindex_t pdp_index; 2937 pml4_entry_t *pdp; 2938 pml4_entry_t *pdp_iso; 2939 2940 pdp_index = ptepindex - pmap_pdp_pindex(0); 2941 if (pvp == NULL) { 2942 pml4_pindex = pmap_pml4_pindex(); 2943 pvp = pv_get(pv->pv_pmap, pml4_pindex, NULL); 2944 KKASSERT(pvp); 2945 gotpvp = 1; 2946 } 2947 2948 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)]; 2949 KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0); 2950 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2951 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0); 2952 2953 /* 2954 * Also remove the PDP from the isolated PML4E if the 2955 * process uses one. 2956 */ 2957 if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) { 2958 pdp_iso = &pmap->pm_pml4_iso[pdp_index & 2959 ((1ul << NPML4EPGSHIFT) - 1)]; 2960 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp_iso, 0); 2961 } 2962 KKASSERT(pv->pv_m == p); /* debugging */ 2963 } else if (ptepindex >= pmap_pd_pindex(0)) { 2964 /* 2965 * Remove a PD page from the PDP 2966 * 2967 * SIMPLE PMAP NOTE: Non-existant pvp's are ok in the case 2968 * of a simple pmap because it stops at 2969 * the PD page. 
2970 */ 2971 vm_pindex_t pdp_pindex; 2972 vm_pindex_t pd_index; 2973 pdp_entry_t *pd; 2974 2975 pd_index = ptepindex - pmap_pd_pindex(0); 2976 2977 if (pvp == NULL) { 2978 pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 2979 (pd_index >> NPML4EPGSHIFT); 2980 pvp = pv_get(pv->pv_pmap, pdp_pindex, NULL); 2981 gotpvp = 1; 2982 } 2983 2984 if (pvp) { 2985 pd = pv_pte_lookup(pvp, pd_index & 2986 ((1ul << NPDPEPGSHIFT) - 1)); 2987 KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0); 2988 p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2989 pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0); 2990 } else { 2991 KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE); 2992 p = pv->pv_m; /* degenerate test later */ 2993 } 2994 KKASSERT(pv->pv_m == p); /* debugging */ 2995 } else if (ptepindex >= pmap_pt_pindex(0)) { 2996 /* 2997 * Remove a PT page from the PD 2998 */ 2999 vm_pindex_t pd_pindex; 3000 vm_pindex_t pt_index; 3001 pd_entry_t *pt; 3002 3003 pt_index = ptepindex - pmap_pt_pindex(0); 3004 3005 if (pvp == NULL) { 3006 pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + 3007 (pt_index >> NPDPEPGSHIFT); 3008 pvp = pv_get(pv->pv_pmap, pd_pindex, NULL); 3009 KKASSERT(pvp); 3010 gotpvp = 1; 3011 } 3012 3013 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); 3014 #if 0 3015 KASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0, 3016 ("*pt unexpectedly invalid %016jx " 3017 "gotpvp=%d ptepindex=%ld ptindex=%ld pv=%p pvp=%p", 3018 *pt, gotpvp, ptepindex, pt_index, pv, pvp)); 3019 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 3020 #else 3021 if ((*pt & pmap->pmap_bits[PG_V_IDX]) == 0) { 3022 kprintf("*pt unexpectedly invalid %016jx " 3023 "gotpvp=%d ptepindex=%ld ptindex=%ld " 3024 "pv=%p pvp=%p\n", 3025 *pt, gotpvp, ptepindex, pt_index, pv, pvp); 3026 tsleep(pt, 0, "DEAD", 0); 3027 p = pv->pv_m; 3028 } else { 3029 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 3030 } 3031 #endif 3032 pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0); 3033 KKASSERT(pv->pv_m == p); /* debugging */ 3034 } else { 3035 KKASSERT(0); 3036 } 3037 3038 /* 3039 * If requested, scrap the underlying pv->pv_m and the underlying 3040 * pv. If this is a page-table-page we must also free the page. 3041 * 3042 * pvp must be returned locked. 3043 */ 3044 if (destroy == 1) { 3045 /* 3046 * page table page (PT, PD, PDP, PML4), caller was responsible 3047 * for testing wired_count. 3048 */ 3049 KKASSERT(pv->pv_m->wire_count == 1); 3050 p = pmap_remove_pv_page(pv); 3051 pv_free(pv, pvp); 3052 pv = NULL; 3053 3054 vm_page_busy_wait(p, FALSE, "pgpun"); 3055 vm_page_unwire(p, 0); 3056 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 3057 vm_page_free(p); 3058 } else if (destroy == 2) { 3059 /* 3060 * Normal page, remove from pmap and leave the underlying 3061 * page untouched. 3062 */ 3063 pmap_remove_pv_page(pv); 3064 pv_free(pv, pvp); 3065 pv = NULL; /* safety */ 3066 } 3067 3068 /* 3069 * If we acquired pvp ourselves then we are responsible for 3070 * recursively deleting it. 3071 */ 3072 if (pvp && gotpvp) { 3073 /* 3074 * Recursively destroy higher-level page tables. 3075 * 3076 * This is optional. If we do not, they will still 3077 * be destroyed when the process exits. 3078 * 3079 * NOTE: Do not destroy pv_entry's with extra hold refs, 3080 * a caller may have unlocked it and intends to 3081 * continue to use it. 
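 *
 * A hold count of exactly 2 (the pmap association plus our own hold)
 * is what the PV_HOLD_MASK test below checks for; anything higher
 * means another thread still holds a reference.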
3082 */ 3083 if (pmap_dynamic_delete && 3084 pvp->pv_m && 3085 pvp->pv_m->wire_count == 1 && 3086 (pvp->pv_hold & PV_HOLD_MASK) == 2 && 3087 pvp->pv_pindex < pmap_pml4_pindex()) { 3088 if (pmap != &kernel_pmap) { 3089 pmap_remove_pv_pte(pvp, NULL, bulk, 1); 3090 pvp = NULL; /* safety */ 3091 } else { 3092 kprintf("Attempt to remove kernel_pmap pindex " 3093 "%jd\n", pvp->pv_pindex); 3094 pv_put(pvp); 3095 } 3096 } else { 3097 pv_put(pvp); 3098 } 3099 } 3100 } 3101 3102 /* 3103 * Remove the vm_page association to a pv. The pv must be locked. 3104 */ 3105 static 3106 vm_page_t 3107 pmap_remove_pv_page(pv_entry_t pv) 3108 { 3109 vm_page_t m; 3110 3111 m = pv->pv_m; 3112 pv->pv_m = NULL; 3113 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3114 3115 return(m); 3116 } 3117 3118 /* 3119 * Grow the number of kernel page table entries, if needed. 3120 * 3121 * This routine is always called to validate any address space 3122 * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address 3123 * space below KERNBASE. 3124 * 3125 * kernel_map must be locked exclusively by the caller. 3126 */ 3127 void 3128 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 3129 { 3130 vm_paddr_t paddr; 3131 vm_offset_t ptppaddr; 3132 vm_page_t nkpg; 3133 pd_entry_t *pt, newpt; 3134 pdp_entry_t *pd, newpd; 3135 int update_kernel_vm_end; 3136 3137 /* 3138 * bootstrap kernel_vm_end on first real VM use 3139 */ 3140 if (kernel_vm_end == 0) { 3141 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 3142 3143 for (;;) { 3144 pt = pmap_pt(&kernel_pmap, kernel_vm_end); 3145 if (pt == NULL) 3146 break; 3147 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) == 0) 3148 break; 3149 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 3150 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); 3151 if (kernel_vm_end - 1 >= vm_map_max(&kernel_map)) { 3152 kernel_vm_end = vm_map_max(&kernel_map); 3153 break; 3154 } 3155 } 3156 } 3157 3158 /* 3159 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 3160 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 3161 * do not want to force-fill 128G worth of page tables. 
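 *
 * kstart and kend are rounded out below to 2MB page-table boundaries
 * (PAGE_SIZE * NPTEPG) and clipped against vm_map_max(&kernel_map)
 * before the fill loop runs.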
3162 */ 3163 if (kstart < KERNBASE) { 3164 if (kstart > kernel_vm_end) 3165 kstart = kernel_vm_end; 3166 KKASSERT(kend <= KERNBASE); 3167 update_kernel_vm_end = 1; 3168 } else { 3169 update_kernel_vm_end = 0; 3170 } 3171 3172 kstart = rounddown2(kstart, (vm_offset_t)(PAGE_SIZE * NPTEPG)); 3173 kend = roundup2(kend, (vm_offset_t)(PAGE_SIZE * NPTEPG)); 3174 3175 if (kend - 1 >= vm_map_max(&kernel_map)) 3176 kend = vm_map_max(&kernel_map); 3177 3178 while (kstart < kend) { 3179 pt = pmap_pt(&kernel_pmap, kstart); 3180 if (pt == NULL) { 3181 /* 3182 * We need a new PD entry 3183 */ 3184 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3185 VM_ALLOC_NORMAL | 3186 VM_ALLOC_SYSTEM | 3187 VM_ALLOC_INTERRUPT); 3188 if (nkpg == NULL) { 3189 panic("pmap_growkernel: no memory to grow " 3190 "kernel"); 3191 } 3192 paddr = VM_PAGE_TO_PHYS(nkpg); 3193 pmap_zero_page(paddr); 3194 pd = pmap_pd(&kernel_pmap, kstart); 3195 3196 newpd = (pdp_entry_t) 3197 (paddr | 3198 kernel_pmap.pmap_bits[PG_V_IDX] | 3199 kernel_pmap.pmap_bits[PG_RW_IDX] | 3200 kernel_pmap.pmap_bits[PG_A_IDX]); 3201 atomic_swap_long(pd, newpd); 3202 3203 #if 0 3204 kprintf("NEWPD pd=%p pde=%016jx phys=%016jx\n", 3205 pd, newpd, paddr); 3206 #endif 3207 3208 continue; /* try again */ 3209 } 3210 3211 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 3212 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3213 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); 3214 if (kstart - 1 >= vm_map_max(&kernel_map)) { 3215 kstart = vm_map_max(&kernel_map); 3216 break; 3217 } 3218 continue; 3219 } 3220 3221 /* 3222 * We need a new PT 3223 * 3224 * This index is bogus, but out of the way 3225 */ 3226 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3227 VM_ALLOC_NORMAL | 3228 VM_ALLOC_SYSTEM | 3229 VM_ALLOC_INTERRUPT); 3230 if (nkpg == NULL) 3231 panic("pmap_growkernel: no memory to grow kernel"); 3232 3233 vm_page_wire(nkpg); 3234 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 3235 pmap_zero_page(ptppaddr); 3236 newpt = (pd_entry_t)(ptppaddr | 3237 kernel_pmap.pmap_bits[PG_V_IDX] | 3238 kernel_pmap.pmap_bits[PG_RW_IDX] | 3239 kernel_pmap.pmap_bits[PG_A_IDX]); 3240 atomic_swap_long(pt, newpt); 3241 3242 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3243 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); 3244 3245 if (kstart - 1 >= vm_map_max(&kernel_map)) { 3246 kstart = vm_map_max(&kernel_map); 3247 break; 3248 } 3249 } 3250 3251 /* 3252 * Only update kernel_vm_end for areas below KERNBASE. 3253 */ 3254 if (update_kernel_vm_end && kernel_vm_end < kstart) 3255 kernel_vm_end = kstart; 3256 } 3257 3258 /* 3259 * Add a reference to the specified pmap. 3260 */ 3261 void 3262 pmap_reference(pmap_t pmap) 3263 { 3264 if (pmap != NULL) 3265 atomic_add_int(&pmap->pm_count, 1); 3266 } 3267 3268 void 3269 pmap_maybethreaded(pmap_t pmap) 3270 { 3271 atomic_set_int(&pmap->pm_flags, PMAP_MULTI); 3272 } 3273 3274 /* 3275 * Called while page is hard-busied to clear the PG_MAPPED and PG_WRITEABLE 3276 * flags if able. 3277 */ 3278 int 3279 pmap_mapped_sync(vm_page_t m) 3280 { 3281 if (m->md.pmap_count == 0) 3282 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3283 return (m->flags); 3284 } 3285 3286 /*************************************************** 3287 * page management routines. 3288 ***************************************************/ 3289 3290 /* 3291 * Hold a pv without locking it 3292 */ 3293 #if 0 3294 static void 3295 pv_hold(pv_entry_t pv) 3296 { 3297 atomic_add_int(&pv->pv_hold, 1); 3298 } 3299 #endif 3300 3301 /* 3302 * Hold a pv_entry, preventing its destruction. 
TRUE is returned if the pv 3303 * was successfully locked, FALSE if it wasn't. The caller must dispose of 3304 * the pv properly. 3305 * 3306 * Either the pmap->pm_spin or the related vm_page_spin (if traversing a 3307 * pv list via its page) must be held by the caller in order to stabilize 3308 * the pv. 3309 */ 3310 static int 3311 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) 3312 { 3313 u_int count; 3314 3315 /* 3316 * Critical path shortcut expects pv to already have one ref 3317 * (for the pv->pv_pmap). 3318 */ 3319 count = pv->pv_hold; 3320 cpu_ccfence(); 3321 for (;;) { 3322 if ((count & PV_HOLD_LOCKED) == 0) { 3323 if (atomic_fcmpset_int(&pv->pv_hold, &count, 3324 (count + 1) | PV_HOLD_LOCKED)) { 3325 #ifdef PMAP_DEBUG 3326 pv->pv_func = func; 3327 pv->pv_line = lineno; 3328 #endif 3329 return TRUE; 3330 } 3331 } else { 3332 if (atomic_fcmpset_int(&pv->pv_hold, &count, count + 1)) 3333 return FALSE; 3334 } 3335 /* retry */ 3336 } 3337 } 3338 3339 /* 3340 * Drop a previously held pv_entry which could not be locked, allowing its 3341 * destruction. 3342 * 3343 * Must not be called with a spinlock held as we might zfree() the pv if it 3344 * is no longer associated with a pmap and this was the last hold count. 3345 */ 3346 static void 3347 pv_drop(pv_entry_t pv) 3348 { 3349 u_int count; 3350 3351 for (;;) { 3352 count = pv->pv_hold; 3353 cpu_ccfence(); 3354 KKASSERT((count & PV_HOLD_MASK) > 0); 3355 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != 3356 (PV_HOLD_LOCKED | 1)); 3357 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { 3358 if ((count & PV_HOLD_MASK) == 1) { 3359 #ifdef PMAP_DEBUG2 3360 if (pmap_enter_debug > 0) { 3361 --pmap_enter_debug; 3362 kprintf("pv_drop: free pv %p\n", pv); 3363 } 3364 #endif 3365 KKASSERT(count == 1); 3366 KKASSERT(pv->pv_pmap == NULL); 3367 zfree(pvzone, pv); 3368 } 3369 return; 3370 } 3371 /* retry */ 3372 } 3373 } 3374 3375 /* 3376 * Find or allocate the requested PV entry, returning a locked, held pv. 3377 * 3378 * If (*isnew) is non-zero, the returned pv will have two hold counts, one 3379 * for the caller and one representing the pmap and vm_page association. 3380 * 3381 * If (*isnew) is zero, the returned pv will have only one hold count. 3382 * 3383 * Since both associations can only be adjusted while the pv is locked, 3384 * together they represent just one additional hold. 3385 */ 3386 static 3387 pv_entry_t 3388 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL) 3389 { 3390 struct mdglobaldata *md = mdcpu; 3391 pv_entry_t pv; 3392 pv_entry_t pnew; 3393 int pmap_excl = 0; 3394 3395 pnew = NULL; 3396 if (md->gd_newpv) { 3397 #if 1 3398 pnew = atomic_swap_ptr((void *)&md->gd_newpv, NULL); 3399 #else 3400 crit_enter(); 3401 pnew = md->gd_newpv; /* might race NULL */ 3402 md->gd_newpv = NULL; 3403 crit_exit(); 3404 #endif 3405 } 3406 if (pnew == NULL) 3407 pnew = zalloc(pvzone); 3408 3409 spin_lock_shared(&pmap->pm_spin); 3410 for (;;) { 3411 /* 3412 * Shortcut cache 3413 */ 3414 pv = pv_entry_lookup(pmap, pindex); 3415 if (pv == NULL) { 3416 vm_pindex_t *pmark; 3417 3418 /* 3419 * Requires exclusive pmap spinlock 3420 */ 3421 if (pmap_excl == 0) { 3422 pmap_excl = 1; 3423 if (!spin_lock_upgrade_try(&pmap->pm_spin)) { 3424 spin_unlock_shared(&pmap->pm_spin); 3425 spin_lock(&pmap->pm_spin); 3426 continue; 3427 } 3428 } 3429 3430 /* 3431 * We need to block if someone is holding our 3432 * placemarker. 
As long as we determine the 3433 * placemarker has not been aquired we do not 3434 * need to get it as acquision also requires 3435 * the pmap spin lock. 3436 * 3437 * However, we can race the wakeup. 3438 */ 3439 pmark = pmap_placemarker_hash(pmap, pindex); 3440 3441 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3442 tsleep_interlock(pmark, 0); 3443 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3444 if (((*pmark ^ pindex) & 3445 ~PM_PLACEMARK_WAKEUP) == 0) { 3446 spin_unlock(&pmap->pm_spin); 3447 tsleep(pmark, PINTERLOCKED, "pvplc", 0); 3448 spin_lock(&pmap->pm_spin); 3449 } 3450 continue; 3451 } 3452 3453 /* 3454 * Setup the new entry 3455 */ 3456 pnew->pv_pmap = pmap; 3457 pnew->pv_pindex = pindex; 3458 pnew->pv_hold = PV_HOLD_LOCKED | 2; 3459 pnew->pv_flags = 0; 3460 #ifdef PMAP_DEBUG 3461 pnew->pv_func = func; 3462 pnew->pv_line = lineno; 3463 if (pnew->pv_line_lastfree > 0) { 3464 pnew->pv_line_lastfree = 3465 -pnew->pv_line_lastfree; 3466 } 3467 #endif 3468 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); 3469 atomic_add_long(&pmap->pm_stats.resident_count, 1); 3470 spin_unlock(&pmap->pm_spin); 3471 *isnew = 1; 3472 3473 KASSERT(pv == NULL, ("pv insert failed %p->%p", pnew, pv)); 3474 return(pnew); 3475 } 3476 3477 /* 3478 * We already have an entry, cleanup the staged pnew if 3479 * we can get the lock, otherwise block and retry. 3480 */ 3481 if (__predict_true(_pv_hold_try(pv PMAP_DEBUG_COPY))) { 3482 if (pmap_excl) 3483 spin_unlock(&pmap->pm_spin); 3484 else 3485 spin_unlock_shared(&pmap->pm_spin); 3486 #if 1 3487 pnew = atomic_swap_ptr((void *)&md->gd_newpv, pnew); 3488 if (pnew) 3489 zfree(pvzone, pnew); 3490 #else 3491 crit_enter(); 3492 if (md->gd_newpv == NULL) 3493 md->gd_newpv = pnew; 3494 else 3495 zfree(pvzone, pnew); 3496 crit_exit(); 3497 #endif 3498 KKASSERT(pv->pv_pmap == pmap && 3499 pv->pv_pindex == pindex); 3500 *isnew = 0; 3501 return(pv); 3502 } 3503 if (pmap_excl) { 3504 spin_unlock(&pmap->pm_spin); 3505 _pv_lock(pv PMAP_DEBUG_COPY); 3506 pv_put(pv); 3507 spin_lock(&pmap->pm_spin); 3508 } else { 3509 spin_unlock_shared(&pmap->pm_spin); 3510 _pv_lock(pv PMAP_DEBUG_COPY); 3511 pv_put(pv); 3512 spin_lock_shared(&pmap->pm_spin); 3513 } 3514 } 3515 /* NOT REACHED */ 3516 } 3517 3518 /* 3519 * Find the requested PV entry, returning a locked+held pv or NULL 3520 */ 3521 static 3522 pv_entry_t 3523 _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp PMAP_DEBUG_DECL) 3524 { 3525 pv_entry_t pv; 3526 int pmap_excl = 0; 3527 3528 spin_lock_shared(&pmap->pm_spin); 3529 for (;;) { 3530 /* 3531 * Shortcut cache 3532 */ 3533 pv = pv_entry_lookup(pmap, pindex); 3534 if (pv == NULL) { 3535 /* 3536 * Block if there is ANY placemarker. If we are to 3537 * return it, we must also aquire the spot, so we 3538 * have to block even if the placemarker is held on 3539 * a different address. 3540 * 3541 * OPTIMIZATION: If pmarkp is passed as NULL the 3542 * caller is just probing (or looking for a real 3543 * pv_entry), and in this case we only need to check 3544 * to see if the placemarker matches pindex. 
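 *
 * When pmarkp is non-NULL and no pv_entry exists, the pindex is swapped
 * into the placemarker slot and returned via *pmarkp; the caller then
 * owns the placemark and must eventually release it (waking any
 * waiters) via pv_placemarker_wakeup().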
3545 */ 3546 vm_pindex_t *pmark; 3547 3548 /* 3549 * Requires exclusive pmap spinlock 3550 */ 3551 if (pmap_excl == 0) { 3552 pmap_excl = 1; 3553 if (!spin_lock_upgrade_try(&pmap->pm_spin)) { 3554 spin_unlock_shared(&pmap->pm_spin); 3555 spin_lock(&pmap->pm_spin); 3556 continue; 3557 } 3558 } 3559 3560 pmark = pmap_placemarker_hash(pmap, pindex); 3561 3562 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3563 ((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3564 tsleep_interlock(pmark, 0); 3565 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3566 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3567 ((*pmark ^ pindex) & 3568 ~PM_PLACEMARK_WAKEUP) == 0) { 3569 spin_unlock(&pmap->pm_spin); 3570 tsleep(pmark, PINTERLOCKED, "pvpld", 0); 3571 spin_lock(&pmap->pm_spin); 3572 } 3573 continue; 3574 } 3575 if (pmarkp) { 3576 if (atomic_swap_long(pmark, pindex) != 3577 PM_NOPLACEMARK) { 3578 panic("_pv_get: pmark race"); 3579 } 3580 *pmarkp = pmark; 3581 } 3582 spin_unlock(&pmap->pm_spin); 3583 return NULL; 3584 } 3585 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 3586 if (pmap_excl) 3587 spin_unlock(&pmap->pm_spin); 3588 else 3589 spin_unlock_shared(&pmap->pm_spin); 3590 KKASSERT(pv->pv_pmap == pmap && 3591 pv->pv_pindex == pindex); 3592 return(pv); 3593 } 3594 if (pmap_excl) { 3595 spin_unlock(&pmap->pm_spin); 3596 _pv_lock(pv PMAP_DEBUG_COPY); 3597 pv_put(pv); 3598 spin_lock(&pmap->pm_spin); 3599 } else { 3600 spin_unlock_shared(&pmap->pm_spin); 3601 _pv_lock(pv PMAP_DEBUG_COPY); 3602 pv_put(pv); 3603 spin_lock_shared(&pmap->pm_spin); 3604 } 3605 } 3606 } 3607 3608 /* 3609 * Lookup, hold, and attempt to lock (pmap,pindex). 3610 * 3611 * If the entry does not exist NULL is returned and *errorp is set to 0 3612 * 3613 * If the entry exists and could be successfully locked it is returned and 3614 * errorp is set to 0. 3615 * 3616 * If the entry exists but could NOT be successfully locked it is returned 3617 * held and *errorp is set to 1. 3618 * 3619 * If the entry is placemarked by someone else NULL is returned and *errorp 3620 * is set to 1. 3621 */ 3622 static 3623 pv_entry_t 3624 pv_get_try(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp, int *errorp) 3625 { 3626 pv_entry_t pv; 3627 3628 spin_lock_shared(&pmap->pm_spin); 3629 3630 pv = pv_entry_lookup(pmap, pindex); 3631 if (pv == NULL) { 3632 vm_pindex_t *pmark; 3633 3634 pmark = pmap_placemarker_hash(pmap, pindex); 3635 3636 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3637 *errorp = 1; 3638 } else if (pmarkp && 3639 atomic_cmpset_long(pmark, PM_NOPLACEMARK, pindex)) { 3640 *errorp = 0; 3641 } else { 3642 /* 3643 * Can't set a placemark with a NULL pmarkp, or if 3644 * pmarkp is non-NULL but we failed to set our 3645 * placemark. 3646 */ 3647 *errorp = 1; 3648 } 3649 if (pmarkp) 3650 *pmarkp = pmark; 3651 spin_unlock_shared(&pmap->pm_spin); 3652 3653 return NULL; 3654 } 3655 3656 /* 3657 * XXX This has problems if the lock is shared, why? 
3658 */ 3659 if (pv_hold_try(pv)) { 3660 spin_unlock_shared(&pmap->pm_spin); 3661 *errorp = 0; 3662 KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex); 3663 return(pv); /* lock succeeded */ 3664 } 3665 spin_unlock_shared(&pmap->pm_spin); 3666 *errorp = 1; 3667 3668 return (pv); /* lock failed */ 3669 } 3670 3671 /* 3672 * Lock a held pv, keeping the hold count 3673 */ 3674 static 3675 void 3676 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) 3677 { 3678 u_int count; 3679 3680 for (;;) { 3681 count = pv->pv_hold; 3682 cpu_ccfence(); 3683 if ((count & PV_HOLD_LOCKED) == 0) { 3684 if (atomic_cmpset_int(&pv->pv_hold, count, 3685 count | PV_HOLD_LOCKED)) { 3686 #ifdef PMAP_DEBUG 3687 pv->pv_func = func; 3688 pv->pv_line = lineno; 3689 #endif 3690 return; 3691 } 3692 continue; 3693 } 3694 tsleep_interlock(pv, 0); 3695 if (atomic_cmpset_int(&pv->pv_hold, count, 3696 count | PV_HOLD_WAITING)) { 3697 #ifdef PMAP_DEBUG2 3698 if (pmap_enter_debug > 0) { 3699 --pmap_enter_debug; 3700 kprintf("pv waiting on %s:%d\n", 3701 pv->pv_func, pv->pv_line); 3702 } 3703 #endif 3704 tsleep(pv, PINTERLOCKED, "pvwait", hz); 3705 } 3706 /* retry */ 3707 } 3708 } 3709 3710 /* 3711 * Unlock a held and locked pv, keeping the hold count. 3712 */ 3713 static 3714 void 3715 pv_unlock(pv_entry_t pv) 3716 { 3717 u_int count; 3718 3719 for (;;) { 3720 count = pv->pv_hold; 3721 cpu_ccfence(); 3722 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >= 3723 (PV_HOLD_LOCKED | 1)); 3724 if (atomic_cmpset_int(&pv->pv_hold, count, 3725 count & 3726 ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { 3727 if (count & PV_HOLD_WAITING) 3728 wakeup(pv); 3729 break; 3730 } 3731 } 3732 } 3733 3734 /* 3735 * Unlock and drop a pv. If the pv is no longer associated with a pmap 3736 * and the hold count drops to zero we will free it. 3737 * 3738 * Caller should not hold any spin locks. We are protected from hold races 3739 * by virtue of holds only occuring only with a pmap_spin or vm_page_spin 3740 * lock held. A pv cannot be located otherwise. 3741 */ 3742 static 3743 void 3744 pv_put(pv_entry_t pv) 3745 { 3746 #ifdef PMAP_DEBUG2 3747 if (pmap_enter_debug > 0) { 3748 --pmap_enter_debug; 3749 kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold); 3750 } 3751 #endif 3752 3753 /* 3754 * Normal put-aways must have a pv_m associated with the pv, 3755 * but allow the case where the pv has been destructed due 3756 * to pmap_dynamic_delete. 3757 */ 3758 KKASSERT(pv->pv_pmap == NULL || pv->pv_m != NULL); 3759 3760 /* 3761 * Fast - shortcut most common condition 3762 */ 3763 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1)) 3764 return; 3765 3766 /* 3767 * Slow 3768 */ 3769 pv_unlock(pv); 3770 pv_drop(pv); 3771 } 3772 3773 /* 3774 * Remove the pmap association from a pv, require that pv_m already be removed, 3775 * then unlock and drop the pv. Any pte operations must have already been 3776 * completed. This call may result in a last-drop which will physically free 3777 * the pv. 3778 * 3779 * Removing the pmap association entails an additional drop. 3780 * 3781 * pv must be exclusively locked on call and will be disposed of on return. 
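 *
 * If pvp is supplied, its page also loses the wire count that
 * represented this pv's presence in the parent page table
 * (vm_page_unwire_quick()).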
3782 */ 3783 static 3784 void 3785 _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL) 3786 { 3787 pmap_t pmap; 3788 3789 #ifdef PMAP_DEBUG 3790 pv->pv_func_lastfree = func; 3791 pv->pv_line_lastfree = lineno; 3792 #endif 3793 KKASSERT(pv->pv_m == NULL); 3794 KKASSERT((pv->pv_hold & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >= 3795 (PV_HOLD_LOCKED|1)); 3796 if ((pmap = pv->pv_pmap) != NULL) { 3797 spin_lock(&pmap->pm_spin); 3798 KKASSERT(pv->pv_pmap == pmap); 3799 if (pmap->pm_pvhint_pt == pv) 3800 pmap->pm_pvhint_pt = NULL; 3801 if (pmap->pm_pvhint_unused == pv) 3802 pmap->pm_pvhint_unused = NULL; 3803 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 3804 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3805 pv->pv_pmap = NULL; 3806 pv->pv_pindex = 0; 3807 spin_unlock(&pmap->pm_spin); 3808 3809 /* 3810 * Try to shortcut three atomic ops, otherwise fall through 3811 * and do it normally. Drop two refs and the lock all in 3812 * one go. 3813 */ 3814 if (pvp) { 3815 if (vm_page_unwire_quick(pvp->pv_m)) 3816 panic("_pv_free: bad wirecount on pvp"); 3817 } 3818 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) { 3819 #ifdef PMAP_DEBUG2 3820 if (pmap_enter_debug > 0) { 3821 --pmap_enter_debug; 3822 kprintf("pv_free: free pv %p\n", pv); 3823 } 3824 #endif 3825 zfree(pvzone, pv); 3826 return; 3827 } 3828 pv_drop(pv); /* ref for pv_pmap */ 3829 } 3830 pv_unlock(pv); 3831 pv_drop(pv); 3832 } 3833 3834 /* 3835 * This routine is very drastic, but can save the system 3836 * in a pinch. 3837 */ 3838 void 3839 pmap_collect(void) 3840 { 3841 int i; 3842 vm_page_t m; 3843 static int warningdone=0; 3844 3845 if (pmap_pagedaemon_waken == 0) 3846 return; 3847 pmap_pagedaemon_waken = 0; 3848 if (warningdone < 5) { 3849 kprintf("pmap_collect: pv_entries exhausted -- " 3850 "suggest increasing vm.pmap_pv_entries above %ld\n", 3851 vm_pmap_pv_entries); 3852 warningdone++; 3853 } 3854 3855 for (i = 0; i < vm_page_array_size; i++) { 3856 m = &vm_page_array[i]; 3857 if (m->wire_count || m->hold_count) 3858 continue; 3859 if (vm_page_busy_try(m, TRUE) == 0) { 3860 if (m->wire_count == 0 && m->hold_count == 0) { 3861 pmap_remove_all(m); 3862 } 3863 vm_page_wakeup(m); 3864 } 3865 } 3866 } 3867 3868 /* 3869 * Scan the pmap for active page table entries and issue a callback. 3870 * The callback must dispose of pte_pv, whos PTE entry is at *ptep in 3871 * its parent page table. 3872 * 3873 * pte_pv will be NULL if the page or page table is unmanaged. 3874 * pt_pv will point to the page table page containing the pte for the page. 3875 * 3876 * NOTE! If we come across an unmanaged page TABLE (verses an unmanaged page), 3877 * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed 3878 * process pmap's PD and page to the callback function. This can be 3879 * confusing because the pt_pv is really a pd_pv, and the target page 3880 * table page is simply aliased by the pmap and not owned by it. 3881 * 3882 * It is assumed that the start and end are properly rounded to the page size. 3883 * 3884 * It is assumed that PD pages and above are managed and thus in the RB tree, 3885 * allowing us to use RB_SCAN from the PD pages down for ranged scans. 
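 *
 * Illustrative sketch only (not copied from a real caller; my_callback
 * is hypothetical): a ranged operation fills in a pmap_scan_info and
 * lets pmap_scan() invoke its callback for each active entry:
 *
 *	struct pmap_scan_info info;
 *
 *	info.pmap = pmap;
 *	info.sva = sva;
 *	info.eva = eva;
 *	info.func = my_callback;
 *	info.arg = NULL;
 *	pmap_scan(&info, 1);	(non-zero enables bulk cross-cpu inval)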
3886 */ 3887 struct pmap_scan_info { 3888 struct pmap *pmap; 3889 vm_offset_t sva; 3890 vm_offset_t eva; 3891 vm_pindex_t sva_pd_pindex; 3892 vm_pindex_t eva_pd_pindex; 3893 void (*func)(pmap_t, struct pmap_scan_info *, 3894 vm_pindex_t *, pv_entry_t, vm_offset_t, 3895 pt_entry_t *, void *); 3896 void *arg; 3897 pmap_inval_bulk_t bulk_core; 3898 pmap_inval_bulk_t *bulk; 3899 int count; 3900 int stop; 3901 }; 3902 3903 static int pmap_scan_cmp(pv_entry_t pv, void *data); 3904 static int pmap_scan_callback(pv_entry_t pv, void *data); 3905 3906 static void 3907 pmap_scan(struct pmap_scan_info *info, int smp_inval) 3908 { 3909 struct pmap *pmap = info->pmap; 3910 pv_entry_t pt_pv; /* A page table PV */ 3911 pv_entry_t pte_pv; /* A page table entry PV */ 3912 vm_pindex_t *pte_placemark; 3913 vm_pindex_t *pt_placemark; 3914 pt_entry_t *ptep; 3915 pt_entry_t oldpte; 3916 struct pv_entry dummy_pv; 3917 3918 info->stop = 0; 3919 if (pmap == NULL) 3920 return; 3921 if (info->sva == info->eva) 3922 return; 3923 if (smp_inval) { 3924 info->bulk = &info->bulk_core; 3925 pmap_inval_bulk_init(&info->bulk_core, pmap); 3926 } else { 3927 info->bulk = NULL; 3928 } 3929 3930 /* 3931 * Hold the token for stability; if the pmap is empty we have nothing 3932 * to do. 3933 */ 3934 #if 0 3935 if (pmap->pm_stats.resident_count == 0) { 3936 return; 3937 } 3938 #endif 3939 3940 info->count = 0; 3941 3942 /* 3943 * Special handling for scanning one page, which is a very common 3944 * operation (it is?). 3945 * 3946 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 3947 */ 3948 if (info->sva + PAGE_SIZE == info->eva) { 3949 if (info->sva >= VM_MAX_USER_ADDRESS) { 3950 /* 3951 * Kernel mappings do not track wire counts on 3952 * page table pages and only maintain pd_pv and 3953 * pte_pv levels so pmap_scan() works. 3954 */ 3955 pt_pv = NULL; 3956 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 3957 &pte_placemark); 3958 KKASSERT(pte_pv == NULL); 3959 ptep = vtopte(info->sva); 3960 } else { 3961 /* 3962 * We hold pte_placemark across the operation for 3963 * unmanaged pages. 3964 * 3965 * WARNING! We must hold pt_placemark across the 3966 * *ptep test to prevent misintepreting 3967 * a non-zero *ptep as a shared page 3968 * table page. Hold it across the function 3969 * callback as well for SMP safety. 3970 */ 3971 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 3972 &pte_placemark); 3973 KKASSERT(pte_pv == NULL); 3974 pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva), 3975 &pt_placemark); 3976 if (pt_pv == NULL) { 3977 #if 0 3978 KKASSERT(0); 3979 pd_pv = pv_get(pmap, 3980 pmap_pd_pindex(info->sva), 3981 NULL); 3982 if (pd_pv) { 3983 ptep = pv_pte_lookup(pd_pv, 3984 pmap_pt_index(info->sva)); 3985 if (*ptep) { 3986 info->func(pmap, info, 3987 pt_placemark, pd_pv, 3988 info->sva, ptep, 3989 info->arg); 3990 } else { 3991 pv_placemarker_wakeup(pmap, 3992 pt_placemark); 3993 } 3994 pv_put(pd_pv); 3995 } else { 3996 pv_placemarker_wakeup(pmap, 3997 pt_placemark); 3998 } 3999 #else 4000 pv_placemarker_wakeup(pmap, pt_placemark); 4001 #endif 4002 pv_placemarker_wakeup(pmap, pte_placemark); 4003 goto fast_skip; 4004 } 4005 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); 4006 } 4007 4008 /* 4009 * NOTE: *ptep can't be ripped out from under us if we hold 4010 * pte_pv (or pte_placemark) locked, but bits can 4011 * change. 
4012 */ 4013 oldpte = *ptep; 4014 cpu_ccfence(); 4015 if (oldpte == 0) { 4016 KKASSERT(pte_pv == NULL); 4017 pv_placemarker_wakeup(pmap, pte_placemark); 4018 } else { 4019 KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]) == 4020 pmap->pmap_bits[PG_V_IDX], 4021 ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL", 4022 *ptep, oldpte, info->sva)); 4023 info->func(pmap, info, pte_placemark, pt_pv, 4024 info->sva, ptep, info->arg); 4025 } 4026 if (pt_pv) 4027 pv_put(pt_pv); 4028 fast_skip: 4029 pmap_inval_bulk_flush(info->bulk); 4030 return; 4031 } 4032 4033 /* 4034 * Nominal scan case, RB_SCAN() for PD pages and iterate from 4035 * there. 4036 * 4037 * WARNING! eva can overflow our standard ((N + mask) >> bits) 4038 * bounds, resulting in a pd_pindex of 0. To solve the 4039 * problem we use an inclusive range. 4040 */ 4041 info->sva_pd_pindex = pmap_pd_pindex(info->sva); 4042 info->eva_pd_pindex = pmap_pd_pindex(info->eva - PAGE_SIZE); 4043 4044 if (info->sva >= VM_MAX_USER_ADDRESS) { 4045 /* 4046 * The kernel does not currently maintain any pv_entry's for 4047 * higher-level page tables. 4048 */ 4049 bzero(&dummy_pv, sizeof(dummy_pv)); 4050 dummy_pv.pv_pindex = info->sva_pd_pindex; 4051 spin_lock(&pmap->pm_spin); 4052 while (dummy_pv.pv_pindex <= info->eva_pd_pindex) { 4053 pmap_scan_callback(&dummy_pv, info); 4054 ++dummy_pv.pv_pindex; 4055 if (dummy_pv.pv_pindex < info->sva_pd_pindex) /*wrap*/ 4056 break; 4057 } 4058 spin_unlock(&pmap->pm_spin); 4059 } else { 4060 /* 4061 * User page tables maintain local PML4, PDP, PD, and PT 4062 * pv_entry's. pv_entry's are not used for PTEs. 4063 */ 4064 spin_lock(&pmap->pm_spin); 4065 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, pmap_scan_cmp, 4066 pmap_scan_callback, info); 4067 spin_unlock(&pmap->pm_spin); 4068 } 4069 pmap_inval_bulk_flush(info->bulk); 4070 } 4071 4072 /* 4073 * WARNING! pmap->pm_spin held 4074 * 4075 * WARNING! eva can overflow our standard ((N + mask) >> bits) 4076 * bounds, resulting in a pd_pindex of 0. To solve the 4077 * problem we use an inclusive range. 4078 */ 4079 static int 4080 pmap_scan_cmp(pv_entry_t pv, void *data) 4081 { 4082 struct pmap_scan_info *info = data; 4083 if (pv->pv_pindex < info->sva_pd_pindex) 4084 return(-1); 4085 if (pv->pv_pindex > info->eva_pd_pindex) 4086 return(1); 4087 return(0); 4088 } 4089 4090 /* 4091 * pmap_scan() by PDs 4092 * 4093 * WARNING! pmap->pm_spin held 4094 */ 4095 static int 4096 pmap_scan_callback(pv_entry_t pv, void *data) 4097 { 4098 struct pmap_scan_info *info = data; 4099 struct pmap *pmap = info->pmap; 4100 pv_entry_t pd_pv; /* A page directory PV */ 4101 pv_entry_t pt_pv; /* A page table PV */ 4102 vm_pindex_t *pt_placemark; 4103 pt_entry_t *ptep; 4104 pt_entry_t oldpte; 4105 vm_offset_t sva; 4106 vm_offset_t eva; 4107 vm_offset_t va_next; 4108 vm_pindex_t pd_pindex; 4109 int error; 4110 4111 /* 4112 * Stop if requested 4113 */ 4114 if (info->stop) 4115 return -1; 4116 4117 /* 4118 * Pull the PD pindex from the pv before releasing the spinlock. 4119 * 4120 * WARNING: pv is faked for kernel pmap scans. 4121 */ 4122 pd_pindex = pv->pv_pindex; 4123 spin_unlock(&pmap->pm_spin); 4124 pv = NULL; /* invalid after spinlock unlocked */ 4125 4126 /* 4127 * Calculate the page range within the PD. SIMPLE pmaps are 4128 * direct-mapped for the entire 2^64 address space. Normal pmaps 4129 * reflect the user and kernel address space which requires 4130 * cannonicalization w/regards to converting pd_pindex's back 4131 * into addresses. 
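 *
 *	 Concretely, the reconstruction below shifts the pd_pindex back
 *	 up into an address and then, for non-SIMPLE pmaps, sign-extends
 *	 it: when the PML4 sign region of the result is non-zero the
 *	 remaining high bits are forced on (sva |= PML4_SIGNMASK) so the
 *	 address lands in the canonical upper half rather than in the
 *	 non-canonical hole.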
4132 */ 4133 sva = (pd_pindex - pmap_pd_pindex(0)) << PDPSHIFT; 4134 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 && 4135 (sva & PML4_SIGNMASK)) { 4136 sva |= PML4_SIGNMASK; 4137 } 4138 eva = sva + NBPDP; /* can overflow */ 4139 if (sva < info->sva) 4140 sva = info->sva; 4141 if (eva < info->sva || eva > info->eva) 4142 eva = info->eva; 4143 4144 /* 4145 * NOTE: kernel mappings do not track page table pages, only 4146 * terminal pages. 4147 * 4148 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. 4149 * However, for the scan to be efficient we try to 4150 * cache items top-down. 4151 */ 4152 pd_pv = NULL; 4153 pt_pv = NULL; 4154 4155 for (; sva < eva; sva = va_next) { 4156 if (info->stop) 4157 break; 4158 if (sva >= VM_MAX_USER_ADDRESS) { 4159 if (pt_pv) { 4160 pv_put(pt_pv); 4161 pt_pv = NULL; 4162 } 4163 goto kernel_skip; 4164 } 4165 4166 /* 4167 * PD cache, scan shortcut if it doesn't exist. 4168 */ 4169 if (pd_pv == NULL) { 4170 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4171 } else if (pd_pv->pv_pmap != pmap || 4172 pd_pv->pv_pindex != pmap_pd_pindex(sva)) { 4173 pv_put(pd_pv); 4174 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4175 } 4176 if (pd_pv == NULL) { 4177 va_next = (sva + NBPDP) & ~PDPMASK; 4178 if (va_next < sva) 4179 va_next = eva; 4180 continue; 4181 } 4182 4183 /* 4184 * PT cache 4185 * 4186 * NOTE: The cached pt_pv can be removed from the pmap when 4187 * pmap_dynamic_delete is enabled. 4188 */ 4189 if (pt_pv && (pt_pv->pv_pmap != pmap || 4190 pt_pv->pv_pindex != pmap_pt_pindex(sva))) { 4191 pv_put(pt_pv); 4192 pt_pv = NULL; 4193 } 4194 if (pt_pv == NULL) { 4195 pt_pv = pv_get_try(pmap, pmap_pt_pindex(sva), 4196 &pt_placemark, &error); 4197 if (error) { 4198 pv_put(pd_pv); /* lock order */ 4199 pd_pv = NULL; 4200 if (pt_pv) { 4201 pv_lock(pt_pv); 4202 pv_put(pt_pv); 4203 pt_pv = NULL; 4204 } else { 4205 pv_placemarker_wait(pmap, pt_placemark); 4206 } 4207 va_next = sva; 4208 continue; 4209 } 4210 /* may have to re-check later if pt_pv is NULL here */ 4211 } 4212 4213 /* 4214 * If pt_pv is NULL we either have a shared page table 4215 * page (NOT IMPLEMENTED XXX) and must issue a callback 4216 * specific to that case, or there is no page table page. 4217 * 4218 * Either way we can skip the page table page. 4219 * 4220 * WARNING! pt_pv can also be NULL due to a pv creation 4221 * race where we find it to be NULL and then 4222 * later see a pte_pv. But its possible the pt_pv 4223 * got created inbetween the two operations, so 4224 * we must check. 4225 * 4226 * XXX This should no longer be the case because 4227 * we have pt_placemark. 4228 */ 4229 if (pt_pv == NULL) { 4230 #if 0 4231 /* XXX REMOVED */ 4232 /* 4233 * Possible unmanaged (shared from another pmap) 4234 * page table page. 4235 * 4236 * WARNING! We must hold pt_placemark across the 4237 * *ptep test to prevent misintepreting 4238 * a non-zero *ptep as a shared page 4239 * table page. Hold it across the function 4240 * callback as well for SMP safety. 4241 */ 4242 KKASSERT(0); 4243 ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); 4244 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 4245 info->func(pmap, info, pt_placemark, pd_pv, 4246 sva, ptep, info->arg); 4247 } else { 4248 pv_placemarker_wakeup(pmap, pt_placemark); 4249 } 4250 #else 4251 pv_placemarker_wakeup(pmap, pt_placemark); 4252 #endif 4253 4254 /* 4255 * Done, move to next page table page. 
4256 */ 4257 va_next = (sva + NBPDR) & ~PDRMASK; 4258 if (va_next < sva) 4259 va_next = eva; 4260 continue; 4261 } 4262 4263 /* 4264 * From this point in the loop testing pt_pv for non-NULL 4265 * means we are in UVM, else if it is NULL we are in KVM. 4266 * 4267 * Limit our scan to either the end of the va represented 4268 * by the current page table page, or to the end of the 4269 * range being removed. 4270 */ 4271 kernel_skip: 4272 va_next = (sva + NBPDR) & ~PDRMASK; 4273 if (va_next < sva) 4274 va_next = eva; 4275 if (va_next > eva) 4276 va_next = eva; 4277 4278 /* 4279 * Scan the page table for pages. Some pages may not be 4280 * managed (might not have a pv_entry). 4281 * 4282 * There is no page table management for kernel pages so 4283 * pt_pv will be NULL in that case, but otherwise pt_pv 4284 * is non-NULL, locked, and referenced. 4285 */ 4286 4287 /* 4288 * At this point a non-NULL pt_pv means a UVA, and a NULL 4289 * pt_pv means a KVA. 4290 */ 4291 if (pt_pv) 4292 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); 4293 else 4294 ptep = vtopte(sva); 4295 4296 while (sva < va_next) { 4297 vm_pindex_t *pte_placemark; 4298 pv_entry_t pte_pv; 4299 4300 /* 4301 * Yield every 64 pages, stop if requested. 4302 */ 4303 if ((++info->count & 63) == 0) 4304 lwkt_user_yield(); 4305 if (info->stop) 4306 break; 4307 4308 /* 4309 * We can shortcut our scan if *ptep == 0. This is 4310 * an unlocked check. 4311 */ 4312 if (*ptep == 0) { 4313 sva += PAGE_SIZE; 4314 ++ptep; 4315 continue; 4316 } 4317 cpu_ccfence(); 4318 4319 /* 4320 * Acquire the pte_placemark. pte_pv's won't exist 4321 * for leaf pages. 4322 * 4323 * A multitude of races are possible here so if we 4324 * cannot lock definite state we clean out our cache 4325 * and break the inner while() loop to force a loop 4326 * up to the top of the for(). 4327 * 4328 * XXX unlock/relock pd_pv, pt_pv, and re-test their 4329 * validity instead of looping up? 4330 */ 4331 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), 4332 &pte_placemark, &error); 4333 KKASSERT(pte_pv == NULL); 4334 if (error) { 4335 if (pd_pv) { 4336 pv_put(pd_pv); /* lock order */ 4337 pd_pv = NULL; 4338 } 4339 if (pt_pv) { 4340 pv_put(pt_pv); /* lock order */ 4341 pt_pv = NULL; 4342 } 4343 pv_placemarker_wait(pmap, pte_placemark); 4344 va_next = sva; /* retry */ 4345 break; 4346 } 4347 4348 /* 4349 * Reload *ptep after successfully locking the 4350 * pindex. 4351 */ 4352 cpu_ccfence(); 4353 oldpte = *ptep; 4354 if (oldpte == 0) { 4355 pv_placemarker_wakeup(pmap, pte_placemark); 4356 sva += PAGE_SIZE; 4357 ++ptep; 4358 continue; 4359 } 4360 4361 /* 4362 * We can't hold pd_pv across the callback (because 4363 * we don't pass it to the callback and the callback 4364 * might deadlock) 4365 */ 4366 if (pd_pv) { 4367 vm_page_wire_quick(pd_pv->pv_m); 4368 pv_unlock(pd_pv); 4369 } 4370 4371 /* 4372 * Ready for the callback. The locked placemarker 4373 * is consumed by the callback. 4374 */ 4375 if (oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4376 /* 4377 * Managed pte 4378 */ 4379 KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]), 4380 ("badC *ptep %016lx/%016lx sva %016lx", 4381 *ptep, oldpte, sva)); 4382 /* 4383 * We must unlock pd_pv across the callback 4384 * to avoid deadlocks on any recursive 4385 * disposal. Re-check that it still exists 4386 * after re-locking. 4387 * 4388 * Call target disposes of pte_placemark 4389 * and may destroy but will not dispose 4390 * of pt_pv. 
4391 */ 4392 info->func(pmap, info, pte_placemark, pt_pv, 4393 sva, ptep, info->arg); 4394 } else { 4395 /* 4396 * Unmanaged pte 4397 * 4398 * We must unlock pd_pv across the callback 4399 * to avoid deadlocks on any recursive 4400 * disposal. Re-check that it still exists 4401 * after re-locking. 4402 * 4403 * Call target disposes of pte_placemark 4404 * and may destroy but will not dispose 4405 * of pt_pv. 4406 */ 4407 KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]), 4408 ("badD *ptep %016lx/%016lx sva %016lx ", 4409 *ptep, oldpte, sva)); 4410 info->func(pmap, info, pte_placemark, pt_pv, 4411 sva, ptep, info->arg); 4412 } 4413 if (pd_pv) { 4414 pv_lock(pd_pv); 4415 if (vm_page_unwire_quick(pd_pv->pv_m)) { 4416 panic("pmap_scan_callback: " 4417 "bad wirecount on pd_pv"); 4418 } 4419 if (pd_pv->pv_pmap == NULL) { 4420 va_next = sva; /* retry */ 4421 break; 4422 } 4423 } 4424 4425 /* 4426 * NOTE: The cached pt_pv can be removed from the 4427 * pmap when pmap_dynamic_delete is enabled, 4428 * which will cause ptep to become stale. 4429 * 4430 * This also means that no pages remain under 4431 * the PT, so we can just break out of the inner 4432 * loop and let the outer loop clean everything 4433 * up. 4434 */ 4435 if (pt_pv && pt_pv->pv_pmap != pmap) 4436 break; 4437 sva += PAGE_SIZE; 4438 ++ptep; 4439 } 4440 } 4441 if (pd_pv) { 4442 pv_put(pd_pv); 4443 pd_pv = NULL; 4444 } 4445 if (pt_pv) { 4446 pv_put(pt_pv); 4447 pt_pv = NULL; 4448 } 4449 if ((++info->count & 7) == 0) 4450 lwkt_user_yield(); 4451 4452 /* 4453 * Relock before returning. 4454 */ 4455 spin_lock(&pmap->pm_spin); 4456 return (0); 4457 } 4458 4459 void 4460 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4461 { 4462 struct pmap_scan_info info; 4463 4464 info.pmap = pmap; 4465 info.sva = sva; 4466 info.eva = eva; 4467 info.func = pmap_remove_callback; 4468 info.arg = NULL; 4469 pmap_scan(&info, 1); 4470 #if 0 4471 cpu_invltlb(); 4472 if (eva - sva < 1024*1024) { 4473 while (sva < eva) { 4474 cpu_invlpg((void *)sva); 4475 sva += PAGE_SIZE; 4476 } 4477 } 4478 #endif 4479 } 4480 4481 static void 4482 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4483 { 4484 struct pmap_scan_info info; 4485 4486 info.pmap = pmap; 4487 info.sva = sva; 4488 info.eva = eva; 4489 info.func = pmap_remove_callback; 4490 info.arg = NULL; 4491 pmap_scan(&info, 0); 4492 } 4493 4494 static void 4495 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 4496 vm_pindex_t *pte_placemark, pv_entry_t pt_pv, 4497 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4498 { 4499 pt_entry_t pte; 4500 4501 /* 4502 * Managed or unmanaged pte (pte_placemark is non-NULL) 4503 * 4504 * pt_pv's wire_count is still bumped by unmanaged pages 4505 * so we must decrement it manually. 4506 * 4507 * We have to unwire the target page table page. 4508 */ 4509 pte = pmap_inval_bulk(info->bulk, va, ptep, 0); 4510 if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4511 vm_page_t p; 4512 4513 p = PHYS_TO_VM_PAGE(pte & PG_FRAME); 4514 KKASSERT(pte & pmap->pmap_bits[PG_V_IDX]); 4515 if (pte & pmap->pmap_bits[PG_M_IDX]) 4516 vm_page_dirty(p); 4517 if (pte & pmap->pmap_bits[PG_A_IDX]) 4518 vm_page_flag_set(p, PG_REFERENCED); 4519 4520 /* 4521 * NOTE: p is not hard-busied so it is not safe to 4522 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 4523 * transition against them being set in 4524 * pmap_enter(). 
4525 */ 4526 if (pte & pmap->pmap_bits[PG_RW_IDX]) 4527 atomic_add_long(&p->md.writeable_count, -1); 4528 pmap_page_stats_deleting( 4529 atomic_fetchadd_long(&p->md.pmap_count, -1)); 4530 } 4531 if (pte & pmap->pmap_bits[PG_V_IDX]) { 4532 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4533 if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m)) 4534 panic("pmap_remove: insufficient wirecount"); 4535 } 4536 if (pte & pmap->pmap_bits[PG_W_IDX]) 4537 atomic_add_long(&pmap->pm_stats.wired_count, -1); 4538 if (pte & pmap->pmap_bits[PG_G_IDX]) 4539 cpu_invlpg((void *)va); 4540 pv_placemarker_wakeup(pmap, pte_placemark); 4541 } 4542 4543 /* 4544 * Removes this physical page from all physical maps in which it resides. 4545 * Reflects back modify bits to the pager. 4546 * 4547 * This routine may not be called from an interrupt. 4548 * 4549 * The page must be busied by its caller, preventing new ptes from being 4550 * installed. This allows us to assert that pmap_count is zero and safely 4551 * clear the MAPPED and WRITEABLE bits upon completion. 4552 */ 4553 static 4554 void 4555 pmap_remove_all(vm_page_t m) 4556 { 4557 int retry; 4558 4559 if (!pmap_initialized) 4560 return; 4561 4562 /* 4563 * pmap_count doesn't cover fictitious pages, but PG_MAPPED does 4564 * (albeit without certain race protections). 4565 */ 4566 #if 0 4567 if (m->md.pmap_count == 0) 4568 return; 4569 #endif 4570 if ((m->flags & PG_MAPPED) == 0) 4571 return; 4572 4573 retry = ticks + hz * 60; 4574 again: 4575 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 4576 if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0)) 4577 PMAP_PAGE_BACKING_RETRY; 4578 if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) { 4579 if (ipte & ipmap->pmap_bits[PG_M_IDX]) 4580 vm_page_dirty(m); 4581 if (ipte & ipmap->pmap_bits[PG_A_IDX]) 4582 vm_page_flag_set(m, PG_REFERENCED); 4583 4584 /* 4585 * NOTE: m is not hard-busied so it is not safe to 4586 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 4587 * transition against them being set in 4588 * pmap_enter(). 4589 */ 4590 if (ipte & ipmap->pmap_bits[PG_RW_IDX]) 4591 atomic_add_long(&m->md.writeable_count, -1); 4592 pmap_page_stats_deleting( 4593 atomic_fetchadd_long(&m->md.pmap_count, -1)); 4594 } 4595 4596 /* 4597 * Cleanup various tracking counters. pt_pv can't go away 4598 * due to our wired ref. 4599 */ 4600 if (ipmap != &kernel_pmap) { 4601 pv_entry_t pt_pv; 4602 4603 spin_lock_shared(&ipmap->pm_spin); 4604 pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva)); 4605 spin_unlock_shared(&ipmap->pm_spin); 4606 4607 if (pt_pv) { 4608 if (vm_page_unwire_quick(pt_pv->pv_m)) { 4609 panic("pmap_remove_all: bad " 4610 "wire_count on pt_pv"); 4611 } 4612 atomic_add_long( 4613 &ipmap->pm_stats.resident_count, -1); 4614 } 4615 } 4616 if (ipte & ipmap->pmap_bits[PG_W_IDX]) 4617 atomic_add_long(&ipmap->pm_stats.wired_count, -1); 4618 if (ipte & ipmap->pmap_bits[PG_G_IDX]) 4619 cpu_invlpg((void *)iva); 4620 } PMAP_PAGE_BACKING_DONE; 4621 4622 /* 4623 * pmap_count should be zero but it is possible to race a pmap_enter() 4624 * replacement (see 'oldm'). Once it is zero it cannot become 4625 * non-zero because the page is hard-busied. 
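 *
 * The loop below therefore naps in 1-tick intervals waiting for any
 * racing pmap_enter() to complete, and only panics if the counts fail
 * to reach zero within roughly 60 seconds (retry was primed with
 * ticks + hz * 60 above).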
4626 */ 4627 if (m->md.pmap_count || m->md.writeable_count) { 4628 tsleep(&m->md.pmap_count, 0, "pgunm", 1); 4629 if (retry - ticks > 0) 4630 goto again; 4631 panic("pmap_remove_all: cannot return pmap_count " 4632 "to 0 (%ld, %ld)", 4633 m->md.pmap_count, m->md.writeable_count); 4634 } 4635 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 4636 } 4637 4638 /* 4639 * Removes the page from a particular pmap. 4640 * 4641 * The page must be busied by the caller. 4642 */ 4643 void 4644 pmap_remove_specific(pmap_t pmap_match, vm_page_t m) 4645 { 4646 if (!pmap_initialized) 4647 return; 4648 4649 /* 4650 * PG_MAPPED test works for both non-fictitious and fictitious pages. 4651 */ 4652 if ((m->flags & PG_MAPPED) == 0) 4653 return; 4654 4655 PMAP_PAGE_BACKING_SCAN(m, pmap_match, ipmap, iptep, ipte, iva) { 4656 if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0)) 4657 PMAP_PAGE_BACKING_RETRY; 4658 if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) { 4659 if (ipte & ipmap->pmap_bits[PG_M_IDX]) 4660 vm_page_dirty(m); 4661 if (ipte & ipmap->pmap_bits[PG_A_IDX]) 4662 vm_page_flag_set(m, PG_REFERENCED); 4663 4664 /* 4665 * NOTE: m is not hard-busied so it is not safe to 4666 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 4667 * transition against them being set in 4668 * pmap_enter(). 4669 */ 4670 if (ipte & ipmap->pmap_bits[PG_RW_IDX]) 4671 atomic_add_long(&m->md.writeable_count, -1); 4672 pmap_page_stats_deleting( 4673 atomic_fetchadd_long(&m->md.pmap_count, -1)); 4674 } 4675 4676 /* 4677 * Cleanup various tracking counters. pt_pv can't go away 4678 * due to our wired ref. 4679 */ 4680 if (ipmap != &kernel_pmap) { 4681 pv_entry_t pt_pv; 4682 4683 spin_lock_shared(&ipmap->pm_spin); 4684 pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva)); 4685 spin_unlock_shared(&ipmap->pm_spin); 4686 4687 if (pt_pv) { 4688 atomic_add_long( 4689 &ipmap->pm_stats.resident_count, -1); 4690 if (vm_page_unwire_quick(pt_pv->pv_m)) { 4691 panic("pmap_remove_specific: bad " 4692 "wire_count on pt_pv"); 4693 } 4694 } 4695 } 4696 if (ipte & ipmap->pmap_bits[PG_W_IDX]) 4697 atomic_add_long(&ipmap->pm_stats.wired_count, -1); 4698 if (ipte & ipmap->pmap_bits[PG_G_IDX]) 4699 cpu_invlpg((void *)iva); 4700 } PMAP_PAGE_BACKING_DONE; 4701 } 4702 4703 /* 4704 * Set the physical protection on the specified range of this map 4705 * as requested. This function is typically only used for debug watchpoints 4706 * and COW pages. 4707 * 4708 * This function may not be called from an interrupt if the map is 4709 * not the kernel_pmap. 4710 * 4711 * NOTE! For shared page table pages we just unmap the page. 
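 *
 * In practice the body below reduces to three cases:
 *
 * - Neither READ nor EXECUTE requested: the range is simply removed.
 * - WRITE requested: nothing to do, this routine never adds
 *   permissions.
 * - Otherwise PG_RW is cleared (and the A/M bits are synchronized
 *   back to the vm_page for managed mappings) via
 *   pmap_protect_callback().
 *
 * A typical write-protect of a range is therefore just (sketch):
 *
 *	pmap_protect(pmap, sva, eva, VM_PROT_READ);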
4712 */ 4713 void 4714 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4715 { 4716 struct pmap_scan_info info; 4717 /* JG review for NX */ 4718 4719 if (pmap == NULL) 4720 return; 4721 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) { 4722 pmap_remove(pmap, sva, eva); 4723 return; 4724 } 4725 if (prot & VM_PROT_WRITE) 4726 return; 4727 info.pmap = pmap; 4728 info.sva = sva; 4729 info.eva = eva; 4730 info.func = pmap_protect_callback; 4731 info.arg = &prot; 4732 pmap_scan(&info, 1); 4733 } 4734 4735 static 4736 void 4737 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 4738 vm_pindex_t *pte_placemark, 4739 pv_entry_t pt_pv, vm_offset_t va, 4740 pt_entry_t *ptep, void *arg __unused) 4741 { 4742 pt_entry_t pbits; 4743 pt_entry_t cbits; 4744 vm_page_t m; 4745 4746 again: 4747 pbits = *ptep; 4748 cpu_ccfence(); 4749 cbits = pbits; 4750 if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) { 4751 cbits &= ~pmap->pmap_bits[PG_A_IDX]; 4752 cbits &= ~pmap->pmap_bits[PG_M_IDX]; 4753 } 4754 /* else unmanaged page, adjust bits, no wire changes */ 4755 4756 if (ptep) { 4757 cbits &= ~pmap->pmap_bits[PG_RW_IDX]; 4758 #ifdef PMAP_DEBUG2 4759 if (pmap_enter_debug > 0) { 4760 --pmap_enter_debug; 4761 kprintf("pmap_protect va=%lx ptep=%p " 4762 "pt_pv=%p cbits=%08lx\n", 4763 va, ptep, pt_pv, cbits 4764 ); 4765 } 4766 #endif 4767 if (pbits != cbits) { 4768 if (!pmap_inval_smp_cmpset(pmap, va, 4769 ptep, pbits, cbits)) { 4770 goto again; 4771 } 4772 } 4773 if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) { 4774 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4775 if (pbits & pmap->pmap_bits[PG_A_IDX]) 4776 vm_page_flag_set(m, PG_REFERENCED); 4777 if (pbits & pmap->pmap_bits[PG_M_IDX]) 4778 vm_page_dirty(m); 4779 if (pbits & pmap->pmap_bits[PG_RW_IDX]) 4780 atomic_add_long(&m->md.writeable_count, -1); 4781 4782 } 4783 } 4784 pv_placemarker_wakeup(pmap, pte_placemark); 4785 } 4786 4787 /* 4788 * Insert the vm_page (m) at the virtual address (va), replacing any prior 4789 * mapping at that address. Set protection and wiring as requested. 4790 * 4791 * If entry is non-NULL we check to see if the SEG_SIZE optimization is 4792 * possible. If it is we enter the page into the appropriate shared pmap 4793 * hanging off the related VM object instead of the passed pmap, then we 4794 * share the page table page from the VM object's pmap into the current pmap. 4795 * 4796 * NOTE: This routine MUST insert the page into the pmap now, it cannot 4797 * lazy-evaluate. 
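 *
 * Illustrative call as a fault path might issue it (sketch only;
 * compare pmap_object_init_pt_callback() further below):
 *
 *	pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, entry);
 *
 * where 'm' is the (soft- or hard-busied) vm_page backing 'va' and
 * 'entry' is the vm_map_entry consulted for the optional page table
 * sharing check described above.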
4798 */ 4799 void 4800 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4801 boolean_t wired, vm_map_entry_t entry) 4802 { 4803 pv_entry_t pt_pv; /* page table */ 4804 pv_entry_t pte_pv; /* page table entry */ 4805 vm_pindex_t *pte_placemark; 4806 pt_entry_t *ptep; 4807 pt_entry_t origpte; 4808 vm_paddr_t opa; 4809 vm_page_t oldm; 4810 pt_entry_t newpte; 4811 vm_paddr_t pa; 4812 4813 if (pmap == NULL) 4814 return; 4815 va = trunc_page(va); 4816 #ifdef PMAP_DIAGNOSTIC 4817 if (va >= KvaEnd) 4818 panic("pmap_enter: toobig"); 4819 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 4820 panic("pmap_enter: invalid to pmap_enter page table " 4821 "pages (va: 0x%lx)", va); 4822 #endif 4823 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 4824 kprintf("Warning: pmap_enter called on UVA with " 4825 "kernel_pmap\n"); 4826 #ifdef DDB 4827 db_print_backtrace(); 4828 #endif 4829 } 4830 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 4831 kprintf("Warning: pmap_enter called on KVA without" 4832 "kernel_pmap\n"); 4833 #ifdef DDB 4834 db_print_backtrace(); 4835 #endif 4836 } 4837 4838 /* 4839 * Get the locked page table page (pt_pv) for our new page table 4840 * entry, allocating it if necessary. 4841 * 4842 * There is no pte_pv for a terminal pte so the terminal pte will 4843 * be locked via pte_placemark. 4844 * 4845 * Only MMU actions by the CPU itself can modify the ptep out from 4846 * under us. 4847 * 4848 * If the pmap is still being initialized we assume existing 4849 * page tables. 4850 * 4851 * NOTE: Kernel mapppings do not track page table pages 4852 * (i.e. there is no pt_pv pt_pv structure). 4853 * 4854 * NOTE: origpte here is 'tentative', used only to check for 4855 * the degenerate case where the entry already exists and 4856 * matches. 4857 */ 4858 if (pmap_initialized == FALSE) { 4859 pte_pv = NULL; 4860 pt_pv = NULL; 4861 pte_placemark = NULL; 4862 ptep = vtopte(va); 4863 origpte = *ptep; 4864 } else { 4865 pte_pv = pv_get(pmap, pmap_pte_pindex(va), &pte_placemark); 4866 KKASSERT(pte_pv == NULL); 4867 if (va >= VM_MAX_USER_ADDRESS) { 4868 pt_pv = NULL; 4869 ptep = vtopte(va); 4870 } else { 4871 pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL); 4872 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4873 } 4874 origpte = *ptep; 4875 cpu_ccfence(); 4876 } 4877 4878 pa = VM_PAGE_TO_PHYS(m); 4879 4880 /* 4881 * Calculate the new PTE. 4882 */ 4883 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | 4884 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]); 4885 if (wired) 4886 newpte |= pmap->pmap_bits[PG_W_IDX]; 4887 if (va < VM_MAX_USER_ADDRESS) 4888 newpte |= pmap->pmap_bits[PG_U_IDX]; 4889 if ((m->flags & PG_FICTITIOUS) == 0) 4890 newpte |= pmap->pmap_bits[PG_MANAGED_IDX]; 4891 // if (pmap == &kernel_pmap) 4892 // newpte |= pgeflag; 4893 newpte |= pmap->pmap_cache_bits[m->pat_mode]; 4894 4895 /* 4896 * It is possible for multiple faults to occur in threaded 4897 * environments, the existing pte might be correct. 4898 */ 4899 if (((origpte ^ newpte) & 4900 ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] | 4901 pmap->pmap_bits[PG_A_IDX])) == 0) { 4902 goto done; 4903 } 4904 4905 /* 4906 * Adjust page flags. The page is soft-busied or hard-busied, we 4907 * should be able to safely set PG_* flag bits even with the (shared) 4908 * soft-busy. 4909 * 4910 * The pmap_count and writeable_count is only tracked for 4911 * non-fictitious pages. As a bit of a safety, bump pmap_count 4912 * and set the PG_* bits before mapping the page. 
If another part 4913 * of the system does not properly hard-busy the page (against our 4914 * soft-busy or hard-busy) in order to remove mappings it might not 4915 * see the pte that we are about to add and thus will not be able to 4916 * drop pmap_count to 0. 4917 * 4918 * The PG_MAPPED and PG_WRITEABLE flags are set for any type of page. 4919 * 4920 * NOTE! PG_MAPPED and PG_WRITEABLE can only be cleared when 4921 * the page is hard-busied AND pmap_count is 0. This 4922 * interlocks our setting of the flags here. 4923 */ 4924 /*vm_page_spin_lock(m);*/ 4925 if ((m->flags & PG_FICTITIOUS) == 0) { 4926 pmap_page_stats_adding( 4927 atomic_fetchadd_long(&m->md.pmap_count, 1)); 4928 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 4929 atomic_add_long(&m->md.writeable_count, 1); 4930 } 4931 if (newpte & pmap->pmap_bits[PG_RW_IDX]) { 4932 if ((m->flags & (PG_MAPPED | PG_WRITEABLE)) == 0) 4933 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 4934 } else { 4935 if ((m->flags & PG_MAPPED) == 0) 4936 vm_page_flag_set(m, PG_MAPPED); 4937 } 4938 /*vm_page_spin_unlock(m);*/ 4939 4940 /* 4941 * A race can develop when replacing an existing mapping. The new 4942 * page has been busied and the pte is placemark-locked, but the 4943 * old page is could be ripped out from under us at any time by 4944 * a backing scan. 4945 * 4946 * The race is handled by having the backing scans check pmap_count 4947 * writeable_count when doing operations that should ensure one 4948 * becomes 0. 4949 */ 4950 opa = origpte & PG_FRAME; 4951 if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) { 4952 oldm = PHYS_TO_VM_PAGE(opa); 4953 KKASSERT(opa == oldm->phys_addr); 4954 KKASSERT(entry != NULL); 4955 } else { 4956 oldm = NULL; 4957 } 4958 4959 /* 4960 * Swap the new and old PTEs and perform any necessary SMP 4961 * synchronization. 4962 */ 4963 if ((prot & VM_PROT_NOSYNC) || (opa == 0 && pt_pv != NULL)) { 4964 /* 4965 * Explicitly permitted to avoid pmap cpu mask synchronization 4966 * or the prior content of a non-kernel-related pmap was 4967 * invalid. 4968 */ 4969 origpte = atomic_swap_long(ptep, newpte); 4970 if (opa) 4971 cpu_invlpg((void *)va); 4972 } else { 4973 /* 4974 * Not permitted to avoid pmap cpu mask synchronization 4975 * or there prior content being replaced or this is a kernel 4976 * related pmap. 4977 * 4978 * Due to other kernel optimizations, we cannot assume a 4979 * 0->non_zero transition of *ptep can be done with a swap. 4980 */ 4981 origpte = pmap_inval_smp(pmap, va, 1, ptep, newpte); 4982 } 4983 opa = origpte & PG_FRAME; 4984 4985 #ifdef PMAP_DEBUG2 4986 if (pmap_enter_debug > 0) { 4987 --pmap_enter_debug; 4988 kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p" 4989 " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n", 4990 va, m, 4991 origpte, newpte, ptep, 4992 pte_pv, pt_pv, opa, prot); 4993 } 4994 #endif 4995 4996 /* 4997 * Account for the changes in the pt_pv and pmap. 4998 * 4999 * Retain the same wiring count due to replacing an existing page, 5000 * or bump the wiring count for a new page. 5001 */ 5002 if (pt_pv && opa == 0) { 5003 vm_page_wire_quick(pt_pv->pv_m); 5004 atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1); 5005 } 5006 if (wired && (origpte & pmap->pmap_bits[PG_W_IDX]) == 0) 5007 atomic_add_long(&pmap->pm_stats.wired_count, 1); 5008 5009 /* 5010 * Account for the removal of the old page. pmap and pt_pv stats 5011 * have already been fully adjusted for both. 5012 * 5013 * WARNING! oldm is not soft or hard-busied. 
The pte at worst can 5014 * only be removed out from under us since we hold the 5015 * placemarker. So if it is still there, it must not have 5016 * changed. 5017 */ 5018 if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) { 5019 KKASSERT(oldm == PHYS_TO_VM_PAGE(opa)); 5020 if (origpte & pmap->pmap_bits[PG_M_IDX]) 5021 vm_page_dirty(oldm); 5022 if (origpte & pmap->pmap_bits[PG_A_IDX]) 5023 vm_page_flag_set(oldm, PG_REFERENCED); 5024 5025 /* 5026 * NOTE: oldm is not hard-busied so it is not safe to 5027 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 5028 * transition against them being set in 5029 * pmap_enter(). 5030 */ 5031 if (origpte & pmap->pmap_bits[PG_RW_IDX]) 5032 atomic_add_long(&oldm->md.writeable_count, -1); 5033 pmap_page_stats_deleting( 5034 atomic_fetchadd_long(&oldm->md.pmap_count, -1)); 5035 } 5036 5037 done: 5038 KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 || 5039 (m->flags & PG_MAPPED)); 5040 5041 /* 5042 * Cleanup the pv entry, allowing other accessors. If the new page 5043 * is not managed but we have a pte_pv (which was locking our 5044 * operation), we can free it now. pte_pv->pv_m should be NULL. 5045 */ 5046 if (pte_placemark) 5047 pv_placemarker_wakeup(pmap, pte_placemark); 5048 if (pt_pv) 5049 pv_put(pt_pv); 5050 } 5051 5052 /* 5053 * Make a temporary mapping for a physical address. This is only intended 5054 * to be used for panic dumps. 5055 * 5056 * The caller is responsible for calling smp_invltlb(). 5057 */ 5058 void * 5059 pmap_kenter_temporary(vm_paddr_t pa, long i) 5060 { 5061 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 5062 return ((void *)crashdumpmap); 5063 } 5064 5065 #if 0 5066 #define MAX_INIT_PT (96) 5067 5068 /* 5069 * This routine preloads the ptes for a given object into the specified pmap. 5070 * This eliminates the blast of soft faults on process startup and 5071 * immediately after an mmap. 5072 */ 5073 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 5074 #endif 5075 5076 void 5077 pmap_object_init_pt(pmap_t pmap, vm_map_entry_t entry, 5078 vm_offset_t addr, vm_size_t size, int limit) 5079 { 5080 #if 0 5081 vm_prot_t prot = entry->protection; 5082 vm_object_t object = entry->ba.object; 5083 vm_pindex_t pindex = atop(entry->ba.offset + (addr - entry->ba.start)); 5084 struct rb_vm_page_scan_info info; 5085 struct lwp *lp; 5086 vm_size_t psize; 5087 5088 /* 5089 * We can't preinit if read access isn't set or there is no pmap 5090 * or object. 5091 */ 5092 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 5093 return; 5094 5095 /* 5096 * We can't preinit if the pmap is not the current pmap 5097 */ 5098 lp = curthread->td_lwp; 5099 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 5100 return; 5101 5102 /* 5103 * Misc additional checks 5104 */ 5105 psize = x86_64_btop(size); 5106 5107 if ((object->type != OBJT_VNODE) || 5108 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 5109 (object->resident_page_count > MAX_INIT_PT))) { 5110 return; 5111 } 5112 5113 if (pindex + psize > object->size) { 5114 if (object->size < pindex) 5115 return; 5116 psize = object->size - pindex; 5117 } 5118 5119 if (psize == 0) 5120 return; 5121 5122 /* 5123 * If everything is segment-aligned do not pre-init here. Instead 5124 * allow the normal vm_fault path to pass a segment hint to 5125 * pmap_enter() which will then use an object-referenced shared 5126 * page table page. 
5127 */ 5128 if ((addr & SEG_MASK) == 0 && 5129 (ctob(psize) & SEG_MASK) == 0 && 5130 (ctob(pindex) & SEG_MASK) == 0) { 5131 return; 5132 } 5133 5134 /* 5135 * Use a red-black scan to traverse the requested range and load 5136 * any valid pages found into the pmap. 5137 * 5138 * We cannot safely scan the object's memq without holding the 5139 * object token. 5140 */ 5141 info.start_pindex = pindex; 5142 info.end_pindex = pindex + psize - 1; 5143 info.limit = limit; 5144 info.mpte = NULL; 5145 info.addr = addr; 5146 info.pmap = pmap; 5147 info.object = object; 5148 info.entry = entry; 5149 5150 /* 5151 * By using the NOLK scan, the callback function must be sure 5152 * to return -1 if the VM page falls out of the object. 5153 */ 5154 vm_object_hold_shared(object); 5155 vm_page_rb_tree_RB_SCAN_NOLK(&object->rb_memq, rb_vm_page_scancmp, 5156 pmap_object_init_pt_callback, &info); 5157 vm_object_drop(object); 5158 #endif 5159 } 5160 5161 #if 0 5162 5163 static 5164 int 5165 pmap_object_init_pt_callback(vm_page_t p, void *data) 5166 { 5167 struct rb_vm_page_scan_info *info = data; 5168 vm_pindex_t rel_index; 5169 int hard_busy; 5170 5171 /* 5172 * don't allow an madvise to blow away our really 5173 * free pages allocating pv entries. 5174 */ 5175 if ((info->limit & MAP_PREFAULT_MADVISE) && 5176 vmstats.v_free_count < vmstats.v_free_reserved) { 5177 return(-1); 5178 } 5179 5180 /* 5181 * Ignore list markers and ignore pages we cannot instantly 5182 * busy (while holding the object token). 5183 */ 5184 if (p->flags & PG_MARKER) 5185 return 0; 5186 hard_busy = 0; 5187 again: 5188 if (hard_busy) { 5189 if (vm_page_busy_try(p, TRUE)) 5190 return 0; 5191 } else { 5192 if (vm_page_sbusy_try(p)) 5193 return 0; 5194 } 5195 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 5196 (p->flags & PG_FICTITIOUS) == 0) { 5197 if ((p->queue - p->pc) == PQ_CACHE) { 5198 if (hard_busy == 0) { 5199 vm_page_sbusy_drop(p); 5200 hard_busy = 1; 5201 goto again; 5202 } 5203 vm_page_deactivate(p); 5204 } 5205 rel_index = p->pindex - info->start_pindex; 5206 pmap_enter(info->pmap, info->addr + x86_64_ptob(rel_index), p, 5207 VM_PROT_READ, FALSE, info->entry); 5208 } 5209 if (hard_busy) 5210 vm_page_wakeup(p); 5211 else 5212 vm_page_sbusy_drop(p); 5213 5214 /* 5215 * We are using an unlocked scan (that is, the scan expects its 5216 * current element to remain in the tree on return). So we have 5217 * to check here and abort the scan if it isn't. 5218 */ 5219 if (p->object != info->object) 5220 return -1; 5221 lwkt_yield(); 5222 return(0); 5223 } 5224 5225 #endif 5226 5227 /* 5228 * Return TRUE if the pmap is in shape to trivially pre-fault the specified 5229 * address. 5230 * 5231 * Returns FALSE if it would be non-trivial or if a pte is already loaded 5232 * into the slot. 5233 * 5234 * The address must reside within a vm_map mapped range to ensure that the 5235 * page table doesn't get ripped out from under us. 5236 * 5237 * XXX This is safe only because page table pages are not freed. 5238 */ 5239 int 5240 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 5241 { 5242 pt_entry_t *pte; 5243 5244 /*spin_lock(&pmap->pm_spin);*/ 5245 if ((pte = pmap_pte(pmap, addr)) != NULL) { 5246 if (*pte & pmap->pmap_bits[PG_V_IDX]) { 5247 /*spin_unlock(&pmap->pm_spin);*/ 5248 return FALSE; 5249 } 5250 } 5251 /*spin_unlock(&pmap->pm_spin);*/ 5252 return TRUE; 5253 } 5254 5255 /* 5256 * Change the wiring attribute for a pmap/va pair. The mapping must already 5257 * exist in the pmap. The mapping may or may not be managed. 
The wiring in 5258 * the page is not changed, the page is returned so the caller can adjust 5259 * its wiring (the page is not locked in any way). 5260 * 5261 * Wiring is not a hardware characteristic so there is no need to invalidate 5262 * TLB. However, in an SMP environment we must use a locked bus cycle to 5263 * update the pte (if we are not using the pmap_inval_*() API that is)... 5264 * it's ok to do this for simple wiring changes. 5265 */ 5266 vm_page_t 5267 pmap_unwire(pmap_t pmap, vm_offset_t va) 5268 { 5269 pt_entry_t *ptep; 5270 pv_entry_t pt_pv; 5271 vm_paddr_t pa; 5272 vm_page_t m; 5273 5274 if (pmap == NULL) 5275 return NULL; 5276 5277 /* 5278 * Assume elements in the kernel pmap are stable 5279 */ 5280 if (pmap == &kernel_pmap) { 5281 if (pmap_pt(pmap, va) == 0) 5282 return NULL; 5283 ptep = pmap_pte_quick(pmap, va); 5284 if (pmap_pte_v(pmap, ptep)) { 5285 if (pmap_pte_w(pmap, ptep)) 5286 atomic_add_long(&pmap->pm_stats.wired_count,-1); 5287 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5288 pa = *ptep & PG_FRAME; 5289 m = PHYS_TO_VM_PAGE(pa); 5290 } else { 5291 m = NULL; 5292 } 5293 } else { 5294 /* 5295 * We can only [un]wire pmap-local pages (we cannot wire 5296 * shared pages) 5297 */ 5298 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 5299 if (pt_pv == NULL) 5300 return NULL; 5301 5302 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 5303 if ((*ptep & pmap->pmap_bits[PG_V_IDX]) == 0) { 5304 pv_put(pt_pv); 5305 return NULL; 5306 } 5307 5308 if (pmap_pte_w(pmap, ptep)) { 5309 atomic_add_long(&pt_pv->pv_pmap->pm_stats.wired_count, 5310 -1); 5311 } 5312 /* XXX else return NULL so caller doesn't unwire m ? */ 5313 5314 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5315 5316 pa = *ptep & PG_FRAME; 5317 m = PHYS_TO_VM_PAGE(pa); /* held by wired count */ 5318 pv_put(pt_pv); 5319 } 5320 return m; 5321 } 5322 5323 /* 5324 * Copy the range specified by src_addr/len from the source map to 5325 * the range dst_addr/len in the destination map. 5326 * 5327 * This routine is only advisory and need not do anything. 5328 */ 5329 void 5330 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 5331 vm_size_t len, vm_offset_t src_addr) 5332 { 5333 } 5334 5335 /* 5336 * pmap_zero_page: 5337 * 5338 * Zero the specified physical page. 5339 * 5340 * This function may be called from an interrupt and no locking is 5341 * required. 5342 */ 5343 void 5344 pmap_zero_page(vm_paddr_t phys) 5345 { 5346 vm_offset_t va = PHYS_TO_DMAP(phys); 5347 5348 pagezero((void *)va); 5349 } 5350 5351 /* 5352 * pmap_zero_page: 5353 * 5354 * Zero part of a physical page by mapping it into memory and clearing 5355 * its contents with bzero. 5356 * 5357 * off and size may not cover an area beyond a single hardware page. 5358 */ 5359 void 5360 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 5361 { 5362 vm_offset_t virt = PHYS_TO_DMAP(phys); 5363 5364 bzero((char *)virt + off, size); 5365 } 5366 5367 /* 5368 * pmap_copy_page: 5369 * 5370 * Copy the physical page from the source PA to the target PA. 5371 * This function may be called from an interrupt. No locking 5372 * is required. 5373 */ 5374 void 5375 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 5376 { 5377 vm_offset_t src_virt, dst_virt; 5378 5379 src_virt = PHYS_TO_DMAP(src); 5380 dst_virt = PHYS_TO_DMAP(dst); 5381 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 5382 } 5383 5384 /* 5385 * pmap_copy_page_frag: 5386 * 5387 * Copy the physical page from the source PA to the target PA. 
5388 * This function may be called from an interrupt. No locking 5389 * is required. 5390 */ 5391 void 5392 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 5393 { 5394 vm_offset_t src_virt, dst_virt; 5395 5396 src_virt = PHYS_TO_DMAP(src); 5397 dst_virt = PHYS_TO_DMAP(dst); 5398 5399 bcopy((char *)src_virt + (src & PAGE_MASK), 5400 (char *)dst_virt + (dst & PAGE_MASK), 5401 bytes); 5402 } 5403 5404 /* 5405 * Remove all pages from specified address space this aids process exit 5406 * speeds. Also, this code may be special cased for the current process 5407 * only. 5408 */ 5409 void 5410 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5411 { 5412 pmap_remove_noinval(pmap, sva, eva); 5413 cpu_invltlb(); 5414 } 5415 5416 /* 5417 * pmap_testbit tests bits in pte's note that the testbit/clearbit 5418 * routines are inline, and a lot of things compile-time evaluate. 5419 */ 5420 static 5421 boolean_t 5422 pmap_testbit(vm_page_t m, int bit) 5423 { 5424 int res = FALSE; 5425 5426 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5427 return FALSE; 5428 /* 5429 * Nothing to do if all the mappings are already read-only. 5430 * The page's [M]odify bits have already been synchronized 5431 * to the vm_page_t and cleaned out. 5432 */ 5433 if (bit == PG_M_IDX && m->md.writeable_count == 0) 5434 return FALSE; 5435 5436 /* 5437 * Iterate the mapping 5438 */ 5439 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5440 if (ipte & ipmap->pmap_bits[bit]) { 5441 res = TRUE; 5442 break; 5443 } 5444 } PMAP_PAGE_BACKING_DONE; 5445 return res; 5446 } 5447 5448 /* 5449 * This routine is used to modify bits in ptes. Only one bit should be 5450 * specified. PG_RW requires special handling. This call works with 5451 * any sort of mapped page. PG_FICTITIOUS pages might not be optimal. 5452 * 5453 * Caller must NOT hold any spin locks 5454 * Caller must hold (m) hard-busied 5455 * 5456 * NOTE: When clearing PG_M we could also (not implemented) drop 5457 * through to the PG_RW code and clear PG_RW too, forcing 5458 * a fault on write to redetect PG_M for virtual kernels, but 5459 * it isn't necessary since virtual kernels invalidate the 5460 * pte when they clear the VPTE_M bit in their virtual page 5461 * tables. 5462 * 5463 * NOTE: Does not re-dirty the page when clearing only PG_M. 5464 * 5465 * NOTE: Because we do not lock the pv, *pte can be in a state of 5466 * flux. Despite this the value of *pte is still somewhat 5467 * related while we hold the vm_page spin lock. 5468 * 5469 * *pte can be zero due to this race. Since we are clearing 5470 * bits we basically do no harm when this race occurs. 5471 */ 5472 static __inline 5473 void 5474 pmap_clearbit(vm_page_t m, int bit_index) 5475 { 5476 pt_entry_t npte; 5477 int retry; 5478 5479 /* 5480 * Too early in the boot 5481 */ 5482 if (!pmap_initialized) { 5483 if (bit_index == PG_RW_IDX) 5484 vm_page_flag_clear(m, PG_WRITEABLE); 5485 return; 5486 } 5487 5488 /* 5489 * Being asked to clear other random bits, we don't track them 5490 * so we have to iterate. 5491 */ 5492 if (bit_index != PG_RW_IDX) { 5493 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5494 if (ipte & ipmap->pmap_bits[bit_index]) { 5495 atomic_clear_long(iptep, 5496 ipmap->pmap_bits[bit_index]); 5497 } 5498 } PMAP_PAGE_BACKING_DONE; 5499 return; 5500 } 5501 5502 /* 5503 * Being asked to clear the RW bit. 
5504 * 5505 * Nothing to do if all the mappings are already read-only 5506 */ 5507 if (m->md.writeable_count == 0) 5508 return; 5509 5510 /* 5511 * Iterate the mappings and check. 5512 */ 5513 retry = ticks + hz * 60; 5514 again: 5515 /* 5516 * Clear PG_RW. This also clears PG_M and marks the page dirty if 5517 * PG_M was set. 5518 * 5519 * Since the caller holds the page hard-busied we can safely clear 5520 * PG_WRITEABLE, and callers expect us to for the PG_RW_IDX path. 5521 */ 5522 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5523 #if 0 5524 if ((ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) == 0) 5525 continue; 5526 #endif 5527 if ((ipte & ipmap->pmap_bits[PG_RW_IDX]) == 0) 5528 continue; 5529 npte = ipte & ~(ipmap->pmap_bits[PG_RW_IDX] | 5530 ipmap->pmap_bits[PG_M_IDX]); 5531 if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, npte)) 5532 PMAP_PAGE_BACKING_RETRY; 5533 if (ipte & ipmap->pmap_bits[PG_M_IDX]) 5534 vm_page_dirty(m); 5535 5536 /* 5537 * NOTE: m is not hard-busied so it is not safe to 5538 * clear PG_WRITEABLE on the 1->0 transition 5539 * against it being set in pmap_enter(). 5540 * 5541 * pmap_count and writeable_count are only applicable 5542 * to non-fictitious pages (PG_MANAGED_IDX from pte) 5543 */ 5544 if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) 5545 atomic_add_long(&m->md.writeable_count, -1); 5546 } PMAP_PAGE_BACKING_DONE; 5547 5548 /* 5549 * writeable_count should be zero but it is possible to race 5550 * a pmap_enter() replacement (see 'oldm'). Once it is zero 5551 * it cannot become non-zero because the page is hard-busied. 5552 */ 5553 if (m->md.writeable_count != 0) { 5554 tsleep(&m->md.writeable_count, 0, "pgwab", 1); 5555 if (retry - ticks > 0) 5556 goto again; 5557 panic("pmap_remove_all: cannot return writeable_count " 5558 "to 0 (%ld)", 5559 m->md.writeable_count); 5560 } 5561 vm_page_flag_clear(m, PG_WRITEABLE); 5562 } 5563 5564 /* 5565 * Lower the permission for all mappings to a given page. 5566 * 5567 * Page must be hard-busied by caller. Because the page is busied by the 5568 * caller, this should not be able to race a pmap_enter(). 5569 */ 5570 void 5571 pmap_page_protect(vm_page_t m, vm_prot_t prot) 5572 { 5573 /* JG NX support? */ 5574 if ((prot & VM_PROT_WRITE) == 0) { 5575 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 5576 /* 5577 * NOTE: pmap_clearbit(.. PG_RW) also clears 5578 * the PG_WRITEABLE flag in (m). 5579 */ 5580 pmap_clearbit(m, PG_RW_IDX); 5581 } else { 5582 pmap_remove_all(m); 5583 } 5584 } 5585 } 5586 5587 vm_paddr_t 5588 pmap_phys_address(vm_pindex_t ppn) 5589 { 5590 return (x86_64_ptob(ppn)); 5591 } 5592 5593 /* 5594 * Return a count of reference bits for a page, clearing those bits. 5595 * It is not necessary for every reference bit to be cleared, but it 5596 * is necessary that 0 only be returned when there are truly no 5597 * reference bits set. 5598 * 5599 * XXX: The exact number of bits to check and clear is a matter that 5600 * should be tested and standardized at some point in the future for 5601 * optimal aging of shared pages. 5602 * 5603 * This routine may not block. 
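 *
 * As implemented below, the backing scan clears PG_A in at most a
 * handful of mappings per call (it stops once the count exceeds 4),
 * so callers should treat the return value as a sample rather than
 * an exact count of referencing mappings.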
5604 */ 5605 int 5606 pmap_ts_referenced(vm_page_t m) 5607 { 5608 int rval = 0; 5609 pt_entry_t npte; 5610 5611 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5612 return rval; 5613 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5614 if (ipte & ipmap->pmap_bits[PG_A_IDX]) { 5615 npte = ipte & ~ipmap->pmap_bits[PG_A_IDX]; 5616 if (!atomic_cmpset_long(iptep, ipte, npte)) 5617 PMAP_PAGE_BACKING_RETRY; 5618 ++rval; 5619 if (rval > 4) 5620 break; 5621 } 5622 } PMAP_PAGE_BACKING_DONE; 5623 return rval; 5624 } 5625 5626 /* 5627 * pmap_is_modified: 5628 * 5629 * Return whether or not the specified physical page was modified 5630 * in any physical maps. 5631 */ 5632 boolean_t 5633 pmap_is_modified(vm_page_t m) 5634 { 5635 boolean_t res; 5636 5637 res = pmap_testbit(m, PG_M_IDX); 5638 return (res); 5639 } 5640 5641 /* 5642 * Clear the modify bit on the vm_page. 5643 * 5644 * The page must be hard-busied. 5645 */ 5646 void 5647 pmap_clear_modify(vm_page_t m) 5648 { 5649 pmap_clearbit(m, PG_M_IDX); 5650 } 5651 5652 /* 5653 * pmap_clear_reference: 5654 * 5655 * Clear the reference bit on the specified physical page. 5656 */ 5657 void 5658 pmap_clear_reference(vm_page_t m) 5659 { 5660 pmap_clearbit(m, PG_A_IDX); 5661 } 5662 5663 /* 5664 * Miscellaneous support routines follow 5665 */ 5666 5667 static 5668 void 5669 x86_64_protection_init(void) 5670 { 5671 uint64_t *kp; 5672 int prot; 5673 5674 /* 5675 * NX supported? (boot time loader.conf override only) 5676 * 5677 * -1 Automatic (sets mode 1) 5678 * 0 Disabled 5679 * 1 NX implemented, differentiates PROT_READ vs PROT_READ|PROT_EXEC 5680 * 2 NX implemented for all cases 5681 */ 5682 TUNABLE_INT_FETCH("machdep.pmap_nx_enable", &pmap_nx_enable); 5683 if ((amd_feature & AMDID_NX) == 0) { 5684 pmap_bits_default[PG_NX_IDX] = 0; 5685 pmap_nx_enable = 0; 5686 } else if (pmap_nx_enable < 0) { 5687 pmap_nx_enable = 1; /* default to mode 1 (READ) */ 5688 } 5689 5690 /* 5691 * 0 is basically read-only access, but also set the NX (no-execute) 5692 * bit when VM_PROT_EXECUTE is not specified. 5693 */ 5694 kp = protection_codes; 5695 for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) { 5696 switch (prot) { 5697 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 5698 /* 5699 * This case handled elsewhere 5700 */ 5701 *kp = 0; 5702 break; 5703 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 5704 /* 5705 * Read-only is 0|NX (pmap_nx_enable mode >= 1) 5706 */ 5707 if (pmap_nx_enable >= 1) 5708 *kp = pmap_bits_default[PG_NX_IDX]; 5709 break; 5710 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 5711 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 5712 /* 5713 * Execute requires read access 5714 */ 5715 *kp = 0; 5716 break; 5717 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 5718 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 5719 /* 5720 * Write without execute is RW|NX 5721 * (pmap_nx_enable mode >= 2) 5722 */ 5723 *kp = pmap_bits_default[PG_RW_IDX]; 5724 if (pmap_nx_enable >= 2) 5725 *kp |= pmap_bits_default[PG_NX_IDX]; 5726 break; 5727 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 5728 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 5729 /* 5730 * Write with execute is RW 5731 */ 5732 *kp = pmap_bits_default[PG_RW_IDX]; 5733 break; 5734 } 5735 ++kp; 5736 } 5737 } 5738 5739 /* 5740 * Map a set of physical memory pages into the kernel virtual 5741 * address space. Return a pointer to where it is mapped. This 5742 * routine is intended to be used for mapping device memory, 5743 * NOT real memory. 
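 *
 * A driver would typically pair these routines as in the following
 * sketch (illustrative only; 'bar_pa' and 'bar_size' are placeholder
 * names for a device's physical range):
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev_attr(bar_pa, bar_size, PAT_UNCACHEABLE);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, bar_size);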
5744 * 5745 * NOTE: We can't use pgeflag unless we invalidate the pages one at 5746 * a time. 5747 * 5748 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE} 5749 * work whether the cpu supports PAT or not. The remaining PAT 5750 * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu 5751 * supports PAT. 5752 */ 5753 void * 5754 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5755 { 5756 return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5757 } 5758 5759 void * 5760 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 5761 { 5762 return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5763 } 5764 5765 void * 5766 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5767 { 5768 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5769 } 5770 5771 /* 5772 * Map a set of physical memory pages into the kernel virtual 5773 * address space. Return a pointer to where it is mapped. This 5774 * routine is intended to be used for mapping device memory, 5775 * NOT real memory. 5776 */ 5777 void * 5778 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5779 { 5780 vm_offset_t va, tmpva, offset; 5781 pt_entry_t *pte; 5782 vm_size_t tmpsize; 5783 5784 offset = pa & PAGE_MASK; 5785 size = roundup(offset + size, PAGE_SIZE); 5786 5787 va = kmem_alloc_nofault(&kernel_map, size, VM_SUBSYS_MAPDEV, PAGE_SIZE); 5788 if (va == 0) 5789 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 5790 5791 pa = pa & ~PAGE_MASK; 5792 for (tmpva = va, tmpsize = size; tmpsize > 0;) { 5793 pte = vtopte(tmpva); 5794 *pte = pa | 5795 kernel_pmap.pmap_bits[PG_RW_IDX] | 5796 kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */ 5797 kernel_pmap.pmap_cache_bits[mode]; 5798 tmpsize -= PAGE_SIZE; 5799 tmpva += PAGE_SIZE; 5800 pa += PAGE_SIZE; 5801 } 5802 pmap_invalidate_range(&kernel_pmap, va, va + size); 5803 pmap_invalidate_cache_range(va, va + size); 5804 5805 return ((void *)(va + offset)); 5806 } 5807 5808 void 5809 pmap_unmapdev(vm_offset_t va, vm_size_t size) 5810 { 5811 vm_offset_t base, offset; 5812 5813 base = va & ~PAGE_MASK; 5814 offset = va & PAGE_MASK; 5815 size = roundup(offset + size, PAGE_SIZE); 5816 pmap_qremove(va, size >> PAGE_SHIFT); 5817 kmem_free(&kernel_map, base, size); 5818 } 5819 5820 /* 5821 * Sets the memory attribute for the specified page. 5822 */ 5823 void 5824 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5825 { 5826 5827 m->pat_mode = ma; 5828 5829 /* 5830 * If "m" is a normal page, update its direct mapping. This update 5831 * can be relied upon to perform any cache operations that are 5832 * required for data coherence. 5833 */ 5834 if ((m->flags & PG_FICTITIOUS) == 0) 5835 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode); 5836 } 5837 5838 /* 5839 * Change the PAT attribute on an existing kernel memory map. Caller 5840 * must ensure that the virtual memory in question is not accessed 5841 * during the adjustment. 5842 */ 5843 void 5844 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 5845 { 5846 pt_entry_t *pte; 5847 vm_offset_t base; 5848 int changed = 0; 5849 5850 if (va == 0) 5851 panic("pmap_change_attr: va is NULL"); 5852 base = trunc_page(va); 5853 5854 while (count) { 5855 pte = vtopte(va); 5856 *pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) | 5857 kernel_pmap.pmap_cache_bits[mode]; 5858 --count; 5859 va += PAGE_SIZE; 5860 } 5861 5862 changed = 1; /* XXX: not optimal */ 5863 5864 /* 5865 * Flush CPU caches if required to make sure any data isn't cached that 5866 * shouldn't be, etc. 
5867 */ 5868 if (changed) { 5869 pmap_invalidate_range(&kernel_pmap, base, va); 5870 pmap_invalidate_cache_range(base, va); 5871 } 5872 } 5873 5874 /* 5875 * perform the pmap work for mincore 5876 */ 5877 int 5878 pmap_mincore(pmap_t pmap, vm_offset_t addr) 5879 { 5880 pt_entry_t *ptep, pte; 5881 vm_page_t m; 5882 int val = 0; 5883 5884 ptep = pmap_pte(pmap, addr); 5885 5886 if (ptep && (pte = *ptep) != 0) { 5887 vm_offset_t pa; 5888 5889 val = MINCORE_INCORE; 5890 pa = pte & PG_FRAME; 5891 if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) 5892 m = PHYS_TO_VM_PAGE(pa); 5893 else 5894 m = NULL; 5895 5896 /* 5897 * Modified by us 5898 */ 5899 if (pte & pmap->pmap_bits[PG_M_IDX]) 5900 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 5901 5902 /* 5903 * Modified by someone 5904 */ 5905 else if (m && (m->dirty || pmap_is_modified(m))) 5906 val |= MINCORE_MODIFIED_OTHER; 5907 5908 /* 5909 * Referenced by us, or someone else. 5910 */ 5911 if (pte & pmap->pmap_bits[PG_A_IDX]) { 5912 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 5913 } else if (m && ((m->flags & PG_REFERENCED) || 5914 pmap_ts_referenced(m))) { 5915 val |= MINCORE_REFERENCED_OTHER; 5916 vm_page_flag_set(m, PG_REFERENCED); 5917 } 5918 } 5919 return val; 5920 } 5921 5922 /* 5923 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 5924 * vmspace will be ref'd and the old one will be deref'd. 5925 * 5926 * The vmspace for all lwps associated with the process will be adjusted 5927 * and cr3 will be reloaded if any lwp is the current lwp. 5928 * 5929 * The process must hold the vmspace->vm_map.token for oldvm and newvm 5930 */ 5931 void 5932 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 5933 { 5934 struct vmspace *oldvm; 5935 struct lwp *lp; 5936 5937 oldvm = p->p_vmspace; 5938 if (oldvm != newvm) { 5939 if (adjrefs) 5940 vmspace_ref(newvm); 5941 p->p_vmspace = newvm; 5942 KKASSERT(p->p_nthreads == 1); 5943 lp = RB_ROOT(&p->p_lwp_tree); 5944 pmap_setlwpvm(lp, newvm); 5945 if (adjrefs) 5946 vmspace_rel(oldvm); 5947 } 5948 } 5949 5950 /* 5951 * Set the vmspace for a LWP. The vmspace is almost universally set the 5952 * same as the process vmspace, but virtual kernels need to swap out contexts 5953 * on a per-lwp basis. 5954 * 5955 * Caller does not necessarily hold any vmspace tokens. Caller must control 5956 * the lwp (typically be in the context of the lwp). We use a critical 5957 * section to protect against statclock and hardclock (statistics collection). 
5958 */ 5959 void 5960 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 5961 { 5962 struct vmspace *oldvm; 5963 struct pmap *pmap; 5964 thread_t td; 5965 5966 oldvm = lp->lwp_vmspace; 5967 5968 if (oldvm != newvm) { 5969 crit_enter(); 5970 td = curthread; 5971 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 5972 lp->lwp_vmspace = newvm; 5973 if (td->td_lwp == lp) { 5974 pmap = vmspace_pmap(newvm); 5975 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 5976 if (pmap->pm_active_lock & CPULOCK_EXCL) 5977 pmap_interlock_wait(newvm); 5978 #if defined(SWTCH_OPTIM_STATS) 5979 tlb_flush_count++; 5980 #endif 5981 if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) { 5982 td->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 5983 if (meltdown_mitigation && pmap->pm_pmlpv_iso) { 5984 td->td_pcb->pcb_cr3_iso = 5985 vtophys(pmap->pm_pml4_iso); 5986 td->td_pcb->pcb_flags |= PCB_ISOMMU; 5987 } else { 5988 td->td_pcb->pcb_cr3_iso = 0; 5989 td->td_pcb->pcb_flags &= ~PCB_ISOMMU; 5990 } 5991 } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) { 5992 td->td_pcb->pcb_cr3 = KPML4phys; 5993 td->td_pcb->pcb_cr3_iso = 0; 5994 td->td_pcb->pcb_flags &= ~PCB_ISOMMU; 5995 } else { 5996 panic("pmap_setlwpvm: unknown pmap type\n"); 5997 } 5998 5999 /* 6000 * The MMU separation fields needs to be updated. 6001 * (it can't access the pcb directly from the 6002 * restricted user pmap). 6003 */ 6004 { 6005 struct trampframe *tramp; 6006 6007 tramp = &pscpu->trampoline; 6008 tramp->tr_pcb_cr3 = td->td_pcb->pcb_cr3; 6009 tramp->tr_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso; 6010 tramp->tr_pcb_flags = td->td_pcb->pcb_flags; 6011 tramp->tr_pcb_rsp = (register_t)td->td_pcb; 6012 /* tr_pcb_rsp doesn't change */ 6013 } 6014 6015 /* 6016 * In kernel-land we always use the normal PML4E 6017 * so the kernel is fully mapped and can also access 6018 * user memory. 6019 */ 6020 load_cr3(td->td_pcb->pcb_cr3); 6021 pmap = vmspace_pmap(oldvm); 6022 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 6023 mycpu->gd_cpuid); 6024 } 6025 crit_exit(); 6026 } 6027 } 6028 6029 /* 6030 * Called when switching to a locked pmap, used to interlock against pmaps 6031 * undergoing modifications to prevent us from activating the MMU for the 6032 * target pmap until all such modifications have completed. We have to do 6033 * this because the thread making the modifications has already set up its 6034 * SMP synchronization mask. 6035 * 6036 * This function cannot sleep! 6037 * 6038 * No requirements. 6039 */ 6040 void 6041 pmap_interlock_wait(struct vmspace *vm) 6042 { 6043 struct pmap *pmap = &vm->vm_pmap; 6044 6045 if (pmap->pm_active_lock & CPULOCK_EXCL) { 6046 crit_enter(); 6047 KKASSERT(curthread->td_critcount >= 2); 6048 DEBUG_PUSH_INFO("pmap_interlock_wait"); 6049 while (pmap->pm_active_lock & CPULOCK_EXCL) { 6050 cpu_ccfence(); 6051 lwkt_process_ipiq(); 6052 } 6053 DEBUG_POP_INFO(); 6054 crit_exit(); 6055 } 6056 } 6057 6058 vm_offset_t 6059 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 6060 { 6061 6062 if ((obj == NULL) || (size < NBPDR) || 6063 ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) { 6064 return addr; 6065 } 6066 6067 addr = roundup2(addr, NBPDR); 6068 return addr; 6069 } 6070 6071 /* 6072 * Used by kmalloc/kfree, page already exists at va 6073 */ 6074 vm_page_t 6075 pmap_kvtom(vm_offset_t va) 6076 { 6077 pt_entry_t *ptep = vtopte(va); 6078 6079 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); 6080 } 6081 6082 /* 6083 * Initialize machine-specific shared page directory support. 
vm_offset_t
pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
{
	if ((obj == NULL) || (size < NBPDR) ||
	    ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) {
		return addr;
	}

	addr = roundup2(addr, NBPDR);
	return addr;
}

/*
 * Used by kmalloc/kfree, page already exists at va
 */
vm_page_t
pmap_kvtom(vm_offset_t va)
{
	pt_entry_t *ptep = vtopte(va);

	return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
}

/*
 * Initialize machine-specific shared page directory support.  This
 * is executed when a VM object is created.
 */
void
pmap_object_init(vm_object_t object)
{
}

/*
 * Clean up machine-specific shared page directory support.  This
 * is executed when a VM object is destroyed.
 */
void
pmap_object_free(vm_object_t object)
{
}

/*
 * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related
 *			  VM page and issue a pginfo->callback.
 */
static
void
pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info,
		     vm_pindex_t *pte_placemark,
		     pv_entry_t pt_pv, vm_offset_t va,
		     pt_entry_t *ptep, void *arg)
{
	struct pmap_pgscan_info *pginfo = arg;
	vm_page_t m;
	pt_entry_t pte;

	pte = *ptep;
	cpu_ccfence();

	if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) {
		/*
		 * Try to busy the page while we hold the pte_placemark locked.
		 */
		m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME);
		if (vm_page_busy_try(m, TRUE) == 0) {
			if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) {
				/*
				 * The callback is issued with the pt_pv
				 * unlocked.
				 */
				pv_placemarker_wakeup(pmap, pte_placemark);
				if (pt_pv) {
					vm_page_wire_quick(pt_pv->pv_m);
					pv_unlock(pt_pv);
				}
				if (pginfo->callback(pginfo, va, m) < 0)
					info->stop = 1;
				if (pt_pv) {
					pv_lock(pt_pv);
					if (vm_page_unwire_quick(pt_pv->pv_m)) {
						panic("pmap_pgscan: bad wire_"
						      "count on pt_pv");
					}
				}
			} else {
				vm_page_wakeup(m);
				pv_placemarker_wakeup(pmap, pte_placemark);
			}
		} else {
			++pginfo->busycount;
			pv_placemarker_wakeup(pmap, pte_placemark);
		}
	} else {
		/*
		 * Shared page table or unmanaged page (sharept or !sharept)
		 */
		pv_placemarker_wakeup(pmap, pte_placemark);
	}
}

void
pmap_pgscan(struct pmap_pgscan_info *pginfo)
{
	struct pmap_scan_info info;

	pginfo->offset = pginfo->beg_addr;
	info.pmap = pginfo->pmap;
	info.sva = pginfo->beg_addr;
	info.eva = pginfo->end_addr;
	info.func = pmap_pgscan_callback;
	info.arg = pginfo;
	pmap_scan(&info, 0);
	if (info.stop == 0)
		pginfo->offset = pginfo->end_addr;
}
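
/*
 * Illustrative sketch only (not compiled): a minimal pmap_pgscan()
 * consumer.  The example_* names are hypothetical.  A callback returns
 * a negative value to stop the scan early, as tested in
 * pmap_pgscan_callback() above, and (by inference from the code above,
 * which only wakes the page itself on the re-check failure path) is
 * responsible for disposing of the busied page.
 */
#if 0
static int
example_pgscan_cb(struct pmap_pgscan_info *pginfo, vm_offset_t va, vm_page_t m)
{
	/* the page arrives busied; do the work, then release it */
	vm_page_wakeup(m);
	return 0;				/* continue the scan */
}

static void
example_scan(struct vmspace *vm, vm_offset_t sva, vm_offset_t eva)
{
	struct pmap_pgscan_info pginfo;

	bzero(&pginfo, sizeof(pginfo));
	pginfo.pmap = vmspace_pmap(vm);
	pginfo.beg_addr = sva;
	pginfo.end_addr = eva;
	pginfo.callback = example_pgscan_cb;
	pmap_pgscan(&pginfo);
	/* pginfo.offset reports how far the scan got before stopping */
}
#endif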
/*
 * Wait for a placemarker that we do not own to clear.  The placemarker
 * in question is not necessarily set to the pindex we want; we may have
 * to wait on the element because we want to reserve it ourselves.
 *
 * NOTE: PM_PLACEMARK_WAKEUP sets a bit which is already set in
 *	 PM_NOPLACEMARK, so it does not interfere with placemarks
 *	 which have already been woken up.
 *
 * NOTE: This routine is called without the pmap spin-lock and so can
 *	 race changes to *pmark.  Due to the sensitivity of the routine
 *	 to possible MULTIPLE interactions from other cpus, and the
 *	 overloading of the WAKEUP bit on PM_NOPLACEMARK, we have to
 *	 use a cmpset loop to avoid a race that might cause the WAKEUP
 *	 bit to be lost.
 *
 * Caller is expected to retry its operation upon return.
 */
static
void
pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark)
{
	vm_pindex_t mark;

	mark = *pmark;
	cpu_ccfence();
	while (mark != PM_NOPLACEMARK) {
		tsleep_interlock(pmark, 0);
		if (atomic_fcmpset_long(pmark, &mark,
					mark | PM_PLACEMARK_WAKEUP)) {
			tsleep(pmark, PINTERLOCKED, "pvplw", 0);
			break;
		}
	}
}

/*
 * Wakeup a placemarker that we own.  Replace the entry with
 * PM_NOPLACEMARK and issue a wakeup() if necessary.
 */
static
void
pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark)
{
	vm_pindex_t pindex;

	pindex = atomic_swap_long(pmark, PM_NOPLACEMARK);
	KKASSERT(pindex != PM_NOPLACEMARK);
	if (pindex & PM_PLACEMARK_WAKEUP)
		wakeup(pmark);
}
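
/*
 * Illustrative sketch only (not compiled): how a placemarker is typically
 * reserved and then released around a PTE operation elsewhere in this
 * file.  The pmap_placemarker_hash() helper named here is an assumption;
 * the real reservation loops are open-coded in the scan and enter paths.
 */
#if 0
	vm_pindex_t *pmark = pmap_placemarker_hash(pmap, pindex);

	for (;;) {
		vm_pindex_t omark = PM_NOPLACEMARK;

		if (atomic_fcmpset_long(pmark, &omark, pindex))
			break;				/* reserved */
		pv_placemarker_wait(pmap, pmark);	/* owned by someone else */
	}
	/* ... operate on the pte ... */
	pv_placemarker_wakeup(pmap, pmark);		/* release and wakeup */
#endif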