1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008, 2009 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * Copyright (c) 2011-2019 Matthew Dillon 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. All advertising materials mentioning features or use of this software 25 * must display the following acknowledgement: 26 * This product includes software developed by the University of 27 * California, Berkeley and its contributors. 28 * 4. Neither the name of the University nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 /* 45 * Manage physical address maps for x86-64 systems. 46 * 47 * Some notes: 48 * - The 'M'odified bit is only applicable to terminal PTEs. 49 * 50 * - The 'U'ser access bit can be set for higher-level PTEs as 51 * long as it isn't set for terminal PTEs for pages we don't 52 * want user access to. 
 */

#if 0 /* JG */
#include "opt_pmap.h"
#endif
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

#include <machine/cputypes.h>
#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine_base/apic/apicreg.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <ddb/ddb.h>

#define PMAP_KEEP_PDIRS

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

/*
 * pmap debugging will report who owns a pv lock when blocking.
 */
#ifdef PMAP_DEBUG

#define PMAP_DEBUG_DECL		,const char *func, int lineno
#define PMAP_DEBUG_ARGS		, __func__, __LINE__
#define PMAP_DEBUG_COPY		, func, lineno

#define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp	\
							PMAP_DEBUG_ARGS)
#define pv_lock(pv)			_pv_lock(pv			\
							PMAP_DEBUG_ARGS)
#define pv_hold_try(pv)			_pv_hold_try(pv			\
							PMAP_DEBUG_ARGS)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp	\
							PMAP_DEBUG_ARGS)

#define pv_free(pv, pvp)		_pv_free(pv, pvp PMAP_DEBUG_ARGS)

#else

#define PMAP_DEBUG_DECL
#define PMAP_DEBUG_ARGS
#define PMAP_DEBUG_COPY

#define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp)
#define pv_lock(pv)			_pv_lock(pv)
#define pv_hold_try(pv)			_pv_hold_try(pv)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp)
#define pv_free(pv, pvp)		_pv_free(pv, pvp)

#endif

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define pdir_pde(m, v)	(m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pmap, pte) \
	((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
#define pmap_pte_w(pmap, pte) \
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0)
#define pmap_pte_m(pmap, pte) \
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0)
#define pmap_pte_u(pmap, pte) \
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0)
#define pmap_pte_v(pmap, pte) \
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)

/*
 * Given a map and a machine independent protection code,
 * convert to a vax protection code.
 */
#define pte_prot(m, p)		\
	(m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
static uint64_t protection_codes[PROTECTION_CODES_SIZE];

/*
 * Backing scan macros.  Note that in the use case 'ipte' is only a tentative
 * value and must be validated by a pmap_inval_smp_cmpset*() or equivalent
 * function.
 *
 * NOTE: cpu_ccfence() is required to prevent excessive optimization
 *	 of the (ipte) variable.
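 *
 * Illustrative usage (a sketch only, not compiled here; 'npte' is assumed
 * to be declared by the caller):
 *
 *	PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
 *		npte = ipte & ~(pt_entry_t)ipmap->pmap_bits[PG_RW_IDX];
 *		if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, npte))
 *			PMAP_PAGE_BACKING_RETRY;
 *	} PMAP_PAGE_BACKING_DONE;
 *
 * The SCAN macro declares and fills in ipmap, iptep, ipte and iva for each
 * vm_map_backing entry covering the page, skipping entries whose pte no
 * longer maps m.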
170 */ 171 #define PMAP_PAGE_BACKING_SCAN(m, match_pmap, ipmap, iptep, ipte, iva) \ 172 if (m->object) { \ 173 vm_object_t iobj = m->object; \ 174 vm_map_backing_t iba, next_ba; \ 175 struct pmap *ipmap; \ 176 pt_entry_t ipte; \ 177 pt_entry_t *iptep; \ 178 vm_offset_t iva; \ 179 vm_pindex_t ipindex_start; \ 180 vm_pindex_t ipindex_end; \ 181 \ 182 lockmgr(&iobj->backing_lk, LK_SHARED); \ 183 next_ba = TAILQ_FIRST(&iobj->backing_list); \ 184 while ((iba = next_ba) != NULL) { \ 185 next_ba = TAILQ_NEXT(iba, entry); \ 186 ipmap = iba->pmap; \ 187 if (match_pmap && ipmap != match_pmap) \ 188 continue; \ 189 ipindex_start = iba->offset >> PAGE_SHIFT; \ 190 ipindex_end = ipindex_start + \ 191 ((iba->end - iba->start) >> PAGE_SHIFT); \ 192 if (m->pindex < ipindex_start || \ 193 m->pindex >= ipindex_end) { \ 194 continue; \ 195 } \ 196 iva = iba->start + \ 197 ((m->pindex - ipindex_start) << PAGE_SHIFT); \ 198 iptep = pmap_pte(ipmap, iva); \ 199 if (iptep == NULL) \ 200 continue; \ 201 ipte = *iptep; \ 202 cpu_ccfence(); \ 203 if (m->phys_addr != (ipte & PG_FRAME)) \ 204 continue; \ 205 206 #define PMAP_PAGE_BACKING_RETRY \ 207 { \ 208 next_ba = iba; \ 209 continue; \ 210 } \ 211 212 #define PMAP_PAGE_BACKING_DONE \ 213 } \ 214 lockmgr(&iobj->backing_lk, LK_RELEASE); \ 215 } \ 216 217 struct pmap kernel_pmap; 218 struct pmap iso_pmap; 219 220 vm_paddr_t avail_start; /* PA of first available physical page */ 221 vm_paddr_t avail_end; /* PA of last available physical page */ 222 vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ 223 vm_offset_t virtual2_end; 224 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */ 225 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 226 vm_offset_t KvaStart; /* VA start of KVA space */ 227 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */ 228 vm_offset_t KvaSize; /* max size of kernel virtual address space */ 229 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ 230 //static int pgeflag; /* PG_G or-in */ 231 uint64_t PatMsr; 232 233 static int ndmpdp; 234 static vm_paddr_t dmaplimit; 235 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 236 237 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE]; /* PAT -> PG_ bits */ 238 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/ /* PAT -> PG_ bits */ 239 240 static uint64_t KPTbase; 241 static uint64_t KPTphys; 242 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 243 static uint64_t KPDbase; /* phys addr of kernel level 2 @ KERNBASE */ 244 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 245 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 246 247 static uint64_t DMPDphys; /* phys addr of direct mapped level 2 */ 248 static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 249 250 /* 251 * Data for the pv entry allocation mechanism 252 */ 253 __read_mostly static vm_zone_t pvzone; 254 __read_mostly static int pmap_pagedaemon_waken = 0; 255 static struct vm_zone pvzone_store; 256 static struct pv_entry *pvinit; 257 258 /* 259 * All those kernel PT submaps that BSD is so fond of 260 */ 261 pt_entry_t *CMAP1 = NULL, *ptmmap; 262 caddr_t CADDR1 = NULL, ptvmmap = NULL; 263 static pt_entry_t *msgbufmap; 264 struct msgbuf *msgbufp=NULL; 265 266 /* 267 * PMAP default PG_* bits. 
 * Needed to be able to add
 * EPT/NPT pagetable pmap_bits for the VMM module
 */
uint64_t pmap_bits_default[] = {
		REGULAR_PMAP,			/* TYPE_IDX		0 */
		X86_PG_V,			/* PG_V_IDX		1 */
		X86_PG_RW,			/* PG_RW_IDX		2 */
		X86_PG_U,			/* PG_U_IDX		3 */
		X86_PG_A,			/* PG_A_IDX		4 */
		X86_PG_M,			/* PG_M_IDX		5 */
		X86_PG_PS,			/* PG_PS_IDX		6 */
		X86_PG_G,			/* PG_G_IDX		7 */
		X86_PG_AVAIL1,			/* PG_AVAIL1_IDX	8 */
		X86_PG_AVAIL2,			/* PG_AVAIL2_IDX	9 */
		X86_PG_AVAIL3,			/* PG_AVAIL3_IDX	10 */
		X86_PG_NC_PWT | X86_PG_NC_PCD,	/* PG_N_IDX		11 */
		X86_PG_NX,			/* PG_NX_IDX		12 */
};
/*
 * Crashdump maps.
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

static int pmap_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW,
    &pmap_debug, 0, "Debug pmap's");
#ifdef PMAP_DEBUG2
static int pmap_enter_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
    &pmap_enter_debug, 0, "Debug pmap_enter's");
#endif
static int pmap_yield_count = 64;
SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
    &pmap_yield_count, 0, "Yield during init_pt/release");
int pmap_fast_kernel_cpusync = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW,
    &pmap_fast_kernel_cpusync, 0, "Share page table pages when possible");
int pmap_dynamic_delete = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW,
    &pmap_dynamic_delete, 0, "Dynamically delete PT/PD/PDPs");
int pmap_lock_delay = 100;
SYSCTL_INT(_machdep, OID_AUTO, pmap_lock_delay, CTLFLAG_RW,
    &pmap_lock_delay, 0, "Spin loops");
static int meltdown_mitigation = -1;
TUNABLE_INT("machdep.meltdown_mitigation", &meltdown_mitigation);
SYSCTL_INT(_machdep, OID_AUTO, meltdown_mitigation, CTLFLAG_RW,
    &meltdown_mitigation, 0, "Userland pmap isolation");

static int pmap_nx_enable = -1;		/* -1 = auto */
/* needs manual TUNABLE in early probe, see below */
SYSCTL_INT(_machdep, OID_AUTO, pmap_nx_enable, CTLFLAG_RD,
    &pmap_nx_enable, 0,
    "no-execute support (0=disabled, 1=w/READ, 2=w/READ & WRITE)");

static int pmap_pv_debug = 50;
SYSCTL_INT(_machdep, OID_AUTO, pmap_pv_debug, CTLFLAG_RW,
    &pmap_pv_debug, 0, "");

static long vm_pmap_pv_entries;
SYSCTL_LONG(_vm, OID_AUTO, pmap_pv_entries, CTLFLAG_RD,
    &vm_pmap_pv_entries, 0, "");

/* Standard user access functions */
extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
			  size_t *lencopied);
extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
extern int std_fubyte (const uint8_t *base);
extern int std_subyte (uint8_t *base, uint8_t byte);
extern int32_t std_fuword32 (const uint32_t *base);
extern int64_t std_fuword64 (const uint64_t *base);
extern int std_suword64 (uint64_t *base, uint64_t word);
extern int std_suword32 (uint32_t *base, int word);
extern uint32_t std_swapu32 (volatile uint32_t *base, uint32_t v);
extern uint64_t std_swapu64 (volatile uint64_t *base, uint64_t v);
extern uint32_t std_fuwordadd32 (volatile uint32_t *base, uint32_t v);
extern uint64_t std_fuwordadd64 (volatile uint64_t *base, uint64_t v);

#if 0
static void pv_hold(pv_entry_t pv);
#endif
static int _pv_hold_try(pv_entry_t pv
				PMAP_DEBUG_DECL);
static void pv_drop(pv_entry_t pv);
static void
_pv_lock(pv_entry_t pv 353 PMAP_DEBUG_DECL); 354 static void pv_unlock(pv_entry_t pv); 355 static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew 356 PMAP_DEBUG_DECL); 357 static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp 358 PMAP_DEBUG_DECL); 359 static void _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL); 360 static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, 361 vm_pindex_t **pmarkp, int *errorp); 362 static void pv_put(pv_entry_t pv); 363 static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex); 364 static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, 365 pv_entry_t *pvpp); 366 static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, 367 pmap_inval_bulk_t *bulk, int destroy); 368 static vm_page_t pmap_remove_pv_page(pv_entry_t pv); 369 static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, 370 pmap_inval_bulk_t *bulk); 371 372 struct pmap_scan_info; 373 static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 374 vm_pindex_t *pte_placemark, pv_entry_t pt_pv, 375 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 376 static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 377 vm_pindex_t *pte_placemark, pv_entry_t pt_pv, 378 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 379 380 static void x86_64_protection_init (void); 381 static void create_pagetables(vm_paddr_t *firstaddr); 382 static void pmap_remove_all (vm_page_t m); 383 static boolean_t pmap_testbit (vm_page_t m, int bit); 384 385 static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va); 386 static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 387 388 static void pmap_pinit_defaults(struct pmap *pmap); 389 static void pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark); 390 static void pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark); 391 392 static int 393 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 394 { 395 if (pv1->pv_pindex < pv2->pv_pindex) 396 return(-1); 397 if (pv1->pv_pindex > pv2->pv_pindex) 398 return(1); 399 return(0); 400 } 401 402 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 403 pv_entry_compare, vm_pindex_t, pv_pindex); 404 405 /* 406 * Keep track of pages in the pmap. The procedure is handed 407 * the vm_page->md.pmap_count value prior to an increment or 408 * decrement. 409 * 410 * t_arm - Active real memory 411 * t_avm - Active virtual memory 412 * t_armshr - Active real memory that is also shared 413 * t_avmshr - Active virtual memory that is also shared 414 * 415 * NOTE: At the moment t_avm is effectively just the same as t_arm. 416 */ 417 static __inline 418 void 419 pmap_page_stats_adding(long prev_count) 420 { 421 globaldata_t gd = mycpu; 422 423 if (prev_count == 0) { 424 ++gd->gd_vmtotal.t_arm; 425 ++gd->gd_vmtotal.t_avm; 426 } else if (prev_count == 1) { 427 ++gd->gd_vmtotal.t_armshr; 428 ++gd->gd_vmtotal.t_avmshr; 429 } else { 430 ++gd->gd_vmtotal.t_avmshr; 431 } 432 } 433 434 static __inline 435 void 436 pmap_page_stats_deleting(long prev_count) 437 { 438 globaldata_t gd = mycpu; 439 440 if (prev_count == 1) { 441 --gd->gd_vmtotal.t_arm; 442 --gd->gd_vmtotal.t_avm; 443 } else if (prev_count == 2) { 444 --gd->gd_vmtotal.t_armshr; 445 --gd->gd_vmtotal.t_avmshr; 446 } else { 447 --gd->gd_vmtotal.t_avmshr; 448 } 449 } 450 451 /* 452 * Move the kernel virtual free pointer to the next 453 * 2MB. 
This is used to help improve performance 454 * by using a large (2MB) page for much of the kernel 455 * (.text, .data, .bss) 456 */ 457 static 458 vm_offset_t 459 pmap_kmem_choose(vm_offset_t addr) 460 { 461 vm_offset_t newaddr = addr; 462 463 newaddr = roundup2(addr, NBPDR); 464 return newaddr; 465 } 466 467 /* 468 * Returns the pindex of a page table entry (representing a terminal page). 469 * There are NUPTE_TOTAL page table entries possible (a huge number) 470 * 471 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out. 472 * We want to properly translate negative KVAs. 473 */ 474 static __inline 475 vm_pindex_t 476 pmap_pte_pindex(vm_offset_t va) 477 { 478 return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1)); 479 } 480 481 /* 482 * Returns the pindex of a page table. 483 */ 484 static __inline 485 vm_pindex_t 486 pmap_pt_pindex(vm_offset_t va) 487 { 488 return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1))); 489 } 490 491 /* 492 * Returns the pindex of a page directory. 493 */ 494 static __inline 495 vm_pindex_t 496 pmap_pd_pindex(vm_offset_t va) 497 { 498 return (NUPTE_TOTAL + NUPT_TOTAL + 499 ((va >> PDPSHIFT) & (NUPD_TOTAL - 1))); 500 } 501 502 static __inline 503 vm_pindex_t 504 pmap_pdp_pindex(vm_offset_t va) 505 { 506 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 507 ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1))); 508 } 509 510 static __inline 511 vm_pindex_t 512 pmap_pml4_pindex(void) 513 { 514 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 515 } 516 517 /* 518 * Return various clipped indexes for a given VA 519 * 520 * Returns the index of a pt in a page directory, representing a page 521 * table. 522 */ 523 static __inline 524 vm_pindex_t 525 pmap_pt_index(vm_offset_t va) 526 { 527 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 528 } 529 530 /* 531 * Returns the index of a pd in a page directory page, representing a page 532 * directory. 533 */ 534 static __inline 535 vm_pindex_t 536 pmap_pd_index(vm_offset_t va) 537 { 538 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 539 } 540 541 /* 542 * Returns the index of a pdp in the pml4 table, representing a page 543 * directory page. 544 */ 545 static __inline 546 vm_pindex_t 547 pmap_pdp_index(vm_offset_t va) 548 { 549 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 550 } 551 552 /* 553 * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is 554 * the PT layer. This will speed up core pmap operations considerably. 555 * We also cache the PTE layer to (hopefully) improve relative lookup 556 * speeds. 557 * 558 * NOTE: The pmap spinlock does not need to be held but the passed-in pv 559 * must be in a known associated state (typically by being locked when 560 * the pmap spinlock isn't held). We allow the race for that case. 561 * 562 * NOTE: pm_pvhint* is only accessed (read) with the spin-lock held, using 563 * cpu_ccfence() to prevent compiler optimizations from reloading the 564 * field. 
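 *
 * As a reference sketch of the pindex space used above (derived from the
 * pmap_*_pindex() functions): indices [0, NUPTE_TOTAL) name terminal PTEs,
 * the next NUPT_TOTAL name PT pages, then NUPD_TOTAL PD pages, then
 * NUPDP_TOTAL PDP pages, and the single final index names the PML4 page.
 * This is why tests such as (pindex < pmap_pt_pindex(0)) identify terminal
 * PTEs.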
565 */ 566 static __inline 567 void 568 pv_cache(pmap_t pmap, pv_entry_t pv, vm_pindex_t pindex) 569 { 570 if (pindex < pmap_pt_pindex(0)) { 571 ; 572 } else if (pindex < pmap_pd_pindex(0)) { 573 pmap->pm_pvhint_pt = pv; 574 } 575 } 576 577 /* 578 * Locate the requested pt_entry 579 */ 580 static __inline 581 pv_entry_t 582 pv_entry_lookup(pmap_t pmap, vm_pindex_t pindex) 583 { 584 pv_entry_t pv; 585 586 if (pindex < pmap_pt_pindex(0)) 587 return NULL; 588 #if 1 589 if (pindex < pmap_pd_pindex(0)) 590 pv = pmap->pm_pvhint_pt; 591 else 592 pv = NULL; 593 cpu_ccfence(); 594 if (pv == NULL || pv->pv_pmap != pmap) { 595 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 596 if (pv) 597 pv_cache(pmap, pv, pindex); 598 } else if (pv->pv_pindex != pindex) { 599 pv = pv_entry_rb_tree_RB_LOOKUP_REL(&pmap->pm_pvroot, 600 pindex, pv); 601 if (pv) 602 pv_cache(pmap, pv, pindex); 603 } 604 #else 605 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 606 #endif 607 return pv; 608 } 609 610 /* 611 * pmap_pte_quick: 612 * 613 * Super fast pmap_pte routine best used when scanning the pv lists. 614 * This eliminates many course-grained invltlb calls. Note that many of 615 * the pv list scans are across different pmaps and it is very wasteful 616 * to do an entire invltlb when checking a single mapping. 617 */ 618 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va); 619 620 static 621 pt_entry_t * 622 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 623 { 624 return pmap_pte(pmap, va); 625 } 626 627 /* 628 * The placemarker hash must be broken up into four zones so lock 629 * ordering semantics continue to work (e.g. pte, pt, pd, then pdp). 630 * 631 * Placemarkers are used to 'lock' page table indices that do not have 632 * a pv_entry. This allows the pmap to support managed and unmanaged 633 * pages and shared page tables. 634 */ 635 #define PM_PLACE_BASE (PM_PLACEMARKS >> 2) 636 637 static __inline 638 vm_pindex_t * 639 pmap_placemarker_hash(pmap_t pmap, vm_pindex_t pindex) 640 { 641 int hi; 642 643 if (pindex < pmap_pt_pindex(0)) /* zone 0 - PTE */ 644 hi = 0; 645 else if (pindex < pmap_pd_pindex(0)) /* zone 1 - PT */ 646 hi = PM_PLACE_BASE; 647 else if (pindex < pmap_pdp_pindex(0)) /* zone 2 - PD */ 648 hi = PM_PLACE_BASE << 1; 649 else /* zone 3 - PDP (and PML4E) */ 650 hi = PM_PLACE_BASE | (PM_PLACE_BASE << 1); 651 hi += pindex & (PM_PLACE_BASE - 1); 652 653 return (&pmap->pm_placemarks[hi]); 654 } 655 656 657 /* 658 * Generic procedure to index a pte from a pt, pd, or pdp. 659 * 660 * NOTE: Normally passed pindex as pmap_xx_index(). pmap_xx_pindex() is NOT 661 * a page table page index but is instead of PV lookup index. 662 */ 663 static 664 void * 665 pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex) 666 { 667 pt_entry_t *pte; 668 669 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m)); 670 return(&pte[pindex]); 671 } 672 673 /* 674 * Return pointer to PDP slot in the PML4 675 */ 676 static __inline 677 pml4_entry_t * 678 pmap_pdp(pmap_t pmap, vm_offset_t va) 679 { 680 return (&pmap->pm_pml4[pmap_pdp_index(va)]); 681 } 682 683 /* 684 * Return pointer to PD slot in the PDP given a pointer to the PDP 685 */ 686 static __inline 687 pdp_entry_t * 688 pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va) 689 { 690 pdp_entry_t *pd; 691 692 pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME); 693 return (&pd[pmap_pd_index(va)]); 694 } 695 696 /* 697 * Return pointer to PD slot in the PDP. 
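 *
 * (The typical top-down walk is pmap_pdp() -> pmap_pd() -> pmap_pt() ->
 * pmap_pte(); pmap_pd(), pmap_pt() and pmap_pte() each return NULL when
 * the next-lower table is not present, so callers can bail out early.)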
698 */ 699 static __inline 700 pdp_entry_t * 701 pmap_pd(pmap_t pmap, vm_offset_t va) 702 { 703 pml4_entry_t *pdp; 704 705 pdp = pmap_pdp(pmap, va); 706 if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0) 707 return NULL; 708 return (pmap_pdp_to_pd(*pdp, va)); 709 } 710 711 /* 712 * Return pointer to PT slot in the PD given a pointer to the PD 713 */ 714 static __inline 715 pd_entry_t * 716 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va) 717 { 718 pd_entry_t *pt; 719 720 pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME); 721 return (&pt[pmap_pt_index(va)]); 722 } 723 724 /* 725 * Return pointer to PT slot in the PD 726 * 727 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs, 728 * so we cannot lookup the PD via the PDP. Instead we 729 * must look it up via the pmap. 730 */ 731 static __inline 732 pd_entry_t * 733 pmap_pt(pmap_t pmap, vm_offset_t va) 734 { 735 pdp_entry_t *pd; 736 pv_entry_t pv; 737 vm_pindex_t pd_pindex; 738 vm_paddr_t phys; 739 740 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 741 pd_pindex = pmap_pd_pindex(va); 742 spin_lock_shared(&pmap->pm_spin); 743 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex); 744 if (pv == NULL || pv->pv_m == NULL) { 745 spin_unlock_shared(&pmap->pm_spin); 746 return NULL; 747 } 748 phys = VM_PAGE_TO_PHYS(pv->pv_m); 749 spin_unlock_shared(&pmap->pm_spin); 750 return (pmap_pd_to_pt(phys, va)); 751 } else { 752 pd = pmap_pd(pmap, va); 753 if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0) 754 return NULL; 755 return (pmap_pd_to_pt(*pd, va)); 756 } 757 } 758 759 /* 760 * Return pointer to PTE slot in the PT given a pointer to the PT 761 */ 762 static __inline 763 pt_entry_t * 764 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va) 765 { 766 pt_entry_t *pte; 767 768 pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME); 769 return (&pte[pmap_pte_index(va)]); 770 } 771 772 /* 773 * Return pointer to PTE slot in the PT 774 */ 775 static __inline 776 pt_entry_t * 777 pmap_pte(pmap_t pmap, vm_offset_t va) 778 { 779 pd_entry_t *pt; 780 781 pt = pmap_pt(pmap, va); 782 if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0) 783 return NULL; 784 if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0) 785 return ((pt_entry_t *)pt); 786 return (pmap_pt_to_pte(*pt, va)); 787 } 788 789 /* 790 * Return address of PT slot in PD (KVM only) 791 * 792 * Cannot be used for user page tables because it might interfere with 793 * the shared page-table-page optimization (pmap_mmu_optimize). 794 */ 795 static __inline 796 pd_entry_t * 797 vtopt(vm_offset_t va) 798 { 799 uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 800 NPML4EPGSHIFT)) - 1); 801 802 return (PDmap + ((va >> PDRSHIFT) & mask)); 803 } 804 805 /* 806 * KVM - return address of PTE slot in PT 807 */ 808 static __inline 809 pt_entry_t * 810 vtopte(vm_offset_t va) 811 { 812 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 813 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 814 815 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 816 } 817 818 /* 819 * Returns the physical address translation from va for a user address. 820 * (vm_paddr_t)-1 is returned on failure. 
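 *
 * Illustrative use (sketch):
 *	pa = uservtophys(va);
 *	if (pa == (vm_paddr_t)-1)
 *		return (EFAULT);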
 */
vm_paddr_t
uservtophys(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
	vm_paddr_t pa;
	pt_entry_t pte;
	pmap_t pmap;

	pmap = vmspace_pmap(mycpu->gd_curthread->td_lwp->lwp_vmspace);
	pa = (vm_paddr_t)-1;
	if (va < VM_MAX_USER_ADDRESS) {
		pte = kreadmem64(PTmap + ((va >> PAGE_SHIFT) & mask));
		if (pte & pmap->pmap_bits[PG_V_IDX])
			pa = (pte & PG_FRAME) | (va & PAGE_MASK);
	}
	return pa;
}

static uint64_t
allocpages(vm_paddr_t *firstaddr, long n)
{
	uint64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

static
void
create_pagetables(vm_paddr_t *firstaddr)
{
	long i;		/* must be 64 bits */
	long nkpt_base;
	long nkpt_phys;
	long nkpd_phys;
	int j;

	/*
	 * We are running (mostly) V=P at this point
	 *
	 * Calculate how many 1GB PD entries in our PDP pages are needed
	 * for the DMAP.  This is only allocated if the system does not
	 * support 1GB pages.  Otherwise ndmpdp is simply a count of
	 * the number of 1G terminal entries in our PDP pages that are
	 * needed.
	 *
	 * NOTE: Maxmem is in pages
	 */
	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	KKASSERT(ndmpdp <= NDMPML4E * NPML4EPG);

	/*
	 * Starting at KERNBASE - map all 2G worth of page table pages.
	 * KERNBASE is offset -2G from the end of kvm.  This will accommodate
	 * all KVM allocations above KERNBASE, including the SYSMAPs below.
	 *
	 * We do this by allocating 2*512 PT pages.  Each PT page can map
	 * 2MB, for 2GB total.
	 */
	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */

	/*
	 * Starting at the beginning of kvm (VM_MIN_KERNEL_ADDRESS),
	 * calculate how many page table pages we need to preallocate
	 * for early vm_map allocations.
	 *
	 * A few extra won't hurt, they will get used up in the running
	 * system.
	 *
	 * vm_page array
	 * initial pventry's
	 */
	nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
	nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
	nkpt_phys += 128;	/* a few extra */

	/*
	 * The highest value nkpd_phys can be set to is
	 * NKPDPE - (NPDPEPG - KPDPI) (i.e. NKPDPE - 2).
	 *
	 * Doing so would cause all PD pages to be pre-populated for
	 * a maximal KVM space (approximately 16*512 pages, or 32MB).
	 * We can save memory by not doing this.
	 */
	nkpd_phys = (nkpt_phys + NPDPEPG - 1) / NPDPEPG;

	/*
	 * Allocate pages
	 *
	 * Normally NKPML4E=1-16 (1-16 kernel PDP page)
	 * Normally NKPDPE= NKPML4E*512-1 (511 min kernel PD pages)
	 *
	 * Only allocate enough PD pages
	 * NOTE: We allocate all kernel PD pages up-front, typically
	 *	 ~511G of KVM, requiring 511 PD pages.
	 */
	KPTbase = allocpages(firstaddr, nkpt_base);	/* KERNBASE to end */
	KPTphys = allocpages(firstaddr, nkpt_phys);	/* KVA start */
	KPML4phys = allocpages(firstaddr, 1);		/* recursive PML4 map */
	KPDPphys = allocpages(firstaddr, NKPML4E);	/* kernel PDP pages */
	KPDphys = allocpages(firstaddr, nkpd_phys);	/* kernel PD pages */

	/*
	 * Alloc PD pages for the area starting at KERNBASE.
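	 * (NPDPEPG - KPDPI pages, typically 2, i.e. one PD page per 1GB of
	 * the 2GB KERNBASE area sized by nkpt_base above.)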
930 */ 931 KPDbase = allocpages(firstaddr, NPDPEPG - KPDPI); 932 933 /* 934 * Stuff for our DMAP 935 */ 936 DMPDPphys = allocpages(firstaddr, NDMPML4E); 937 if ((amd_feature & AMDID_PAGE1GB) == 0) 938 DMPDphys = allocpages(firstaddr, ndmpdp); 939 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 940 941 /* 942 * Fill in the underlying page table pages for the area around 943 * KERNBASE. This remaps low physical memory to KERNBASE. 944 * 945 * Read-only from zero to physfree 946 * XXX not fully used, underneath 2M pages 947 */ 948 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 949 ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT; 950 ((pt_entry_t *)KPTbase)[i] |= 951 pmap_bits_default[PG_RW_IDX] | 952 pmap_bits_default[PG_V_IDX] | 953 pmap_bits_default[PG_G_IDX]; 954 } 955 956 /* 957 * Now map the initial kernel page tables. One block of page 958 * tables is placed at the beginning of kernel virtual memory, 959 * and another block is placed at KERNBASE to map the kernel binary, 960 * data, bss, and initial pre-allocations. 961 */ 962 for (i = 0; i < nkpt_base; i++) { 963 ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT); 964 ((pd_entry_t *)KPDbase)[i] |= 965 pmap_bits_default[PG_RW_IDX] | 966 pmap_bits_default[PG_V_IDX]; 967 } 968 for (i = 0; i < nkpt_phys; i++) { 969 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 970 ((pd_entry_t *)KPDphys)[i] |= 971 pmap_bits_default[PG_RW_IDX] | 972 pmap_bits_default[PG_V_IDX]; 973 } 974 975 /* 976 * Map from zero to end of allocations using 2M pages as an 977 * optimization. This will bypass some of the KPTBase pages 978 * above in the KERNBASE area. 979 */ 980 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 981 ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT; 982 ((pd_entry_t *)KPDbase)[i] |= 983 pmap_bits_default[PG_RW_IDX] | 984 pmap_bits_default[PG_V_IDX] | 985 pmap_bits_default[PG_PS_IDX] | 986 pmap_bits_default[PG_G_IDX]; 987 } 988 989 /* 990 * Load PD addresses into the PDP pages for primary KVA space to 991 * cover existing page tables. PD's for KERNBASE are handled in 992 * the next loop. 993 * 994 * expected to pre-populate all of its PDs. See NKPDPE in vmparam.h. 995 */ 996 for (i = 0; i < nkpd_phys; i++) { 997 ((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] = 998 KPDphys + (i << PAGE_SHIFT); 999 ((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] |= 1000 pmap_bits_default[PG_RW_IDX] | 1001 pmap_bits_default[PG_V_IDX] | 1002 pmap_bits_default[PG_A_IDX]; 1003 } 1004 1005 /* 1006 * Load PDs for KERNBASE to the end 1007 */ 1008 i = (NKPML4E - 1) * NPDPEPG + KPDPI; 1009 for (j = 0; j < NPDPEPG - KPDPI; ++j) { 1010 ((pdp_entry_t *)KPDPphys)[i + j] = 1011 KPDbase + (j << PAGE_SHIFT); 1012 ((pdp_entry_t *)KPDPphys)[i + j] |= 1013 pmap_bits_default[PG_RW_IDX] | 1014 pmap_bits_default[PG_V_IDX] | 1015 pmap_bits_default[PG_A_IDX]; 1016 } 1017 1018 /* 1019 * Now set up the direct map space using either 2MB or 1GB pages 1020 * Preset PG_M and PG_A because demotion expects it. 
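	 * (Sizing sketch: ndmpdp, computed earlier, is the number of 1GB
	 * slots with a 4GB minimum; e.g. 16GB of RAM gives ndmpdp = 16,
	 * i.e. 16 1GB PDP entries or 16*512 2MB PD entries.)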
1021 * 1022 * When filling in entries in the PD pages make sure any excess 1023 * entries are set to zero as we allocated enough PD pages 1024 */ 1025 if ((amd_feature & AMDID_PAGE1GB) == 0) { 1026 /* 1027 * Use 2MB pages 1028 */ 1029 for (i = 0; i < NPDEPG * ndmpdp; i++) { 1030 ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT; 1031 ((pd_entry_t *)DMPDphys)[i] |= 1032 pmap_bits_default[PG_RW_IDX] | 1033 pmap_bits_default[PG_V_IDX] | 1034 pmap_bits_default[PG_PS_IDX] | 1035 pmap_bits_default[PG_G_IDX] | 1036 pmap_bits_default[PG_M_IDX] | 1037 pmap_bits_default[PG_A_IDX]; 1038 } 1039 1040 /* 1041 * And the direct map space's PDP 1042 */ 1043 for (i = 0; i < ndmpdp; i++) { 1044 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 1045 (i << PAGE_SHIFT); 1046 ((pdp_entry_t *)DMPDPphys)[i] |= 1047 pmap_bits_default[PG_RW_IDX] | 1048 pmap_bits_default[PG_V_IDX]; 1049 } 1050 } else { 1051 /* 1052 * 1GB pages 1053 */ 1054 for (i = 0; i < ndmpdp; i++) { 1055 ((pdp_entry_t *)DMPDPphys)[i] = 1056 (vm_paddr_t)i << PDPSHIFT; 1057 ((pdp_entry_t *)DMPDPphys)[i] |= 1058 pmap_bits_default[PG_RW_IDX] | 1059 pmap_bits_default[PG_V_IDX] | 1060 pmap_bits_default[PG_PS_IDX] | 1061 pmap_bits_default[PG_G_IDX] | 1062 pmap_bits_default[PG_M_IDX] | 1063 pmap_bits_default[PG_A_IDX]; 1064 } 1065 } 1066 1067 /* And recursively map PML4 to itself in order to get PTmap */ 1068 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 1069 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= 1070 pmap_bits_default[PG_RW_IDX] | 1071 pmap_bits_default[PG_V_IDX] | 1072 pmap_bits_default[PG_A_IDX]; 1073 1074 /* 1075 * Connect the Direct Map slots up to the PML4 1076 */ 1077 for (j = 0; j < NDMPML4E; ++j) { 1078 ((pdp_entry_t *)KPML4phys)[DMPML4I + j] = 1079 (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 1080 pmap_bits_default[PG_RW_IDX] | 1081 pmap_bits_default[PG_V_IDX] | 1082 pmap_bits_default[PG_A_IDX]; 1083 } 1084 1085 /* 1086 * Connect the KVA slot up to the PML4 1087 */ 1088 for (j = 0; j < NKPML4E; ++j) { 1089 ((pdp_entry_t *)KPML4phys)[KPML4I + j] = 1090 KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT); 1091 ((pdp_entry_t *)KPML4phys)[KPML4I + j] |= 1092 pmap_bits_default[PG_RW_IDX] | 1093 pmap_bits_default[PG_V_IDX] | 1094 pmap_bits_default[PG_A_IDX]; 1095 } 1096 cpu_mfence(); 1097 cpu_invltlb(); 1098 } 1099 1100 /* 1101 * Bootstrap the system enough to run with virtual memory. 1102 * 1103 * On x86_64 this is called after mapping has already been enabled 1104 * and just syncs the pmap module with what has already been done. 1105 * [We can't call it easily with mapping off since the kernel is not 1106 * mapped with PA == VA, hence we would have to relocate every address 1107 * from the linked base (virtual) address "KERNBASE" to the actual 1108 * (physical) address starting relative to 0] 1109 */ 1110 void 1111 pmap_bootstrap(vm_paddr_t *firstaddr) 1112 { 1113 vm_offset_t va; 1114 pt_entry_t *pte; 1115 int i; 1116 1117 KvaStart = VM_MIN_KERNEL_ADDRESS; 1118 KvaEnd = VM_MAX_KERNEL_ADDRESS; 1119 KvaSize = KvaEnd - KvaStart; 1120 1121 avail_start = *firstaddr; 1122 1123 /* 1124 * Create an initial set of page tables to run the kernel in. 
1125 */ 1126 create_pagetables(firstaddr); 1127 1128 virtual2_start = KvaStart; 1129 virtual2_end = PTOV_OFFSET; 1130 1131 virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr; 1132 virtual_start = pmap_kmem_choose(virtual_start); 1133 1134 virtual_end = VM_MAX_KERNEL_ADDRESS; 1135 1136 /* XXX do %cr0 as well */ 1137 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 1138 load_cr3(KPML4phys); 1139 1140 /* 1141 * Initialize protection array. 1142 */ 1143 x86_64_protection_init(); 1144 1145 /* 1146 * The kernel's pmap is statically allocated so we don't have to use 1147 * pmap_create, which is unlikely to work correctly at this part of 1148 * the boot sequence (XXX and which no longer exists). 1149 */ 1150 kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); 1151 kernel_pmap.pm_count = 1; 1152 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 1153 RB_INIT(&kernel_pmap.pm_pvroot); 1154 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 1155 for (i = 0; i < PM_PLACEMARKS; ++i) 1156 kernel_pmap.pm_placemarks[i] = PM_NOPLACEMARK; 1157 1158 /* 1159 * Reserve some special page table entries/VA space for temporary 1160 * mapping of pages. 1161 */ 1162 #define SYSMAP(c, p, v, n) \ 1163 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 1164 1165 va = virtual_start; 1166 pte = vtopte(va); 1167 1168 /* 1169 * CMAP1/CMAP2 are used for zeroing and copying pages. 1170 */ 1171 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 1172 1173 /* 1174 * Crashdump maps. 1175 */ 1176 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 1177 1178 /* 1179 * ptvmmap is used for reading arbitrary physical pages via 1180 * /dev/mem. 1181 */ 1182 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 1183 1184 /* 1185 * msgbufp is used to map the system message buffer. 1186 * XXX msgbufmap is not used. 1187 */ 1188 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1189 atop(round_page(MSGBUF_SIZE))) 1190 1191 virtual_start = va; 1192 virtual_start = pmap_kmem_choose(virtual_start); 1193 1194 *CMAP1 = 0; 1195 1196 /* 1197 * PG_G is terribly broken on SMP because we IPI invltlb's in some 1198 * cases rather then invl1pg. Actually, I don't even know why it 1199 * works under UP because self-referential page table mappings 1200 */ 1201 // pgeflag = 0; 1202 1203 cpu_invltlb(); 1204 1205 /* Initialize the PAT MSR */ 1206 pmap_init_pat(); 1207 pmap_pinit_defaults(&kernel_pmap); 1208 1209 TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync", 1210 &pmap_fast_kernel_cpusync); 1211 1212 } 1213 1214 /* 1215 * Setup the PAT MSR. 1216 */ 1217 void 1218 pmap_init_pat(void) 1219 { 1220 uint64_t pat_msr; 1221 u_long cr0, cr4; 1222 1223 /* 1224 * Default values mapping PATi,PCD,PWT bits at system reset. 1225 * The default values effectively ignore the PATi bit by 1226 * repeating the encodings for 0-3 in 4-7, and map the PCD 1227 * and PWT bit combinations to the expected PAT types. 
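	 *
	 * (For 4KB PTEs the 3-bit PAT index is {PAT, PCD, PWT} taken from
	 * PTE bits 7, 4 and 3, which is why the pat_pte_index[] entries
	 * below are built from X86_PG_PTE_PAT, X86_PG_NC_PCD and
	 * X86_PG_NC_PWT.)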
1228 */ 1229 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | /* 000 */ 1230 PAT_VALUE(1, PAT_WRITE_THROUGH) | /* 001 */ 1231 PAT_VALUE(2, PAT_UNCACHED) | /* 010 */ 1232 PAT_VALUE(3, PAT_UNCACHEABLE) | /* 011 */ 1233 PAT_VALUE(4, PAT_WRITE_BACK) | /* 100 */ 1234 PAT_VALUE(5, PAT_WRITE_THROUGH) | /* 101 */ 1235 PAT_VALUE(6, PAT_UNCACHED) | /* 110 */ 1236 PAT_VALUE(7, PAT_UNCACHEABLE); /* 111 */ 1237 pat_pte_index[PAT_WRITE_BACK] = 0; 1238 pat_pte_index[PAT_WRITE_THROUGH]= 0 | X86_PG_NC_PWT; 1239 pat_pte_index[PAT_UNCACHED] = X86_PG_NC_PCD; 1240 pat_pte_index[PAT_UNCACHEABLE] = X86_PG_NC_PCD | X86_PG_NC_PWT; 1241 pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE]; 1242 pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE]; 1243 1244 if (cpu_feature & CPUID_PAT) { 1245 /* 1246 * If we support the PAT then set-up entries for 1247 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns 1248 * 5 and 6. 1249 */ 1250 pat_msr = (pat_msr & ~PAT_MASK(5)) | 1251 PAT_VALUE(5, PAT_WRITE_PROTECTED); 1252 pat_msr = (pat_msr & ~PAT_MASK(6)) | 1253 PAT_VALUE(6, PAT_WRITE_COMBINING); 1254 pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | X86_PG_NC_PWT; 1255 pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PCD; 1256 1257 /* 1258 * Then enable the PAT 1259 */ 1260 1261 /* Disable PGE. */ 1262 cr4 = rcr4(); 1263 load_cr4(cr4 & ~CR4_PGE); 1264 1265 /* Disable caches (CD = 1, NW = 0). */ 1266 cr0 = rcr0(); 1267 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 1268 1269 /* Flushes caches and TLBs. */ 1270 wbinvd(); 1271 cpu_invltlb(); 1272 1273 /* Update PAT and index table. */ 1274 wrmsr(MSR_PAT, pat_msr); 1275 1276 /* Flush caches and TLBs again. */ 1277 wbinvd(); 1278 cpu_invltlb(); 1279 1280 /* Restore caches and PGE. */ 1281 load_cr0(cr0); 1282 load_cr4(cr4); 1283 PatMsr = pat_msr; 1284 } 1285 } 1286 1287 /* 1288 * Set 4mb pdir for mp startup 1289 */ 1290 void 1291 pmap_set_opt(void) 1292 { 1293 if (cpu_feature & CPUID_PSE) { 1294 load_cr4(rcr4() | CR4_PSE); 1295 if (mycpu->gd_cpuid == 0) /* only on BSP */ 1296 cpu_invltlb(); 1297 } 1298 1299 /* 1300 * Check for SMAP support and enable if available. Must be done 1301 * after cr3 is loaded, and on all cores. 1302 */ 1303 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) { 1304 load_cr4(rcr4() | CR4_SMAP); 1305 } 1306 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) { 1307 load_cr4(rcr4() | CR4_SMEP); 1308 } 1309 } 1310 1311 /* 1312 * Early initialization of the pmap module. 1313 * 1314 * Called by vm_init, to initialize any structures that the pmap 1315 * system needs to map virtual memory. pmap_init has been enhanced to 1316 * support in a fairly consistant way, discontiguous physical memory. 1317 */ 1318 void 1319 pmap_init(void) 1320 { 1321 vm_pindex_t initial_pvs; 1322 vm_pindex_t i; 1323 1324 /* 1325 * Allocate memory for random pmap data structures. Includes the 1326 * pv_head_table. 1327 */ 1328 for (i = 0; i < vm_page_array_size; i++) { 1329 vm_page_t m; 1330 1331 m = &vm_page_array[i]; 1332 m->md.pmap_count = 0; 1333 m->md.writeable_count = 0; 1334 } 1335 1336 /* 1337 * init the pv free list 1338 */ 1339 initial_pvs = vm_page_array_size; 1340 if (initial_pvs < MINPV) 1341 initial_pvs = MINPV; 1342 pvzone = &pvzone_store; 1343 pvinit = (void *)kmem_alloc(&kernel_map, 1344 initial_pvs * sizeof (struct pv_entry), 1345 VM_SUBSYS_PVENTRY); 1346 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), 1347 pvinit, initial_pvs); 1348 1349 /* 1350 * Now it is safe to enable pv_table recording. 
	 */
	pmap_initialized = TRUE;
}

/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 *
 * Also create the kernel page table template for isolated user
 * pmaps.
 */
static void pmap_init_iso_range(vm_offset_t base, size_t bytes);
static void pmap_init2_iso_pmap(void);
#if 0
static void dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base);
#endif

void
pmap_init2(void)
{
	vm_pindex_t entry_max;

	/*
	 * We can significantly reduce pv_entry_max from historical
	 * levels because pv_entry's are no longer used for PTEs at the
	 * leaves.  This prevents excessive pcpu caching on many-core
	 * boxes (even with the further '/ 16' done in zinitna()).
	 *
	 * Remember, however, that processes can share physical pages
	 * with each process still needing the pdp/pd/pt infrastructure
	 * (which still use pv_entry's).  And don't just assume that
	 * every PT will be completely filled up.  So don't make it
	 * too small.
	 */
	entry_max = maxproc * 32 + vm_page_array_size / 16;
	TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &entry_max);
	vm_pmap_pv_entries = entry_max;

	/*
	 * Subtract out pages already installed in the zone (hack)
	 */
	if (entry_max <= MINPV)
		entry_max = MINPV;

	zinitna(pvzone, NULL, 0, entry_max, ZONE_INTERRUPT);

	/*
	 * Enable dynamic deletion of empty higher-level page table pages
	 * by default only if system memory is < 8GB (use 7GB for slop).
	 * This can save a little memory, but imposes significant
	 * performance overhead for things like bulk builds, and for programs
	 * which do a lot of memory mapping and memory unmapping.
	 */
#if 0
	if (pmap_dynamic_delete < 0) {
		if (vmstats.v_page_count < 7LL * 1024 * 1024 * 1024 / PAGE_SIZE)
			pmap_dynamic_delete = 1;
		else
			pmap_dynamic_delete = 0;
	}
#endif
	/*
	 * Disable so vm_map_backing iterations do not race
	 */
	pmap_dynamic_delete = 0;

	/*
	 * Automatic detection of Intel meltdown bug requiring user/kernel
	 * mmap isolation.
	 *
	 * Currently there are so many Intel cpu's impacted that it's better
	 * to whitelist future Intel CPUs.  Most? AMD cpus are not impacted
	 * so the default is off for AMD.
	 */
	if (meltdown_mitigation < 0) {
		if (cpu_vendor_id == CPU_VENDOR_INTEL)
			meltdown_mitigation = 1;
		else
			meltdown_mitigation = 0;
	}
	if (meltdown_mitigation) {
		kprintf("machdep.meltdown_mitigation enabled to "
			"protect against (mostly Intel) meltdown bug\n");
		kprintf("system call performance will be impacted\n");
	}

	pmap_init2_iso_pmap();
}

/*
 * Create the isolation pmap template.  Once created, the template
 * is static and its PML4e entries are used to populate the
 * kernel portion of any isolated user pmaps.
 *
 * Our isolation pmap must contain:
 *	(1) trampoline area for all cpus
 *	(2) common_tss area for all cpus (it's part of the trampoline area now)
 *	(3) IDT for all cpus
 *	(4) GDT for all cpus
 */
static void
pmap_init2_iso_pmap(void)
{
	int n;

	if (bootverbose)
		kprintf("Initialize isolation pmap\n");

	/*
	 * Try to use our normal API calls to make this easier.
We have 1462 * to scrap the shadowed kernel PDPs pmap_pinit() creates for our 1463 * iso_pmap. 1464 */ 1465 pmap_pinit(&iso_pmap); 1466 bzero(iso_pmap.pm_pml4, PAGE_SIZE); 1467 1468 /* 1469 * Install areas needed by the cpu and trampoline. 1470 */ 1471 for (n = 0; n < ncpus; ++n) { 1472 struct privatespace *ps; 1473 1474 ps = CPU_prvspace[n]; 1475 pmap_init_iso_range((vm_offset_t)&ps->trampoline, 1476 sizeof(ps->trampoline)); 1477 pmap_init_iso_range((vm_offset_t)&ps->dblstack, 1478 sizeof(ps->dblstack)); 1479 pmap_init_iso_range((vm_offset_t)&ps->dbgstack, 1480 sizeof(ps->dbgstack)); 1481 pmap_init_iso_range((vm_offset_t)&ps->common_tss, 1482 sizeof(ps->common_tss)); 1483 pmap_init_iso_range(r_idt_arr[n].rd_base, 1484 r_idt_arr[n].rd_limit + 1); 1485 } 1486 pmap_init_iso_range((register_t)gdt, sizeof(gdt)); 1487 pmap_init_iso_range((vm_offset_t)(int *)btext, 1488 (vm_offset_t)(int *)etext - 1489 (vm_offset_t)(int *)btext); 1490 1491 #if 0 1492 kprintf("Dump iso_pmap:\n"); 1493 dump_pmap(&iso_pmap, vtophys(iso_pmap.pm_pml4), 0, 0); 1494 kprintf("\nDump kernel_pmap:\n"); 1495 dump_pmap(&kernel_pmap, vtophys(kernel_pmap.pm_pml4), 0, 0); 1496 #endif 1497 } 1498 1499 /* 1500 * This adds a kernel virtual address range to the isolation pmap. 1501 */ 1502 static void 1503 pmap_init_iso_range(vm_offset_t base, size_t bytes) 1504 { 1505 pv_entry_t pv; 1506 pv_entry_t pvp; 1507 pt_entry_t *ptep; 1508 pt_entry_t pte; 1509 vm_offset_t va; 1510 1511 if (bootverbose) { 1512 kprintf("isolate %016jx-%016jx (%zd)\n", 1513 base, base + bytes, bytes); 1514 } 1515 va = base & ~(vm_offset_t)PAGE_MASK; 1516 while (va < base + bytes) { 1517 if ((va & PDRMASK) == 0 && va + NBPDR <= base + bytes && 1518 (ptep = pmap_pt(&kernel_pmap, va)) != NULL && 1519 (*ptep & kernel_pmap.pmap_bits[PG_V_IDX]) && 1520 (*ptep & kernel_pmap.pmap_bits[PG_PS_IDX])) { 1521 /* 1522 * Use 2MB pages if possible 1523 */ 1524 pte = *ptep; 1525 pv = pmap_allocpte(&iso_pmap, pmap_pd_pindex(va), &pvp); 1526 ptep = pv_pte_lookup(pv, (va >> PDRSHIFT) & 511); 1527 *ptep = pte; 1528 va += NBPDR; 1529 } else { 1530 /* 1531 * Otherwise use 4KB pages 1532 */ 1533 pv = pmap_allocpte(&iso_pmap, pmap_pt_pindex(va), &pvp); 1534 ptep = pv_pte_lookup(pv, (va >> PAGE_SHIFT) & 511); 1535 *ptep = vtophys(va) | kernel_pmap.pmap_bits[PG_RW_IDX] | 1536 kernel_pmap.pmap_bits[PG_V_IDX] | 1537 kernel_pmap.pmap_bits[PG_A_IDX] | 1538 kernel_pmap.pmap_bits[PG_M_IDX]; 1539 1540 va += PAGE_SIZE; 1541 } 1542 pv_put(pv); 1543 pv_put(pvp); 1544 } 1545 } 1546 1547 #if 0 1548 /* 1549 * Useful debugging pmap dumper, do not remove (#if 0 when not in use) 1550 */ 1551 static 1552 void 1553 dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base) 1554 { 1555 pt_entry_t *ptp; 1556 vm_offset_t incr; 1557 int i; 1558 1559 switch(level) { 1560 case 0: /* PML4e page, 512G entries */ 1561 incr = (1LL << 48) / 512; 1562 break; 1563 case 1: /* PDP page, 1G entries */ 1564 incr = (1LL << 39) / 512; 1565 break; 1566 case 2: /* PD page, 2MB entries */ 1567 incr = (1LL << 30) / 512; 1568 break; 1569 case 3: /* PT page, 4KB entries */ 1570 incr = (1LL << 21) / 512; 1571 break; 1572 default: 1573 incr = 0; 1574 break; 1575 } 1576 1577 if (level == 0) 1578 kprintf("cr3 %016jx @ va=%016jx\n", pte, base); 1579 ptp = (void *)PHYS_TO_DMAP(pte & ~(pt_entry_t)PAGE_MASK); 1580 for (i = 0; i < 512; ++i) { 1581 if (level == 0 && i == 128) 1582 base += 0xFFFF000000000000LLU; 1583 if (ptp[i]) { 1584 kprintf("%*.*s ", level * 4, level * 4, ""); 1585 if (level == 1 && (ptp[i] & 0x180) == 0x180) 
{
				kprintf("va=%016jx %3d term %016jx (1GB)\n",
					base, i, ptp[i]);
			} else if (level == 2 && (ptp[i] & 0x180) == 0x180) {
				kprintf("va=%016jx %3d term %016jx (2MB)\n",
					base, i, ptp[i]);
			} else if (level == 3) {
				kprintf("va=%016jx %3d term %016jx\n",
					base, i, ptp[i]);
			} else {
				kprintf("va=%016jx %3d deep %016jx\n",
					base, i, ptp[i]);
				dump_pmap(pmap, ptp[i], level + 1, base);
			}
		}
		base += incr;
	}
}

#endif

/*
 * Typically used to initialize a fictitious page by vm/device_pager.c
 */
void
pmap_page_init(struct vm_page *m)
{
	vm_page_init(m);
	m->md.pmap_count = 0;
	m->md.writeable_count = 0;
}

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Extract the physical page address associated with the map/VA pair.
 * The page must be wired for this to work reliably.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep)
{
	vm_paddr_t rtval;
	pv_entry_t pt_pv;
	pt_entry_t *ptep;

	rtval = 0;
	if (va >= VM_MAX_USER_ADDRESS) {
		/*
		 * Kernel page directories might be direct-mapped and
		 * there is typically no PV tracking of pte's
		 */
		pd_entry_t *pt;

		pt = pmap_pt(pmap, va);
		if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) {
			if (*pt & pmap->pmap_bits[PG_PS_IDX]) {
				rtval = *pt & PG_PS_FRAME;
				rtval |= va & PDRMASK;
			} else {
				ptep = pmap_pt_to_pte(*pt, va);
				if (*pt & pmap->pmap_bits[PG_V_IDX]) {
					rtval = *ptep & PG_FRAME;
					rtval |= va & PAGE_MASK;
				}
			}
		}
		if (handlep)
			*handlep = NULL;
	} else {
		/*
		 * User pages currently do not direct-map the page directory
		 * and some pages might not use managed PVs.  But all PT's
		 * will have a PV.
		 */
		pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL);
		if (pt_pv) {
			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
				rtval = *ptep & PG_FRAME;
				rtval |= va & PAGE_MASK;
			}
			if (handlep)
				*handlep = pt_pv;	/* locked until done */
			else
				pv_put (pt_pv);
		} else if (handlep) {
			*handlep = NULL;
		}
	}
	return rtval;
}

void
pmap_extract_done(void *handle)
{
	if (handle)
		pv_put((pv_entry_t)handle);
}

/*
 * Similar to extract but checks protections, SMP-friendly short-cut for
 * vm_fault_page[_quick]().  Can return NULL to cause the caller to
 * fall-through to the real fault code.  Does not work with HVM page
 * tables.
 *
 * if busyp is NULL the returned page, if not NULL, is held (and not busied).
 *
 * If busyp is not NULL and this function sets *busyp non-zero, the returned
 * page is busied (and not held).
 *
 * If busyp is not NULL and this function sets *busyp to zero, the returned
 * page is held (and not busied).
 *
 * If VM_PROT_WRITE is set in prot, and the pte is already writable, the
 * returned page will be dirtied.  If the pte is not already writable NULL
 * is returned.  In other words, if the bit is set and a vm_page_t is
 * returned, any COW will already have happened and that page can be
 * written by the caller.
 *
 * WARNING!
THE RETURNED PAGE IS ONLY HELD AND NOT SUITABLE FOR READING 1707 * OR WRITING AS-IS. 1708 */ 1709 vm_page_t 1710 pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot, int *busyp) 1711 { 1712 if (pmap && 1713 va < VM_MAX_USER_ADDRESS && 1714 (pmap->pm_flags & PMAP_HVM) == 0) { 1715 pv_entry_t pt_pv; 1716 pv_entry_t pte_pv; 1717 pt_entry_t *ptep; 1718 pt_entry_t req; 1719 vm_page_t m; 1720 int error; 1721 1722 req = pmap->pmap_bits[PG_V_IDX] | 1723 pmap->pmap_bits[PG_U_IDX]; 1724 if (prot & VM_PROT_WRITE) 1725 req |= pmap->pmap_bits[PG_RW_IDX]; 1726 1727 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 1728 if (pt_pv == NULL) 1729 return (NULL); 1730 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1731 if ((*ptep & req) != req) { 1732 pv_put(pt_pv); 1733 return (NULL); 1734 } 1735 pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), NULL, &error); 1736 if (pte_pv && error == 0) { 1737 m = pte_pv->pv_m; 1738 if (prot & VM_PROT_WRITE) { 1739 /* interlocked by presence of pv_entry */ 1740 vm_page_dirty(m); 1741 } 1742 if (busyp) { 1743 if (prot & VM_PROT_WRITE) { 1744 if (vm_page_busy_try(m, TRUE)) 1745 m = NULL; 1746 *busyp = 1; 1747 } else { 1748 vm_page_hold(m); 1749 *busyp = 0; 1750 } 1751 } else { 1752 vm_page_hold(m); 1753 } 1754 pv_put(pte_pv); 1755 } else if (pte_pv) { 1756 pv_drop(pte_pv); 1757 m = NULL; 1758 } else { 1759 /* error, since we didn't request a placemarker */ 1760 m = NULL; 1761 } 1762 pv_put(pt_pv); 1763 return(m); 1764 } else { 1765 return(NULL); 1766 } 1767 } 1768 1769 /* 1770 * Extract the physical page address associated kernel virtual address. 1771 */ 1772 vm_paddr_t 1773 pmap_kextract(vm_offset_t va) 1774 { 1775 pd_entry_t pt; /* pt entry in pd */ 1776 vm_paddr_t pa; 1777 1778 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1779 pa = DMAP_TO_PHYS(va); 1780 } else { 1781 pt = *vtopt(va); 1782 if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) { 1783 pa = (pt & PG_PS_FRAME) | (va & PDRMASK); 1784 } else { 1785 /* 1786 * Beware of a concurrent promotion that changes the 1787 * PDE at this point! For example, vtopte() must not 1788 * be used to access the PTE because it would use the 1789 * new PDE. It is, however, safe to use the old PDE 1790 * because the page table page is preserved by the 1791 * promotion. 1792 */ 1793 pa = *pmap_pt_to_pte(pt, va); 1794 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1795 } 1796 } 1797 return pa; 1798 } 1799 1800 /*************************************************** 1801 * Low level mapping routines..... 1802 ***************************************************/ 1803 1804 /* 1805 * Routine: pmap_kenter 1806 * Function: 1807 * Add a wired page to the KVA 1808 * NOTE! note that in order for the mapping to take effect -- you 1809 * should do an invltlb after doing the pmap_kenter(). 1810 */ 1811 void 1812 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1813 { 1814 pt_entry_t *ptep; 1815 pt_entry_t npte; 1816 1817 npte = pa | 1818 kernel_pmap.pmap_bits[PG_RW_IDX] | 1819 kernel_pmap.pmap_bits[PG_V_IDX]; 1820 // pgeflag; 1821 ptep = vtopte(va); 1822 #if 1 1823 pmap_inval_smp(&kernel_pmap, va, 1, ptep, npte); 1824 #else 1825 /* FUTURE */ 1826 if (*ptep) 1827 pmap_inval_smp(&kernel_pmap, va, ptep, npte); 1828 else 1829 *ptep = npte; 1830 #endif 1831 } 1832 1833 /* 1834 * Similar to pmap_kenter(), except we only invalidate the mapping on the 1835 * current CPU. Returns 0 if the previous pte was 0, 1 if it wasn't 1836 * (caller can conditionalize calling smp_invltlb()). 
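 *
 * Illustrative use (sketch):
 *	if (pmap_kenter_quick(va, pa))
 *		smp_invltlb();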
1837 */ 1838 int 1839 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 1840 { 1841 pt_entry_t *ptep; 1842 pt_entry_t npte; 1843 int res; 1844 1845 npte = pa | kernel_pmap.pmap_bits[PG_RW_IDX] | 1846 kernel_pmap.pmap_bits[PG_V_IDX]; 1847 // npte |= pgeflag; 1848 ptep = vtopte(va); 1849 #if 1 1850 res = 1; 1851 #else 1852 /* FUTURE */ 1853 res = (*ptep != 0); 1854 #endif 1855 atomic_swap_long(ptep, npte); 1856 cpu_invlpg((void *)va); 1857 1858 return res; 1859 } 1860 1861 /* 1862 * Enter addresses into the kernel pmap but don't bother 1863 * doing any tlb invalidations. Caller will do a rollup 1864 * invalidation via pmap_rollup_inval(). 1865 */ 1866 int 1867 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 1868 { 1869 pt_entry_t *ptep; 1870 pt_entry_t npte; 1871 int res; 1872 1873 npte = pa | 1874 kernel_pmap.pmap_bits[PG_RW_IDX] | 1875 kernel_pmap.pmap_bits[PG_V_IDX]; 1876 // pgeflag; 1877 ptep = vtopte(va); 1878 #if 1 1879 res = 1; 1880 #else 1881 /* FUTURE */ 1882 res = (*ptep != 0); 1883 #endif 1884 atomic_swap_long(ptep, npte); 1885 cpu_invlpg((void *)va); 1886 1887 return res; 1888 } 1889 1890 /* 1891 * remove a page from the kernel pagetables 1892 */ 1893 void 1894 pmap_kremove(vm_offset_t va) 1895 { 1896 pt_entry_t *ptep; 1897 1898 ptep = vtopte(va); 1899 pmap_inval_smp(&kernel_pmap, va, 1, ptep, 0); 1900 } 1901 1902 void 1903 pmap_kremove_quick(vm_offset_t va) 1904 { 1905 pt_entry_t *ptep; 1906 1907 ptep = vtopte(va); 1908 (void)pte_load_clear(ptep); 1909 cpu_invlpg((void *)va); 1910 } 1911 1912 /* 1913 * Remove addresses from the kernel pmap but don't bother 1914 * doing any tlb invalidations. Caller will do a rollup 1915 * invalidation via pmap_rollup_inval(). 1916 */ 1917 void 1918 pmap_kremove_noinval(vm_offset_t va) 1919 { 1920 pt_entry_t *ptep; 1921 1922 ptep = vtopte(va); 1923 (void)pte_load_clear(ptep); 1924 } 1925 1926 /* 1927 * XXX these need to be recoded. They are not used in any critical path. 1928 */ 1929 void 1930 pmap_kmodify_rw(vm_offset_t va) 1931 { 1932 atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]); 1933 cpu_invlpg((void *)va); 1934 } 1935 1936 /* NOT USED 1937 void 1938 pmap_kmodify_nc(vm_offset_t va) 1939 { 1940 atomic_set_long(vtopte(va), PG_N); 1941 cpu_invlpg((void *)va); 1942 } 1943 */ 1944 1945 /* 1946 * Used to map a range of physical addresses into kernel virtual 1947 * address space during the low level boot, typically to map the 1948 * dump bitmap, message buffer, and vm_page_array. 1949 * 1950 * These mappings are typically made at some pointer after the end of the 1951 * kernel text+data. 1952 * 1953 * We could return PHYS_TO_DMAP(start) here and not allocate any 1954 * via (*virtp), but then kmem from userland and kernel dumps won't 1955 * have access to the related pointers. 1956 */ 1957 vm_offset_t 1958 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 1959 { 1960 vm_offset_t va; 1961 vm_offset_t va_start; 1962 1963 /*return PHYS_TO_DMAP(start);*/ 1964 1965 va_start = *virtp; 1966 va = va_start; 1967 1968 while (start < end) { 1969 pmap_kenter_quick(va, start); 1970 va += PAGE_SIZE; 1971 start += PAGE_SIZE; 1972 } 1973 *virtp = va; 1974 return va_start; 1975 } 1976 1977 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1978 1979 /* 1980 * Remove the specified set of pages from the data and instruction caches. 
1981 * 1982 * In contrast to pmap_invalidate_cache_range(), this function does not 1983 * rely on the CPU's self-snoop feature, because it is intended for use 1984 * when moving pages into a different cache domain. 1985 */ 1986 void 1987 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1988 { 1989 vm_offset_t daddr, eva; 1990 int i; 1991 1992 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1993 (cpu_feature & CPUID_CLFSH) == 0) 1994 wbinvd(); 1995 else { 1996 cpu_mfence(); 1997 for (i = 0; i < count; i++) { 1998 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1999 eva = daddr + PAGE_SIZE; 2000 for (; daddr < eva; daddr += cpu_clflush_line_size) 2001 clflush(daddr); 2002 } 2003 cpu_mfence(); 2004 } 2005 } 2006 2007 void 2008 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 2009 { 2010 KASSERT((sva & PAGE_MASK) == 0, 2011 ("pmap_invalidate_cache_range: sva not page-aligned")); 2012 KASSERT((eva & PAGE_MASK) == 0, 2013 ("pmap_invalidate_cache_range: eva not page-aligned")); 2014 2015 if (cpu_feature & CPUID_SS) { 2016 ; /* If "Self Snoop" is supported, do nothing. */ 2017 } else { 2018 /* Globally invalidate caches */ 2019 cpu_wbinvd_on_all_cpus(); 2020 } 2021 } 2022 2023 /* 2024 * Invalidate the specified range of virtual memory on all cpus associated 2025 * with the pmap. 2026 */ 2027 void 2028 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2029 { 2030 pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0); 2031 } 2032 2033 /* 2034 * Add a list of wired pages to the kva. This routine is used for temporary 2035 * kernel mappings such as those found in buffer cache buffer. Page 2036 * modifications and accesses are not tracked or recorded. 2037 * 2038 * NOTE! Old mappings are simply overwritten, and we cannot assume relaxed 2039 * semantics as previous mappings may have been zerod without any 2040 * invalidation. 2041 * 2042 * The page *must* be wired. 2043 */ 2044 static __inline void 2045 _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval) 2046 { 2047 vm_offset_t end_va; 2048 vm_offset_t va; 2049 2050 end_va = beg_va + count * PAGE_SIZE; 2051 2052 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2053 pt_entry_t pte; 2054 pt_entry_t *ptep; 2055 2056 ptep = vtopte(va); 2057 pte = VM_PAGE_TO_PHYS(*m) | 2058 kernel_pmap.pmap_bits[PG_RW_IDX] | 2059 kernel_pmap.pmap_bits[PG_V_IDX] | 2060 kernel_pmap.pmap_cache_bits[(*m)->pat_mode]; 2061 // pgeflag; 2062 atomic_swap_long(ptep, pte); 2063 m++; 2064 } 2065 if (doinval) 2066 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 2067 } 2068 2069 void 2070 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 2071 { 2072 _pmap_qenter(beg_va, m, count, 1); 2073 } 2074 2075 void 2076 pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count) 2077 { 2078 _pmap_qenter(beg_va, m, count, 0); 2079 } 2080 2081 /* 2082 * This routine jerks page mappings from the kernel -- it is meant only 2083 * for temporary mappings such as those found in buffer cache buffers. 2084 * No recording modified or access status occurs. 
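 *
 * A minimal pairing sketch for a temporary mapping (illustrative only;
 * the kva reservation and wired page array shown are assumed to already
 * exist):
 *
 *	pmap_qenter(kva, mpages, npages);
 *	... access the pages through kva ...
 *	pmap_qremove(kva, npages);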
2085 * 2086 * MPSAFE, INTERRUPT SAFE (cluster callback) 2087 */ 2088 void 2089 pmap_qremove(vm_offset_t beg_va, int count) 2090 { 2091 vm_offset_t end_va; 2092 vm_offset_t va; 2093 2094 end_va = beg_va + count * PAGE_SIZE; 2095 2096 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2097 pt_entry_t *pte; 2098 2099 pte = vtopte(va); 2100 (void)pte_load_clear(pte); 2101 cpu_invlpg((void *)va); 2102 } 2103 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 2104 } 2105 2106 /* 2107 * This routine removes temporary kernel mappings, only invalidating them 2108 * on the current cpu. It should only be used under carefully controlled 2109 * conditions. 2110 */ 2111 void 2112 pmap_qremove_quick(vm_offset_t beg_va, int count) 2113 { 2114 vm_offset_t end_va; 2115 vm_offset_t va; 2116 2117 end_va = beg_va + count * PAGE_SIZE; 2118 2119 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2120 pt_entry_t *pte; 2121 2122 pte = vtopte(va); 2123 (void)pte_load_clear(pte); 2124 cpu_invlpg((void *)va); 2125 } 2126 } 2127 2128 /* 2129 * This routine removes temporary kernel mappings *without* invalidating 2130 * the TLB. It can only be used on permanent kva reservations such as those 2131 * found in buffer cache buffers, under carefully controlled circumstances. 2132 * 2133 * NOTE: Repopulating these KVAs requires unconditional invalidation. 2134 * (pmap_qenter() does unconditional invalidation). 2135 */ 2136 void 2137 pmap_qremove_noinval(vm_offset_t beg_va, int count) 2138 { 2139 vm_offset_t end_va; 2140 vm_offset_t va; 2141 2142 end_va = beg_va + count * PAGE_SIZE; 2143 2144 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2145 pt_entry_t *pte; 2146 2147 pte = vtopte(va); 2148 (void)pte_load_clear(pte); 2149 } 2150 } 2151 2152 /* 2153 * Create a new thread and optionally associate it with a (new) process. 2154 * NOTE! the new thread's cpu may not equal the current cpu. 2155 */ 2156 void 2157 pmap_init_thread(thread_t td) 2158 { 2159 /* enforce pcb placement & alignment */ 2160 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 2161 td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF); 2162 td->td_savefpu = &td->td_pcb->pcb_save; 2163 td->td_sp = (char *)td->td_pcb; /* no -16 */ 2164 } 2165 2166 /* 2167 * This routine directly affects the fork perf for a process. 2168 */ 2169 void 2170 pmap_init_proc(struct proc *p) 2171 { 2172 } 2173 2174 static void 2175 pmap_pinit_defaults(struct pmap *pmap) 2176 { 2177 bcopy(pmap_bits_default, pmap->pmap_bits, 2178 sizeof(pmap_bits_default)); 2179 bcopy(protection_codes, pmap->protection_codes, 2180 sizeof(protection_codes)); 2181 bcopy(pat_pte_index, pmap->pmap_cache_bits, 2182 sizeof(pat_pte_index)); 2183 pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT; 2184 pmap->copyinstr = std_copyinstr; 2185 pmap->copyin = std_copyin; 2186 pmap->copyout = std_copyout; 2187 pmap->fubyte = std_fubyte; 2188 pmap->subyte = std_subyte; 2189 pmap->fuword32 = std_fuword32; 2190 pmap->fuword64 = std_fuword64; 2191 pmap->suword32 = std_suword32; 2192 pmap->suword64 = std_suword64; 2193 pmap->swapu32 = std_swapu32; 2194 pmap->swapu64 = std_swapu64; 2195 pmap->fuwordadd32 = std_fuwordadd32; 2196 pmap->fuwordadd64 = std_fuwordadd64; 2197 } 2198 /* 2199 * Initialize pmap0/vmspace0. 2200 * 2201 * On architectures where the kernel pmap is not integrated into the user 2202 * process pmap, this pmap represents the process pmap, not the kernel pmap. 2203 * kernel_pmap should be used to directly access the kernel_pmap. 
2204 */ 2205 void 2206 pmap_pinit0(struct pmap *pmap) 2207 { 2208 int i; 2209 2210 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 2211 pmap->pm_count = 1; 2212 CPUMASK_ASSZERO(pmap->pm_active); 2213 pmap->pm_pvhint_pt = NULL; 2214 pmap->pm_pvhint_unused = NULL; 2215 RB_INIT(&pmap->pm_pvroot); 2216 spin_init(&pmap->pm_spin, "pmapinit0"); 2217 for (i = 0; i < PM_PLACEMARKS; ++i) 2218 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 2219 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2220 pmap_pinit_defaults(pmap); 2221 } 2222 2223 /* 2224 * Initialize a preallocated and zeroed pmap structure, 2225 * such as one in a vmspace structure. 2226 */ 2227 static void 2228 pmap_pinit_simple(struct pmap *pmap) 2229 { 2230 int i; 2231 2232 /* 2233 * Misc initialization 2234 */ 2235 pmap->pm_count = 1; 2236 CPUMASK_ASSZERO(pmap->pm_active); 2237 pmap->pm_pvhint_pt = NULL; 2238 pmap->pm_pvhint_unused = NULL; 2239 pmap->pm_flags = PMAP_FLAG_SIMPLE; 2240 2241 pmap_pinit_defaults(pmap); 2242 2243 /* 2244 * Don't blow up locks/tokens on re-use (XXX fix/use drop code 2245 * for this). 2246 */ 2247 if (pmap->pm_pmlpv == NULL) { 2248 RB_INIT(&pmap->pm_pvroot); 2249 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2250 spin_init(&pmap->pm_spin, "pmapinitsimple"); 2251 for (i = 0; i < PM_PLACEMARKS; ++i) 2252 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 2253 } 2254 } 2255 2256 void 2257 pmap_pinit(struct pmap *pmap) 2258 { 2259 pv_entry_t pv; 2260 int j; 2261 2262 if (pmap->pm_pmlpv) { 2263 if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) { 2264 pmap_puninit(pmap); 2265 } 2266 } 2267 2268 pmap_pinit_simple(pmap); 2269 pmap->pm_flags &= ~PMAP_FLAG_SIMPLE; 2270 2271 /* 2272 * No need to allocate page table space yet but we do need a valid 2273 * page directory table. 2274 */ 2275 if (pmap->pm_pml4 == NULL) { 2276 pmap->pm_pml4 = 2277 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, 2278 PAGE_SIZE * 2, 2279 VM_SUBSYS_PML4); 2280 pmap->pm_pml4_iso = (void *)((char *)pmap->pm_pml4 + PAGE_SIZE); 2281 } 2282 2283 /* 2284 * Allocate the PML4e table, which wires it even though it isn't 2285 * being entered into some higher level page table (it being the 2286 * highest level). If one is already cached we don't have to do 2287 * anything. 2288 */ 2289 if ((pv = pmap->pm_pmlpv) == NULL) { 2290 pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2291 pmap->pm_pmlpv = pv; 2292 pmap_kenter((vm_offset_t)pmap->pm_pml4, 2293 VM_PAGE_TO_PHYS(pv->pv_m)); 2294 pv_put(pv); 2295 2296 /* 2297 * Install DMAP and KMAP. 2298 */ 2299 for (j = 0; j < NDMPML4E; ++j) { 2300 pmap->pm_pml4[DMPML4I + j] = 2301 (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 2302 pmap->pmap_bits[PG_RW_IDX] | 2303 pmap->pmap_bits[PG_V_IDX] | 2304 pmap->pmap_bits[PG_A_IDX]; 2305 } 2306 for (j = 0; j < NKPML4E; ++j) { 2307 pmap->pm_pml4[KPML4I + j] = 2308 (KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 2309 pmap->pmap_bits[PG_RW_IDX] | 2310 pmap->pmap_bits[PG_V_IDX] | 2311 pmap->pmap_bits[PG_A_IDX]; 2312 } 2313 2314 /* 2315 * install self-referential address mapping entry 2316 */ 2317 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) | 2318 pmap->pmap_bits[PG_V_IDX] | 2319 pmap->pmap_bits[PG_RW_IDX] | 2320 pmap->pmap_bits[PG_A_IDX]; 2321 } else { 2322 KKASSERT(pv->pv_m->flags & PG_MAPPED); 2323 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 2324 } 2325 KKASSERT(pmap->pm_pml4[255] == 0); 2326 2327 /* 2328 * When implementing an isolated userland pmap, a second PML4e table 2329 * is needed. 
We use pmap_pml4_pindex() + 1 for convenience, but 2330 * note that we do not operate on this table using our API functions 2331 * so handling of the + 1 case is mostly just to prevent implosions. 2332 * 2333 * We install an isolated version of the kernel PDPs into this 2334 * second PML4e table. The pmap code will mirror all user PDPs 2335 * between the primary and secondary PML4e table. 2336 */ 2337 if ((pv = pmap->pm_pmlpv_iso) == NULL && meltdown_mitigation && 2338 pmap != &iso_pmap) { 2339 pv = pmap_allocpte(pmap, pmap_pml4_pindex() + 1, NULL); 2340 pmap->pm_pmlpv_iso = pv; 2341 pmap_kenter((vm_offset_t)pmap->pm_pml4_iso, 2342 VM_PAGE_TO_PHYS(pv->pv_m)); 2343 pv_put(pv); 2344 2345 /* 2346 * Install an isolated version of the kernel pmap for 2347 * user consumption, using PDPs constructed in iso_pmap. 2348 */ 2349 for (j = 0; j < NKPML4E; ++j) { 2350 pmap->pm_pml4_iso[KPML4I + j] = 2351 iso_pmap.pm_pml4[KPML4I + j]; 2352 } 2353 } else if (pv) { 2354 KKASSERT(pv->pv_m->flags & PG_MAPPED); 2355 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 2356 } 2357 } 2358 2359 /* 2360 * Clean up a pmap structure so it can be physically freed. This routine 2361 * is called by the vmspace dtor function. A great deal of pmap data is 2362 * left passively mapped to improve vmspace management so we have a bit 2363 * of cleanup work to do here. 2364 */ 2365 void 2366 pmap_puninit(pmap_t pmap) 2367 { 2368 pv_entry_t pv; 2369 vm_page_t p; 2370 2371 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 2372 if ((pv = pmap->pm_pmlpv) != NULL) { 2373 if (pv_hold_try(pv) == 0) 2374 pv_lock(pv); 2375 KKASSERT(pv == pmap->pm_pmlpv); 2376 p = pmap_remove_pv_page(pv); 2377 pv_free(pv, NULL); 2378 pv = NULL; /* safety */ 2379 pmap_kremove((vm_offset_t)pmap->pm_pml4); 2380 vm_page_busy_wait(p, FALSE, "pgpun"); 2381 KKASSERT(p->flags & PG_UNQUEUED); 2382 vm_page_unwire(p, 0); 2383 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2384 vm_page_free(p); 2385 pmap->pm_pmlpv = NULL; 2386 } 2387 if ((pv = pmap->pm_pmlpv_iso) != NULL) { 2388 if (pv_hold_try(pv) == 0) 2389 pv_lock(pv); 2390 KKASSERT(pv == pmap->pm_pmlpv_iso); 2391 p = pmap_remove_pv_page(pv); 2392 pv_free(pv, NULL); 2393 pv = NULL; /* safety */ 2394 pmap_kremove((vm_offset_t)pmap->pm_pml4_iso); 2395 vm_page_busy_wait(p, FALSE, "pgpun"); 2396 KKASSERT(p->flags & PG_UNQUEUED); 2397 vm_page_unwire(p, 0); 2398 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2399 vm_page_free(p); 2400 pmap->pm_pmlpv_iso = NULL; 2401 } 2402 if (pmap->pm_pml4) { 2403 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 2404 kmem_free(&kernel_map, 2405 (vm_offset_t)pmap->pm_pml4, PAGE_SIZE * 2); 2406 pmap->pm_pml4 = NULL; 2407 pmap->pm_pml4_iso = NULL; 2408 } 2409 KKASSERT(pmap->pm_stats.resident_count == 0); 2410 KKASSERT(pmap->pm_stats.wired_count == 0); 2411 } 2412 2413 /* 2414 * This function is now unused (used to add the pmap to the pmap_list) 2415 */ 2416 void 2417 pmap_pinit2(struct pmap *pmap) 2418 { 2419 } 2420 2421 /* 2422 * This routine is called when various levels in the page table need to 2423 * be populated. This routine cannot fail. 2424 * 2425 * This function returns two locked pv_entry's, one representing the 2426 * requested pv and one representing the requested pv's parent pv. If 2427 * an intermediate page table does not exist it will be created, mapped, 2428 * wired, and the parent page table will be given an additional hold 2429 * count representing the presence of the child pv_entry. 
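 *
 * As a rough orientation, the pindex ranges tested below imply the page
 * table level (a sketch, not an exact formula): indices below
 * pmap_pt_pindex(0) are terminal PTEs (placemarkers only), then PT pages
 * up to pmap_pd_pindex(0), PD pages up to pmap_pdp_pindex(0), PDP pages
 * below pmap_pml4_pindex(), and pmap_pml4_pindex() itself for the root
 * PML4 page.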
2430 */ 2431 static 2432 pv_entry_t 2433 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) 2434 { 2435 pt_entry_t *ptep; 2436 pt_entry_t *ptep_iso; 2437 pv_entry_t pv; 2438 pv_entry_t pvp; 2439 pt_entry_t v; 2440 vm_page_t m; 2441 int isnew; 2442 int ispt; 2443 2444 /* 2445 * If the pv already exists and we aren't being asked for the 2446 * parent page table page we can just return it. A locked+held pv 2447 * is returned. The pv will also have a second hold related to the 2448 * pmap association that we don't have to worry about. 2449 */ 2450 ispt = 0; 2451 pv = pv_alloc(pmap, ptepindex, &isnew); 2452 if (isnew == 0 && pvpp == NULL) 2453 return(pv); 2454 2455 /* 2456 * DragonFly doesn't use PV's to represent terminal PTEs any more. 2457 * The index range is still used for placemarkers, but not for 2458 * actual pv_entry's. 2459 */ 2460 KKASSERT(ptepindex >= pmap_pt_pindex(0)); 2461 2462 /* 2463 * Note that pt_pv's are only returned for user VAs. We assert that 2464 * a pt_pv is not being requested for kernel VAs. The kernel 2465 * pre-wires all higher-level page tables so don't overload managed 2466 * higher-level page tables on top of it! 2467 * 2468 * However, its convenient for us to allow the case when creating 2469 * iso_pmap. This is a bit of a hack but it simplifies iso_pmap 2470 * a lot. 2471 */ 2472 2473 /* 2474 * The kernel never uses managed PT/PD/PDP pages. 2475 */ 2476 KKASSERT(pmap != &kernel_pmap); 2477 2478 /* 2479 * Non-terminal PVs allocate a VM page to represent the page table, 2480 * so we have to resolve pvp and calculate ptepindex for the pvp 2481 * and then for the page table entry index in the pvp for 2482 * fall-through. 2483 */ 2484 if (ptepindex < pmap_pd_pindex(0)) { 2485 /* 2486 * pv is PT, pvp is PD 2487 */ 2488 ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT; 2489 ptepindex += NUPTE_TOTAL + NUPT_TOTAL; 2490 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2491 2492 /* 2493 * PT index in PD 2494 */ 2495 ptepindex = pv->pv_pindex - pmap_pt_pindex(0); 2496 ptepindex &= ((1ul << NPDEPGSHIFT) - 1); 2497 ispt = 1; 2498 } else if (ptepindex < pmap_pdp_pindex(0)) { 2499 /* 2500 * pv is PD, pvp is PDP 2501 * 2502 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above 2503 * the PD. 2504 */ 2505 ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; 2506 ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2507 2508 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 2509 KKASSERT(pvpp == NULL); 2510 pvp = NULL; 2511 } else { 2512 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2513 } 2514 2515 /* 2516 * PD index in PDP 2517 */ 2518 ptepindex = pv->pv_pindex - pmap_pd_pindex(0); 2519 ptepindex &= ((1ul << NPDPEPGSHIFT) - 1); 2520 } else if (ptepindex < pmap_pml4_pindex()) { 2521 /* 2522 * pv is PDP, pvp is the root pml4 table 2523 */ 2524 pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2525 2526 /* 2527 * PDP index in PML4 2528 */ 2529 ptepindex = pv->pv_pindex - pmap_pdp_pindex(0); 2530 ptepindex &= ((1ul << NPML4EPGSHIFT) - 1); 2531 } else { 2532 /* 2533 * pv represents the top-level PML4, there is no parent. 2534 */ 2535 pvp = NULL; 2536 } 2537 2538 if (isnew == 0) 2539 goto notnew; 2540 2541 /* 2542 * (isnew) is TRUE, pv is not terminal. 2543 * 2544 * (1) Add a wire count to the parent page table (pvp). 2545 * (2) Allocate a VM page for the page table. 2546 * (3) Enter the VM page into the parent page table. 2547 * 2548 * page table pages are marked PG_WRITEABLE and PG_MAPPED. 
2549 */ 2550 if (pvp) 2551 vm_page_wire_quick(pvp->pv_m); 2552 2553 for (;;) { 2554 m = vm_page_alloc(NULL, pv->pv_pindex, 2555 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 2556 VM_ALLOC_INTERRUPT); 2557 if (m) 2558 break; 2559 vm_wait(0); 2560 } 2561 vm_page_wire(m); /* wire for mapping in parent */ 2562 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 2563 m->valid = VM_PAGE_BITS_ALL; 2564 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE | PG_UNQUEUED); 2565 KKASSERT(m->queue == PQ_NONE); 2566 2567 pv->pv_m = m; 2568 2569 /* 2570 * (isnew) is TRUE, pv is not terminal. 2571 * 2572 * Wire the page into pvp. Bump the resident_count for the pmap. 2573 * There is no pvp for the top level, address the pm_pml4[] array 2574 * directly. 2575 * 2576 * If the caller wants the parent we return it, otherwise 2577 * we just put it away. 2578 * 2579 * No interlock is needed for pte 0 -> non-zero. 2580 * 2581 * In the situation where *ptep is valid we might have an unmanaged 2582 * page table page shared from another page table which we need to 2583 * unshare before installing our private page table page. 2584 */ 2585 if (pvp) { 2586 v = VM_PAGE_TO_PHYS(m) | 2587 (pmap->pmap_bits[PG_RW_IDX] | 2588 pmap->pmap_bits[PG_V_IDX] | 2589 pmap->pmap_bits[PG_A_IDX]); 2590 if (ptepindex < NUPTE_USER) 2591 v |= pmap->pmap_bits[PG_U_IDX]; 2592 if (ptepindex < pmap_pt_pindex(0)) 2593 v |= pmap->pmap_bits[PG_M_IDX]; 2594 2595 ptep = pv_pte_lookup(pvp, ptepindex); 2596 if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) 2597 ptep_iso = pv_pte_lookup(pmap->pm_pmlpv_iso, ptepindex); 2598 else 2599 ptep_iso = NULL; 2600 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 2601 panic("pmap_allocpte: ptpte present without pv_entry!"); 2602 } else { 2603 pt_entry_t pte; 2604 2605 pte = atomic_swap_long(ptep, v); 2606 if (ptep_iso) 2607 atomic_swap_long(ptep_iso, v); 2608 if (pte != 0) { 2609 kprintf("install pgtbl mixup 0x%016jx " 2610 "old/new 0x%016jx/0x%016jx\n", 2611 (intmax_t)ptepindex, pte, v); 2612 } 2613 } 2614 } 2615 vm_page_wakeup(m); 2616 2617 /* 2618 * (isnew) may be TRUE or FALSE, pv may or may not be terminal. 2619 */ 2620 notnew: 2621 if (pvp) { 2622 KKASSERT(pvp->pv_m != NULL); 2623 ptep = pv_pte_lookup(pvp, ptepindex); 2624 v = VM_PAGE_TO_PHYS(pv->pv_m) | 2625 (pmap->pmap_bits[PG_RW_IDX] | 2626 pmap->pmap_bits[PG_V_IDX] | 2627 pmap->pmap_bits[PG_A_IDX]); 2628 if (ptepindex < NUPTE_USER) 2629 v |= pmap->pmap_bits[PG_U_IDX]; 2630 if (ptepindex < pmap_pt_pindex(0)) 2631 v |= pmap->pmap_bits[PG_M_IDX]; 2632 if (*ptep != v) { 2633 kprintf("mismatched upper level pt %016jx/%016jx\n", 2634 *ptep, v); 2635 } 2636 } 2637 if (pvpp) 2638 *pvpp = pvp; 2639 else if (pvp) 2640 pv_put(pvp); 2641 return (pv); 2642 } 2643 2644 /* 2645 * Release any resources held by the given physical map. 2646 * 2647 * Called when a pmap initialized by pmap_pinit is being released. Should 2648 * only be called if the map contains no valid mappings. 2649 */ 2650 struct pmap_release_info { 2651 pmap_t pmap; 2652 int retry; 2653 pv_entry_t pvp; 2654 }; 2655 2656 static int pmap_release_callback(pv_entry_t pv, void *data); 2657 2658 void 2659 pmap_release(struct pmap *pmap) 2660 { 2661 struct pmap_release_info info; 2662 2663 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 2664 ("pmap still active! %016jx", 2665 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 2666 2667 /* 2668 * There is no longer a pmap_list, if there were we would remove the 2669 * pmap from it here. 2670 */ 2671 2672 /* 2673 * Pull pv's off the RB tree in order from low to high and release 2674 * each page. 
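 *
 * pmap_release_callback() sets info.retry when it loses a locking race,
 * in which case the RB_SCAN below is simply restarted from the top.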
2675 */ 2676 info.pmap = pmap; 2677 do { 2678 info.retry = 0; 2679 info.pvp = NULL; 2680 2681 spin_lock(&pmap->pm_spin); 2682 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, 2683 pmap_release_callback, &info); 2684 spin_unlock(&pmap->pm_spin); 2685 2686 if (info.pvp) 2687 pv_put(info.pvp); 2688 } while (info.retry); 2689 2690 2691 /* 2692 * One resident page (the pml4 page) should remain. Two if 2693 * the pmap has implemented an isolated userland PML4E table. 2694 * No wired pages should remain. 2695 */ 2696 int expected_res = 0; 2697 2698 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0) 2699 ++expected_res; 2700 if (pmap->pm_pmlpv_iso) 2701 ++expected_res; 2702 2703 #if 1 2704 if (pmap->pm_stats.resident_count != expected_res || 2705 pmap->pm_stats.wired_count != 0) { 2706 kprintf("fatal pmap problem - pmap %p flags %08x " 2707 "rescnt=%jd wirecnt=%jd\n", 2708 pmap, 2709 pmap->pm_flags, 2710 pmap->pm_stats.resident_count, 2711 pmap->pm_stats.wired_count); 2712 tsleep(pmap, 0, "DEAD", 0); 2713 } 2714 #else 2715 KKASSERT(pmap->pm_stats.resident_count == expected_res); 2716 KKASSERT(pmap->pm_stats.wired_count == 0); 2717 #endif 2718 } 2719 2720 /* 2721 * Called from low to high. We must cache the proper parent pv so we 2722 * can adjust its wired count. 2723 */ 2724 static int 2725 pmap_release_callback(pv_entry_t pv, void *data) 2726 { 2727 struct pmap_release_info *info = data; 2728 pmap_t pmap = info->pmap; 2729 vm_pindex_t pindex; 2730 int r; 2731 2732 /* 2733 * Acquire a held and locked pv, check for release race 2734 */ 2735 pindex = pv->pv_pindex; 2736 if (info->pvp == pv) { 2737 spin_unlock(&pmap->pm_spin); 2738 info->pvp = NULL; 2739 } else if (pv_hold_try(pv)) { 2740 spin_unlock(&pmap->pm_spin); 2741 } else { 2742 spin_unlock(&pmap->pm_spin); 2743 pv_lock(pv); 2744 pv_put(pv); 2745 info->retry = 1; 2746 spin_lock(&pmap->pm_spin); 2747 2748 return -1; 2749 } 2750 KKASSERT(pv->pv_pmap == pmap && pindex == pv->pv_pindex); 2751 2752 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2753 /* 2754 * I am PTE, parent is PT 2755 */ 2756 pindex = pv->pv_pindex >> NPTEPGSHIFT; 2757 pindex += NUPTE_TOTAL; 2758 } else if (pv->pv_pindex < pmap_pd_pindex(0)) { 2759 /* 2760 * I am PT, parent is PD 2761 */ 2762 pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT; 2763 pindex += NUPTE_TOTAL + NUPT_TOTAL; 2764 } else if (pv->pv_pindex < pmap_pdp_pindex(0)) { 2765 /* 2766 * I am PD, parent is PDP 2767 */ 2768 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >> 2769 NPDPEPGSHIFT; 2770 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2771 } else if (pv->pv_pindex < pmap_pml4_pindex()) { 2772 /* 2773 * I am PDP, parent is PML4. We always calculate the 2774 * normal PML4 here, not the isolated PML4. 2775 */ 2776 pindex = pmap_pml4_pindex(); 2777 } else { 2778 /* 2779 * parent is NULL 2780 */ 2781 if (info->pvp) { 2782 pv_put(info->pvp); 2783 info->pvp = NULL; 2784 } 2785 pindex = 0; 2786 } 2787 if (pindex) { 2788 if (info->pvp && info->pvp->pv_pindex != pindex) { 2789 pv_put(info->pvp); 2790 info->pvp = NULL; 2791 } 2792 if (info->pvp == NULL) 2793 info->pvp = pv_get(pmap, pindex, NULL); 2794 } else { 2795 if (info->pvp) { 2796 pv_put(info->pvp); 2797 info->pvp = NULL; 2798 } 2799 } 2800 r = pmap_release_pv(pv, info->pvp, NULL); 2801 spin_lock(&pmap->pm_spin); 2802 2803 return(r); 2804 } 2805 2806 /* 2807 * Called with held (i.e. also locked) pv. This function will dispose of 2808 * the lock along with the pv. 
2809 *
2810 * If the caller already holds the locked parent page table for pv it
2811 * must pass it as pvp, allowing us to avoid a deadlock, else it can
2812 * pass NULL for pvp.
2813 */
2814 static int
2815 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk)
2816 {
2817 vm_page_t p;
2818
2819 /*
2820 * The pmap is currently not spinlocked, pv is held+locked.
2821 * Remove the pv's page from its parent's page table. The
2822 * parent's page table page's wire_count will be decremented.
2823 *
2824 * This will clean out the pte at any level of the page table.
2825 * If smp != 0 all cpus are affected.
2826 *
2827 * Do not tear down recursively; it's faster to just let the
2828 * release run its course.
2829 */
2830 pmap_remove_pv_pte(pv, pvp, bulk, 0);
2831
2832 /*
2833 * Terminal pvs are unhooked from their vm_pages. Because
2834 * terminal pages aren't page table pages they aren't wired
2835 * by us, so we have to be sure not to unwire them either.
2836 */
2837 if (pv->pv_pindex < pmap_pt_pindex(0)) {
2838 pmap_remove_pv_page(pv);
2839 goto skip;
2840 }
2841
2842 /*
2843 * We leave the top-level page table page cached, wired, and
2844 * mapped in the pmap until the dtor function (pmap_puninit())
2845 * gets called.
2846 *
2847 * Since we are leaving the top-level pv intact we need
2848 * to break out of what would otherwise be an infinite loop.
2849 *
2850 * This covers both the normal and the isolated PML4 page.
2851 */
2852 if (pv->pv_pindex >= pmap_pml4_pindex()) {
2853 pv_put(pv);
2854 return(-1);
2855 }
2856
2857 /*
2858 * For page table pages (other than the top-level page),
2859 * remove and free the vm_page. The representative mapping
2860 * removed above by pmap_remove_pv_pte() did not undo the
2861 * last wire_count so we have to do that as well.
2862 */
2863 p = pmap_remove_pv_page(pv);
2864 vm_page_busy_wait(p, FALSE, "pmaprl");
2865 if (p->wire_count != 1) {
2866 const char *tstr;
2867
2868 if (pv->pv_pindex >= pmap_pdp_pindex(0))
2869 tstr = "PDP";
2870 else if (pv->pv_pindex >= pmap_pd_pindex(0))
2871 tstr = "PD";
2872 else if (pv->pv_pindex >= pmap_pt_pindex(0))
2873 tstr = "PT";
2874 else
2875 tstr = "PTE";
2876
2877 kprintf("p(%s) p->wire_count was %016lx %d\n",
2878 tstr, pv->pv_pindex, p->wire_count);
2879 }
2880 KKASSERT(p->wire_count == 1);
2881 KKASSERT(p->flags & PG_UNQUEUED);
2882
2883 vm_page_unwire(p, 0);
2884 KKASSERT(p->wire_count == 0);
2885
2886 vm_page_free(p);
2887 skip:
2888 pv_free(pv, pvp);
2889
2890 return 0;
2891 }
2892
2893 /*
2894 * This function will remove the pte associated with a pv from its parent.
2895 * Terminal pv's are supported. All cpus specified by (bulk) are properly
2896 * invalidated.
2897 *
2898 * The wire count will be dropped on the parent page table. The wire
2899 * count on the page being removed (pv->pv_m) from the parent page table
2900 * is NOT touched. Note that terminal pages will not have any additional
2901 * wire counts while page table pages will have at least one representing
2902 * the mapping, plus others representing sub-mappings.
2903 *
2904 * NOTE: Cannot be called on kernel page table pages, only KVM terminal
2905 * pages and user page table and terminal pages.
2906 *
2907 * NOTE: The pte being removed might be unmanaged, and the pv supplied might
2908 * be freshly allocated and not imply that the pte is managed. In this
2909 * case pv->pv_m should be NULL.
2910 *
2911 * The pv must be locked. The pvp, if supplied, must be locked. All
2912 * supplied pv's will remain locked on return.
2913 *
2914 * XXX must lock parent pv's if they exist to remove pte XXX
2915 */
2916 static
2917 void
2918 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
2919 int destroy)
2920 {
2921 vm_pindex_t ptepindex = pv->pv_pindex;
2922 pmap_t pmap = pv->pv_pmap;
2923 vm_page_t p;
2924 int gotpvp = 0;
2925
2926 KKASSERT(pmap);
2927
2928 if (ptepindex >= pmap_pml4_pindex()) {
2929 /*
2930 * We are the top level PML4E table, there is no parent.
2931 *
2932 * This is either the normal or isolated PML4E table.
2933 * Only the normal is used in regular operation, the isolated
2934 * is only passed in when breaking down the whole pmap.
2935 */
2936 p = pmap->pm_pmlpv->pv_m;
2937 KKASSERT(pv->pv_m == p); /* debugging */
2938 } else if (ptepindex >= pmap_pdp_pindex(0)) {
2939 /*
2940 * Remove a PDP page from the PML4E. This can only occur
2941 * with user page tables. We do not have to lock the
2942 * pml4 PV so just ignore pvp.
2943 */
2944 vm_pindex_t pml4_pindex;
2945 vm_pindex_t pdp_index;
2946 pml4_entry_t *pdp;
2947 pml4_entry_t *pdp_iso;
2948
2949 pdp_index = ptepindex - pmap_pdp_pindex(0);
2950 if (pvp == NULL) {
2951 pml4_pindex = pmap_pml4_pindex();
2952 pvp = pv_get(pv->pv_pmap, pml4_pindex, NULL);
2953 KKASSERT(pvp);
2954 gotpvp = 1;
2955 }
2956
2957 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
2958 KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0);
2959 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2960 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0);
2961
2962 /*
2963 * Also remove the PDP from the isolated PML4E if the
2964 * process uses one.
2965 */
2966 if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) {
2967 pdp_iso = &pmap->pm_pml4_iso[pdp_index &
2968 ((1ul << NPML4EPGSHIFT) - 1)];
2969 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp_iso, 0);
2970 }
2971 KKASSERT(pv->pv_m == p); /* debugging */
2972 } else if (ptepindex >= pmap_pd_pindex(0)) {
2973 /*
2974 * Remove a PD page from the PDP
2975 *
2976 * SIMPLE PMAP NOTE: Non-existent pvp's are ok in the case
2977 * of a simple pmap because it stops at
2978 * the PD page.
2979 */ 2980 vm_pindex_t pdp_pindex; 2981 vm_pindex_t pd_index; 2982 pdp_entry_t *pd; 2983 2984 pd_index = ptepindex - pmap_pd_pindex(0); 2985 2986 if (pvp == NULL) { 2987 pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 2988 (pd_index >> NPML4EPGSHIFT); 2989 pvp = pv_get(pv->pv_pmap, pdp_pindex, NULL); 2990 gotpvp = 1; 2991 } 2992 2993 if (pvp) { 2994 pd = pv_pte_lookup(pvp, pd_index & 2995 ((1ul << NPDPEPGSHIFT) - 1)); 2996 KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0); 2997 p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2998 pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0); 2999 } else { 3000 KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE); 3001 p = pv->pv_m; /* degenerate test later */ 3002 } 3003 KKASSERT(pv->pv_m == p); /* debugging */ 3004 } else if (ptepindex >= pmap_pt_pindex(0)) { 3005 /* 3006 * Remove a PT page from the PD 3007 */ 3008 vm_pindex_t pd_pindex; 3009 vm_pindex_t pt_index; 3010 pd_entry_t *pt; 3011 3012 pt_index = ptepindex - pmap_pt_pindex(0); 3013 3014 if (pvp == NULL) { 3015 pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + 3016 (pt_index >> NPDPEPGSHIFT); 3017 pvp = pv_get(pv->pv_pmap, pd_pindex, NULL); 3018 KKASSERT(pvp); 3019 gotpvp = 1; 3020 } 3021 3022 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); 3023 #if 0 3024 KASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0, 3025 ("*pt unexpectedly invalid %016jx " 3026 "gotpvp=%d ptepindex=%ld ptindex=%ld pv=%p pvp=%p", 3027 *pt, gotpvp, ptepindex, pt_index, pv, pvp)); 3028 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 3029 #else 3030 if ((*pt & pmap->pmap_bits[PG_V_IDX]) == 0) { 3031 kprintf("*pt unexpectedly invalid %016jx " 3032 "gotpvp=%d ptepindex=%ld ptindex=%ld " 3033 "pv=%p pvp=%p\n", 3034 *pt, gotpvp, ptepindex, pt_index, pv, pvp); 3035 tsleep(pt, 0, "DEAD", 0); 3036 p = pv->pv_m; 3037 } else { 3038 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 3039 } 3040 #endif 3041 pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0); 3042 KKASSERT(pv->pv_m == p); /* debugging */ 3043 } else { 3044 KKASSERT(0); 3045 } 3046 3047 /* 3048 * If requested, scrap the underlying pv->pv_m and the underlying 3049 * pv. If this is a page-table-page we must also free the page. 3050 * 3051 * pvp must be returned locked. 3052 */ 3053 if (destroy == 1) { 3054 /* 3055 * page table page (PT, PD, PDP, PML4), caller was responsible 3056 * for testing wired_count. 3057 */ 3058 KKASSERT(pv->pv_m->wire_count == 1); 3059 p = pmap_remove_pv_page(pv); 3060 pv_free(pv, pvp); 3061 pv = NULL; 3062 3063 vm_page_busy_wait(p, FALSE, "pgpun"); 3064 vm_page_unwire(p, 0); 3065 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 3066 vm_page_free(p); 3067 } else if (destroy == 2) { 3068 /* 3069 * Normal page, remove from pmap and leave the underlying 3070 * page untouched. 3071 */ 3072 pmap_remove_pv_page(pv); 3073 pv_free(pv, pvp); 3074 pv = NULL; /* safety */ 3075 } 3076 3077 /* 3078 * If we acquired pvp ourselves then we are responsible for 3079 * recursively deleting it. 3080 */ 3081 if (pvp && gotpvp) { 3082 /* 3083 * Recursively destroy higher-level page tables. 3084 * 3085 * This is optional. If we do not, they will still 3086 * be destroyed when the process exits. 3087 * 3088 * NOTE: Do not destroy pv_entry's with extra hold refs, 3089 * a caller may have unlocked it and intends to 3090 * continue to use it. 
3091 */ 3092 if (pmap_dynamic_delete && 3093 pvp->pv_m && 3094 pvp->pv_m->wire_count == 1 && 3095 (pvp->pv_hold & PV_HOLD_MASK) == 2 && 3096 pvp->pv_pindex < pmap_pml4_pindex()) { 3097 if (pmap != &kernel_pmap) { 3098 pmap_remove_pv_pte(pvp, NULL, bulk, 1); 3099 pvp = NULL; /* safety */ 3100 } else { 3101 kprintf("Attempt to remove kernel_pmap pindex " 3102 "%jd\n", pvp->pv_pindex); 3103 pv_put(pvp); 3104 } 3105 } else { 3106 pv_put(pvp); 3107 } 3108 } 3109 } 3110 3111 /* 3112 * Remove the vm_page association to a pv. The pv must be locked. 3113 */ 3114 static 3115 vm_page_t 3116 pmap_remove_pv_page(pv_entry_t pv) 3117 { 3118 vm_page_t m; 3119 3120 m = pv->pv_m; 3121 pv->pv_m = NULL; 3122 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3123 3124 return(m); 3125 } 3126 3127 /* 3128 * Grow the number of kernel page table entries, if needed. 3129 * 3130 * This routine is always called to validate any address space 3131 * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address 3132 * space below KERNBASE. 3133 * 3134 * kernel_map must be locked exclusively by the caller. 3135 */ 3136 void 3137 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 3138 { 3139 vm_paddr_t paddr; 3140 vm_offset_t ptppaddr; 3141 vm_page_t nkpg; 3142 pd_entry_t *pt, newpt; 3143 pdp_entry_t *pd, newpd; 3144 int update_kernel_vm_end; 3145 3146 /* 3147 * bootstrap kernel_vm_end on first real VM use 3148 */ 3149 if (kernel_vm_end == 0) { 3150 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 3151 3152 for (;;) { 3153 pt = pmap_pt(&kernel_pmap, kernel_vm_end); 3154 if (pt == NULL) 3155 break; 3156 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) == 0) 3157 break; 3158 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 3159 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); 3160 if (kernel_vm_end - 1 >= vm_map_max(&kernel_map)) { 3161 kernel_vm_end = vm_map_max(&kernel_map); 3162 break; 3163 } 3164 } 3165 } 3166 3167 /* 3168 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 3169 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 3170 * do not want to force-fill 128G worth of page tables. 
3171 */ 3172 if (kstart < KERNBASE) { 3173 if (kstart > kernel_vm_end) 3174 kstart = kernel_vm_end; 3175 KKASSERT(kend <= KERNBASE); 3176 update_kernel_vm_end = 1; 3177 } else { 3178 update_kernel_vm_end = 0; 3179 } 3180 3181 kstart = rounddown2(kstart, (vm_offset_t)(PAGE_SIZE * NPTEPG)); 3182 kend = roundup2(kend, (vm_offset_t)(PAGE_SIZE * NPTEPG)); 3183 3184 if (kend - 1 >= vm_map_max(&kernel_map)) 3185 kend = vm_map_max(&kernel_map); 3186 3187 while (kstart < kend) { 3188 pt = pmap_pt(&kernel_pmap, kstart); 3189 if (pt == NULL) { 3190 /* 3191 * We need a new PD entry 3192 */ 3193 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3194 VM_ALLOC_NORMAL | 3195 VM_ALLOC_SYSTEM | 3196 VM_ALLOC_INTERRUPT); 3197 if (nkpg == NULL) { 3198 panic("pmap_growkernel: no memory to grow " 3199 "kernel"); 3200 } 3201 paddr = VM_PAGE_TO_PHYS(nkpg); 3202 pmap_zero_page(paddr); 3203 pd = pmap_pd(&kernel_pmap, kstart); 3204 3205 newpd = (pdp_entry_t) 3206 (paddr | 3207 kernel_pmap.pmap_bits[PG_V_IDX] | 3208 kernel_pmap.pmap_bits[PG_RW_IDX] | 3209 kernel_pmap.pmap_bits[PG_A_IDX]); 3210 atomic_swap_long(pd, newpd); 3211 3212 #if 0 3213 kprintf("NEWPD pd=%p pde=%016jx phys=%016jx\n", 3214 pd, newpd, paddr); 3215 #endif 3216 3217 continue; /* try again */ 3218 } 3219 3220 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 3221 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3222 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); 3223 if (kstart - 1 >= vm_map_max(&kernel_map)) { 3224 kstart = vm_map_max(&kernel_map); 3225 break; 3226 } 3227 continue; 3228 } 3229 3230 /* 3231 * We need a new PT 3232 * 3233 * This index is bogus, but out of the way 3234 */ 3235 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3236 VM_ALLOC_NORMAL | 3237 VM_ALLOC_SYSTEM | 3238 VM_ALLOC_INTERRUPT); 3239 if (nkpg == NULL) 3240 panic("pmap_growkernel: no memory to grow kernel"); 3241 3242 vm_page_wire(nkpg); 3243 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 3244 pmap_zero_page(ptppaddr); 3245 newpt = (pd_entry_t)(ptppaddr | 3246 kernel_pmap.pmap_bits[PG_V_IDX] | 3247 kernel_pmap.pmap_bits[PG_RW_IDX] | 3248 kernel_pmap.pmap_bits[PG_A_IDX]); 3249 atomic_swap_long(pt, newpt); 3250 3251 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3252 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); 3253 3254 if (kstart - 1 >= vm_map_max(&kernel_map)) { 3255 kstart = vm_map_max(&kernel_map); 3256 break; 3257 } 3258 } 3259 3260 /* 3261 * Only update kernel_vm_end for areas below KERNBASE. 3262 */ 3263 if (update_kernel_vm_end && kernel_vm_end < kstart) 3264 kernel_vm_end = kstart; 3265 } 3266 3267 /* 3268 * Add a reference to the specified pmap. 3269 */ 3270 void 3271 pmap_reference(pmap_t pmap) 3272 { 3273 if (pmap != NULL) 3274 atomic_add_int(&pmap->pm_count, 1); 3275 } 3276 3277 void 3278 pmap_maybethreaded(pmap_t pmap) 3279 { 3280 atomic_set_int(&pmap->pm_flags, PMAP_MULTI); 3281 } 3282 3283 /* 3284 * Called while page is hard-busied to clear the PG_MAPPED and PG_WRITEABLE 3285 * flags if able. 3286 */ 3287 int 3288 pmap_mapped_sync(vm_page_t m) 3289 { 3290 if (m->md.pmap_count == 0) 3291 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3292 return (m->flags); 3293 } 3294 3295 /*************************************************** 3296 * page management routines. 3297 ***************************************************/ 3298 3299 /* 3300 * Hold a pv without locking it 3301 */ 3302 #if 0 3303 static void 3304 pv_hold(pv_entry_t pv) 3305 { 3306 atomic_add_int(&pv->pv_hold, 1); 3307 } 3308 #endif 3309 3310 /* 3311 * Hold a pv_entry, preventing its destruction. 
TRUE is returned if the pv 3312 * was successfully locked, FALSE if it wasn't. The caller must dispose of 3313 * the pv properly. 3314 * 3315 * Either the pmap->pm_spin or the related vm_page_spin (if traversing a 3316 * pv list via its page) must be held by the caller in order to stabilize 3317 * the pv. 3318 */ 3319 static int 3320 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) 3321 { 3322 u_int count; 3323 3324 /* 3325 * Critical path shortcut expects pv to already have one ref 3326 * (for the pv->pv_pmap). 3327 */ 3328 count = pv->pv_hold; 3329 cpu_ccfence(); 3330 for (;;) { 3331 if ((count & PV_HOLD_LOCKED) == 0) { 3332 if (atomic_fcmpset_int(&pv->pv_hold, &count, 3333 (count + 1) | PV_HOLD_LOCKED)) { 3334 #ifdef PMAP_DEBUG 3335 pv->pv_func = func; 3336 pv->pv_line = lineno; 3337 #endif 3338 return TRUE; 3339 } 3340 } else { 3341 if (atomic_fcmpset_int(&pv->pv_hold, &count, count + 1)) 3342 return FALSE; 3343 } 3344 /* retry */ 3345 } 3346 } 3347 3348 /* 3349 * Drop a previously held pv_entry which could not be locked, allowing its 3350 * destruction. 3351 * 3352 * Must not be called with a spinlock held as we might zfree() the pv if it 3353 * is no longer associated with a pmap and this was the last hold count. 3354 */ 3355 static void 3356 pv_drop(pv_entry_t pv) 3357 { 3358 u_int count; 3359 3360 for (;;) { 3361 count = pv->pv_hold; 3362 cpu_ccfence(); 3363 KKASSERT((count & PV_HOLD_MASK) > 0); 3364 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != 3365 (PV_HOLD_LOCKED | 1)); 3366 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { 3367 if ((count & PV_HOLD_MASK) == 1) { 3368 #ifdef PMAP_DEBUG2 3369 if (pmap_enter_debug > 0) { 3370 --pmap_enter_debug; 3371 kprintf("pv_drop: free pv %p\n", pv); 3372 } 3373 #endif 3374 KKASSERT(count == 1); 3375 KKASSERT(pv->pv_pmap == NULL); 3376 zfree(pvzone, pv); 3377 } 3378 return; 3379 } 3380 /* retry */ 3381 } 3382 } 3383 3384 /* 3385 * Find or allocate the requested PV entry, returning a locked, held pv. 3386 * 3387 * If (*isnew) is non-zero, the returned pv will have two hold counts, one 3388 * for the caller and one representing the pmap and vm_page association. 3389 * 3390 * If (*isnew) is zero, the returned pv will have only one hold count. 3391 * 3392 * Since both associations can only be adjusted while the pv is locked, 3393 * together they represent just one additional hold. 3394 */ 3395 static 3396 pv_entry_t 3397 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL) 3398 { 3399 struct mdglobaldata *md = mdcpu; 3400 pv_entry_t pv; 3401 pv_entry_t pnew; 3402 int pmap_excl = 0; 3403 3404 pnew = NULL; 3405 if (md->gd_newpv) { 3406 #if 1 3407 pnew = atomic_swap_ptr((void *)&md->gd_newpv, NULL); 3408 #else 3409 crit_enter(); 3410 pnew = md->gd_newpv; /* might race NULL */ 3411 md->gd_newpv = NULL; 3412 crit_exit(); 3413 #endif 3414 } 3415 if (pnew == NULL) 3416 pnew = zalloc(pvzone); 3417 3418 spin_lock_shared(&pmap->pm_spin); 3419 for (;;) { 3420 /* 3421 * Shortcut cache 3422 */ 3423 pv = pv_entry_lookup(pmap, pindex); 3424 if (pv == NULL) { 3425 vm_pindex_t *pmark; 3426 3427 /* 3428 * Requires exclusive pmap spinlock 3429 */ 3430 if (pmap_excl == 0) { 3431 pmap_excl = 1; 3432 if (!spin_lock_upgrade_try(&pmap->pm_spin)) { 3433 spin_unlock_shared(&pmap->pm_spin); 3434 spin_lock(&pmap->pm_spin); 3435 continue; 3436 } 3437 } 3438 3439 /* 3440 * We need to block if someone is holding our 3441 * placemarker. 
As long as we determine the 3442 * placemarker has not been aquired we do not 3443 * need to get it as acquision also requires 3444 * the pmap spin lock. 3445 * 3446 * However, we can race the wakeup. 3447 */ 3448 pmark = pmap_placemarker_hash(pmap, pindex); 3449 3450 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3451 tsleep_interlock(pmark, 0); 3452 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3453 if (((*pmark ^ pindex) & 3454 ~PM_PLACEMARK_WAKEUP) == 0) { 3455 spin_unlock(&pmap->pm_spin); 3456 tsleep(pmark, PINTERLOCKED, "pvplc", 0); 3457 spin_lock(&pmap->pm_spin); 3458 } 3459 continue; 3460 } 3461 3462 /* 3463 * Setup the new entry 3464 */ 3465 pnew->pv_pmap = pmap; 3466 pnew->pv_pindex = pindex; 3467 pnew->pv_hold = PV_HOLD_LOCKED | 2; 3468 pnew->pv_flags = 0; 3469 #ifdef PMAP_DEBUG 3470 pnew->pv_func = func; 3471 pnew->pv_line = lineno; 3472 if (pnew->pv_line_lastfree > 0) { 3473 pnew->pv_line_lastfree = 3474 -pnew->pv_line_lastfree; 3475 } 3476 #endif 3477 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); 3478 atomic_add_long(&pmap->pm_stats.resident_count, 1); 3479 spin_unlock(&pmap->pm_spin); 3480 *isnew = 1; 3481 3482 KASSERT(pv == NULL, ("pv insert failed %p->%p", pnew, pv)); 3483 return(pnew); 3484 } 3485 3486 /* 3487 * We already have an entry, cleanup the staged pnew if 3488 * we can get the lock, otherwise block and retry. 3489 */ 3490 if (__predict_true(_pv_hold_try(pv PMAP_DEBUG_COPY))) { 3491 if (pmap_excl) 3492 spin_unlock(&pmap->pm_spin); 3493 else 3494 spin_unlock_shared(&pmap->pm_spin); 3495 #if 1 3496 pnew = atomic_swap_ptr((void *)&md->gd_newpv, pnew); 3497 if (pnew) 3498 zfree(pvzone, pnew); 3499 #else 3500 crit_enter(); 3501 if (md->gd_newpv == NULL) 3502 md->gd_newpv = pnew; 3503 else 3504 zfree(pvzone, pnew); 3505 crit_exit(); 3506 #endif 3507 KKASSERT(pv->pv_pmap == pmap && 3508 pv->pv_pindex == pindex); 3509 *isnew = 0; 3510 return(pv); 3511 } 3512 if (pmap_excl) { 3513 spin_unlock(&pmap->pm_spin); 3514 _pv_lock(pv PMAP_DEBUG_COPY); 3515 pv_put(pv); 3516 spin_lock(&pmap->pm_spin); 3517 } else { 3518 spin_unlock_shared(&pmap->pm_spin); 3519 _pv_lock(pv PMAP_DEBUG_COPY); 3520 pv_put(pv); 3521 spin_lock_shared(&pmap->pm_spin); 3522 } 3523 } 3524 /* NOT REACHED */ 3525 } 3526 3527 /* 3528 * Find the requested PV entry, returning a locked+held pv or NULL 3529 */ 3530 static 3531 pv_entry_t 3532 _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp PMAP_DEBUG_DECL) 3533 { 3534 pv_entry_t pv; 3535 int pmap_excl = 0; 3536 3537 spin_lock_shared(&pmap->pm_spin); 3538 for (;;) { 3539 /* 3540 * Shortcut cache 3541 */ 3542 pv = pv_entry_lookup(pmap, pindex); 3543 if (pv == NULL) { 3544 /* 3545 * Block if there is ANY placemarker. If we are to 3546 * return it, we must also aquire the spot, so we 3547 * have to block even if the placemarker is held on 3548 * a different address. 3549 * 3550 * OPTIMIZATION: If pmarkp is passed as NULL the 3551 * caller is just probing (or looking for a real 3552 * pv_entry), and in this case we only need to check 3553 * to see if the placemarker matches pindex. 
3554 */ 3555 vm_pindex_t *pmark; 3556 3557 /* 3558 * Requires exclusive pmap spinlock 3559 */ 3560 if (pmap_excl == 0) { 3561 pmap_excl = 1; 3562 if (!spin_lock_upgrade_try(&pmap->pm_spin)) { 3563 spin_unlock_shared(&pmap->pm_spin); 3564 spin_lock(&pmap->pm_spin); 3565 continue; 3566 } 3567 } 3568 3569 pmark = pmap_placemarker_hash(pmap, pindex); 3570 3571 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3572 ((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3573 tsleep_interlock(pmark, 0); 3574 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3575 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3576 ((*pmark ^ pindex) & 3577 ~PM_PLACEMARK_WAKEUP) == 0) { 3578 spin_unlock(&pmap->pm_spin); 3579 tsleep(pmark, PINTERLOCKED, "pvpld", 0); 3580 spin_lock(&pmap->pm_spin); 3581 } 3582 continue; 3583 } 3584 if (pmarkp) { 3585 if (atomic_swap_long(pmark, pindex) != 3586 PM_NOPLACEMARK) { 3587 panic("_pv_get: pmark race"); 3588 } 3589 *pmarkp = pmark; 3590 } 3591 spin_unlock(&pmap->pm_spin); 3592 return NULL; 3593 } 3594 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 3595 if (pmap_excl) 3596 spin_unlock(&pmap->pm_spin); 3597 else 3598 spin_unlock_shared(&pmap->pm_spin); 3599 KKASSERT(pv->pv_pmap == pmap && 3600 pv->pv_pindex == pindex); 3601 return(pv); 3602 } 3603 if (pmap_excl) { 3604 spin_unlock(&pmap->pm_spin); 3605 _pv_lock(pv PMAP_DEBUG_COPY); 3606 pv_put(pv); 3607 spin_lock(&pmap->pm_spin); 3608 } else { 3609 spin_unlock_shared(&pmap->pm_spin); 3610 _pv_lock(pv PMAP_DEBUG_COPY); 3611 pv_put(pv); 3612 spin_lock_shared(&pmap->pm_spin); 3613 } 3614 } 3615 } 3616 3617 /* 3618 * Lookup, hold, and attempt to lock (pmap,pindex). 3619 * 3620 * If the entry does not exist NULL is returned and *errorp is set to 0 3621 * 3622 * If the entry exists and could be successfully locked it is returned and 3623 * errorp is set to 0. 3624 * 3625 * If the entry exists but could NOT be successfully locked it is returned 3626 * held and *errorp is set to 1. 3627 * 3628 * If the entry is placemarked by someone else NULL is returned and *errorp 3629 * is set to 1. 3630 */ 3631 static 3632 pv_entry_t 3633 pv_get_try(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp, int *errorp) 3634 { 3635 pv_entry_t pv; 3636 3637 spin_lock_shared(&pmap->pm_spin); 3638 3639 pv = pv_entry_lookup(pmap, pindex); 3640 if (pv == NULL) { 3641 vm_pindex_t *pmark; 3642 3643 pmark = pmap_placemarker_hash(pmap, pindex); 3644 3645 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3646 *errorp = 1; 3647 } else if (pmarkp && 3648 atomic_cmpset_long(pmark, PM_NOPLACEMARK, pindex)) { 3649 *errorp = 0; 3650 } else { 3651 /* 3652 * Can't set a placemark with a NULL pmarkp, or if 3653 * pmarkp is non-NULL but we failed to set our 3654 * placemark. 3655 */ 3656 *errorp = 1; 3657 } 3658 if (pmarkp) 3659 *pmarkp = pmark; 3660 spin_unlock_shared(&pmap->pm_spin); 3661 3662 return NULL; 3663 } 3664 3665 /* 3666 * XXX This has problems if the lock is shared, why? 
3667 */ 3668 if (pv_hold_try(pv)) { 3669 spin_unlock_shared(&pmap->pm_spin); 3670 *errorp = 0; 3671 KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex); 3672 return(pv); /* lock succeeded */ 3673 } 3674 spin_unlock_shared(&pmap->pm_spin); 3675 *errorp = 1; 3676 3677 return (pv); /* lock failed */ 3678 } 3679 3680 /* 3681 * Lock a held pv, keeping the hold count 3682 */ 3683 static 3684 void 3685 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) 3686 { 3687 u_int count; 3688 3689 for (;;) { 3690 count = pv->pv_hold; 3691 cpu_ccfence(); 3692 if ((count & PV_HOLD_LOCKED) == 0) { 3693 if (atomic_cmpset_int(&pv->pv_hold, count, 3694 count | PV_HOLD_LOCKED)) { 3695 #ifdef PMAP_DEBUG 3696 pv->pv_func = func; 3697 pv->pv_line = lineno; 3698 #endif 3699 return; 3700 } 3701 continue; 3702 } 3703 tsleep_interlock(pv, 0); 3704 if (atomic_cmpset_int(&pv->pv_hold, count, 3705 count | PV_HOLD_WAITING)) { 3706 #ifdef PMAP_DEBUG2 3707 if (pmap_enter_debug > 0) { 3708 --pmap_enter_debug; 3709 kprintf("pv waiting on %s:%d\n", 3710 pv->pv_func, pv->pv_line); 3711 } 3712 #endif 3713 tsleep(pv, PINTERLOCKED, "pvwait", hz); 3714 } 3715 /* retry */ 3716 } 3717 } 3718 3719 /* 3720 * Unlock a held and locked pv, keeping the hold count. 3721 */ 3722 static 3723 void 3724 pv_unlock(pv_entry_t pv) 3725 { 3726 u_int count; 3727 3728 for (;;) { 3729 count = pv->pv_hold; 3730 cpu_ccfence(); 3731 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >= 3732 (PV_HOLD_LOCKED | 1)); 3733 if (atomic_cmpset_int(&pv->pv_hold, count, 3734 count & 3735 ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { 3736 if (count & PV_HOLD_WAITING) 3737 wakeup(pv); 3738 break; 3739 } 3740 } 3741 } 3742 3743 /* 3744 * Unlock and drop a pv. If the pv is no longer associated with a pmap 3745 * and the hold count drops to zero we will free it. 3746 * 3747 * Caller should not hold any spin locks. We are protected from hold races 3748 * by virtue of holds only occuring only with a pmap_spin or vm_page_spin 3749 * lock held. A pv cannot be located otherwise. 3750 */ 3751 static 3752 void 3753 pv_put(pv_entry_t pv) 3754 { 3755 #ifdef PMAP_DEBUG2 3756 if (pmap_enter_debug > 0) { 3757 --pmap_enter_debug; 3758 kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold); 3759 } 3760 #endif 3761 3762 /* 3763 * Normal put-aways must have a pv_m associated with the pv, 3764 * but allow the case where the pv has been destructed due 3765 * to pmap_dynamic_delete. 3766 */ 3767 KKASSERT(pv->pv_pmap == NULL || pv->pv_m != NULL); 3768 3769 /* 3770 * Fast - shortcut most common condition 3771 */ 3772 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1)) 3773 return; 3774 3775 /* 3776 * Slow 3777 */ 3778 pv_unlock(pv); 3779 pv_drop(pv); 3780 } 3781 3782 /* 3783 * Remove the pmap association from a pv, require that pv_m already be removed, 3784 * then unlock and drop the pv. Any pte operations must have already been 3785 * completed. This call may result in a last-drop which will physically free 3786 * the pv. 3787 * 3788 * Removing the pmap association entails an additional drop. 3789 * 3790 * pv must be exclusively locked on call and will be disposed of on return. 
3791 */
3792 static
3793 void
3794 _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL)
3795 {
3796 pmap_t pmap;
3797
3798 #ifdef PMAP_DEBUG
3799 pv->pv_func_lastfree = func;
3800 pv->pv_line_lastfree = lineno;
3801 #endif
3802 KKASSERT(pv->pv_m == NULL);
3803 KKASSERT((pv->pv_hold & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >=
3804 (PV_HOLD_LOCKED|1));
3805 if ((pmap = pv->pv_pmap) != NULL) {
3806 spin_lock(&pmap->pm_spin);
3807 KKASSERT(pv->pv_pmap == pmap);
3808 if (pmap->pm_pvhint_pt == pv)
3809 pmap->pm_pvhint_pt = NULL;
3810 if (pmap->pm_pvhint_unused == pv)
3811 pmap->pm_pvhint_unused = NULL;
3812 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
3813 atomic_add_long(&pmap->pm_stats.resident_count, -1);
3814 pv->pv_pmap = NULL;
3815 pv->pv_pindex = 0;
3816 spin_unlock(&pmap->pm_spin);
3817
3818 /*
3819 * Try to shortcut three atomic ops, otherwise fall through
3820 * and do it normally. Drop two refs and the lock all in
3821 * one go.
3822 */
3823 if (pvp) {
3824 if (vm_page_unwire_quick(pvp->pv_m))
3825 panic("_pv_free: bad wirecount on pvp");
3826 }
3827 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) {
3828 #ifdef PMAP_DEBUG2
3829 if (pmap_enter_debug > 0) {
3830 --pmap_enter_debug;
3831 kprintf("pv_free: free pv %p\n", pv);
3832 }
3833 #endif
3834 zfree(pvzone, pv);
3835 return;
3836 }
3837 pv_drop(pv); /* ref for pv_pmap */
3838 }
3839 pv_unlock(pv);
3840 pv_drop(pv);
3841 }
3842
3843 /*
3844 * This routine is very drastic, but can save the system
3845 * in a pinch.
3846 */
3847 void
3848 pmap_collect(void)
3849 {
3850 int i;
3851 vm_page_t m;
3852 static int warningdone=0;
3853
3854 if (pmap_pagedaemon_waken == 0)
3855 return;
3856 pmap_pagedaemon_waken = 0;
3857 if (warningdone < 5) {
3858 kprintf("pmap_collect: pv_entries exhausted -- "
3859 "suggest increasing vm.pmap_pv_entries above %ld\n",
3860 vm_pmap_pv_entries);
3861 warningdone++;
3862 }
3863
3864 for (i = 0; i < vm_page_array_size; i++) {
3865 m = &vm_page_array[i];
3866 if (m->wire_count || m->hold_count)
3867 continue;
3868 if (vm_page_busy_try(m, TRUE) == 0) {
3869 if (m->wire_count == 0 && m->hold_count == 0) {
3870 pmap_remove_all(m);
3871 }
3872 vm_page_wakeup(m);
3873 }
3874 }
3875 }
3876
3877 /*
3878 * Scan the pmap for active page table entries and issue a callback.
3879 * The callback must dispose of pte_pv, whose PTE entry is at *ptep in
3880 * its parent page table.
3881 *
3882 * pte_pv will be NULL if the page or page table is unmanaged.
3883 * pt_pv will point to the page table page containing the pte for the page.
3884 *
3885 * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page),
3886 * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed
3887 * process pmap's PD and page to the callback function. This can be
3888 * confusing because the pt_pv is really a pd_pv, and the target page
3889 * table page is simply aliased by the pmap and not owned by it.
3890 *
3891 * It is assumed that the start and end are properly rounded to the page size.
3892 *
3893 * It is assumed that PD pages and above are managed and thus in the RB tree,
3894 * allowing us to use RB_SCAN from the PD pages down for ranged scans.
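 *
 * A minimal caller sketch, assuming a callback matching the function
 * pointer in struct pmap_scan_info below (my_callback is hypothetical):
 *
 *	struct pmap_scan_info info;
 *
 *	info.pmap = pmap;
 *	info.sva = sva;
 *	info.eva = eva;
 *	info.func = my_callback;
 *	info.arg = NULL;
 *	pmap_scan(&info, 1);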
3895 */ 3896 struct pmap_scan_info { 3897 struct pmap *pmap; 3898 vm_offset_t sva; 3899 vm_offset_t eva; 3900 vm_pindex_t sva_pd_pindex; 3901 vm_pindex_t eva_pd_pindex; 3902 void (*func)(pmap_t, struct pmap_scan_info *, 3903 vm_pindex_t *, pv_entry_t, vm_offset_t, 3904 pt_entry_t *, void *); 3905 void *arg; 3906 pmap_inval_bulk_t bulk_core; 3907 pmap_inval_bulk_t *bulk; 3908 int count; 3909 int stop; 3910 }; 3911 3912 static int pmap_scan_cmp(pv_entry_t pv, void *data); 3913 static int pmap_scan_callback(pv_entry_t pv, void *data); 3914 3915 static void 3916 pmap_scan(struct pmap_scan_info *info, int smp_inval) 3917 { 3918 struct pmap *pmap = info->pmap; 3919 pv_entry_t pt_pv; /* A page table PV */ 3920 pv_entry_t pte_pv; /* A page table entry PV */ 3921 vm_pindex_t *pte_placemark; 3922 vm_pindex_t *pt_placemark; 3923 pt_entry_t *ptep; 3924 pt_entry_t oldpte; 3925 struct pv_entry dummy_pv; 3926 3927 info->stop = 0; 3928 if (pmap == NULL) 3929 return; 3930 if (info->sva == info->eva) 3931 return; 3932 if (smp_inval) { 3933 info->bulk = &info->bulk_core; 3934 pmap_inval_bulk_init(&info->bulk_core, pmap); 3935 } else { 3936 info->bulk = NULL; 3937 } 3938 3939 /* 3940 * Hold the token for stability; if the pmap is empty we have nothing 3941 * to do. 3942 */ 3943 #if 0 3944 if (pmap->pm_stats.resident_count == 0) { 3945 return; 3946 } 3947 #endif 3948 3949 info->count = 0; 3950 3951 /* 3952 * Special handling for scanning one page, which is a very common 3953 * operation (it is?). 3954 * 3955 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 3956 */ 3957 if (info->sva + PAGE_SIZE == info->eva) { 3958 if (info->sva >= VM_MAX_USER_ADDRESS) { 3959 /* 3960 * Kernel mappings do not track wire counts on 3961 * page table pages and only maintain pd_pv and 3962 * pte_pv levels so pmap_scan() works. 3963 */ 3964 pt_pv = NULL; 3965 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 3966 &pte_placemark); 3967 KKASSERT(pte_pv == NULL); 3968 ptep = vtopte(info->sva); 3969 } else { 3970 /* 3971 * We hold pte_placemark across the operation for 3972 * unmanaged pages. 3973 * 3974 * WARNING! We must hold pt_placemark across the 3975 * *ptep test to prevent misintepreting 3976 * a non-zero *ptep as a shared page 3977 * table page. Hold it across the function 3978 * callback as well for SMP safety. 3979 */ 3980 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 3981 &pte_placemark); 3982 KKASSERT(pte_pv == NULL); 3983 pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva), 3984 &pt_placemark); 3985 if (pt_pv == NULL) { 3986 #if 0 3987 KKASSERT(0); 3988 pd_pv = pv_get(pmap, 3989 pmap_pd_pindex(info->sva), 3990 NULL); 3991 if (pd_pv) { 3992 ptep = pv_pte_lookup(pd_pv, 3993 pmap_pt_index(info->sva)); 3994 if (*ptep) { 3995 info->func(pmap, info, 3996 pt_placemark, pd_pv, 3997 info->sva, ptep, 3998 info->arg); 3999 } else { 4000 pv_placemarker_wakeup(pmap, 4001 pt_placemark); 4002 } 4003 pv_put(pd_pv); 4004 } else { 4005 pv_placemarker_wakeup(pmap, 4006 pt_placemark); 4007 } 4008 #else 4009 pv_placemarker_wakeup(pmap, pt_placemark); 4010 #endif 4011 pv_placemarker_wakeup(pmap, pte_placemark); 4012 goto fast_skip; 4013 } 4014 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); 4015 } 4016 4017 /* 4018 * NOTE: *ptep can't be ripped out from under us if we hold 4019 * pte_pv (or pte_placemark) locked, but bits can 4020 * change. 
4021 */ 4022 oldpte = *ptep; 4023 cpu_ccfence(); 4024 if (oldpte == 0) { 4025 KKASSERT(pte_pv == NULL); 4026 pv_placemarker_wakeup(pmap, pte_placemark); 4027 } else { 4028 KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]) == 4029 pmap->pmap_bits[PG_V_IDX], 4030 ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL", 4031 *ptep, oldpte, info->sva)); 4032 info->func(pmap, info, pte_placemark, pt_pv, 4033 info->sva, ptep, info->arg); 4034 } 4035 if (pt_pv) 4036 pv_put(pt_pv); 4037 fast_skip: 4038 pmap_inval_bulk_flush(info->bulk); 4039 return; 4040 } 4041 4042 /* 4043 * Nominal scan case, RB_SCAN() for PD pages and iterate from 4044 * there. 4045 * 4046 * WARNING! eva can overflow our standard ((N + mask) >> bits) 4047 * bounds, resulting in a pd_pindex of 0. To solve the 4048 * problem we use an inclusive range. 4049 */ 4050 info->sva_pd_pindex = pmap_pd_pindex(info->sva); 4051 info->eva_pd_pindex = pmap_pd_pindex(info->eva - PAGE_SIZE); 4052 4053 if (info->sva >= VM_MAX_USER_ADDRESS) { 4054 /* 4055 * The kernel does not currently maintain any pv_entry's for 4056 * higher-level page tables. 4057 */ 4058 bzero(&dummy_pv, sizeof(dummy_pv)); 4059 dummy_pv.pv_pindex = info->sva_pd_pindex; 4060 spin_lock(&pmap->pm_spin); 4061 while (dummy_pv.pv_pindex <= info->eva_pd_pindex) { 4062 pmap_scan_callback(&dummy_pv, info); 4063 ++dummy_pv.pv_pindex; 4064 if (dummy_pv.pv_pindex < info->sva_pd_pindex) /*wrap*/ 4065 break; 4066 } 4067 spin_unlock(&pmap->pm_spin); 4068 } else { 4069 /* 4070 * User page tables maintain local PML4, PDP, PD, and PT 4071 * pv_entry's. pv_entry's are not used for PTEs. 4072 */ 4073 spin_lock(&pmap->pm_spin); 4074 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, pmap_scan_cmp, 4075 pmap_scan_callback, info); 4076 spin_unlock(&pmap->pm_spin); 4077 } 4078 pmap_inval_bulk_flush(info->bulk); 4079 } 4080 4081 /* 4082 * WARNING! pmap->pm_spin held 4083 * 4084 * WARNING! eva can overflow our standard ((N + mask) >> bits) 4085 * bounds, resulting in a pd_pindex of 0. To solve the 4086 * problem we use an inclusive range. 4087 */ 4088 static int 4089 pmap_scan_cmp(pv_entry_t pv, void *data) 4090 { 4091 struct pmap_scan_info *info = data; 4092 if (pv->pv_pindex < info->sva_pd_pindex) 4093 return(-1); 4094 if (pv->pv_pindex > info->eva_pd_pindex) 4095 return(1); 4096 return(0); 4097 } 4098 4099 /* 4100 * pmap_scan() by PDs 4101 * 4102 * WARNING! pmap->pm_spin held 4103 */ 4104 static int 4105 pmap_scan_callback(pv_entry_t pv, void *data) 4106 { 4107 struct pmap_scan_info *info = data; 4108 struct pmap *pmap = info->pmap; 4109 pv_entry_t pd_pv; /* A page directory PV */ 4110 pv_entry_t pt_pv; /* A page table PV */ 4111 vm_pindex_t *pt_placemark; 4112 pt_entry_t *ptep; 4113 pt_entry_t oldpte; 4114 vm_offset_t sva; 4115 vm_offset_t eva; 4116 vm_offset_t va_next; 4117 vm_pindex_t pd_pindex; 4118 int error; 4119 4120 /* 4121 * Stop if requested 4122 */ 4123 if (info->stop) 4124 return -1; 4125 4126 /* 4127 * Pull the PD pindex from the pv before releasing the spinlock. 4128 * 4129 * WARNING: pv is faked for kernel pmap scans. 4130 */ 4131 pd_pindex = pv->pv_pindex; 4132 spin_unlock(&pmap->pm_spin); 4133 pv = NULL; /* invalid after spinlock unlocked */ 4134 4135 /* 4136 * Calculate the page range within the PD. SIMPLE pmaps are 4137 * direct-mapped for the entire 2^64 address space. Normal pmaps 4138 * reflect the user and kernel address space which requires 4139 * cannonicalization w/regards to converting pd_pindex's back 4140 * into addresses. 
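*
* Concretely (assuming the conventional 48-bit virtual address layout):
* the shift below rebuilds the low portion of the address from the
* pindex, and if any PML4_SIGNMASK bits come out set the address lies in
* the upper (kernel) half, so the same mask is OR'd back in to sign-extend
* the result into a canonical virtual address.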
4141 */ 4142 sva = (pd_pindex - pmap_pd_pindex(0)) << PDPSHIFT; 4143 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 && 4144 (sva & PML4_SIGNMASK)) { 4145 sva |= PML4_SIGNMASK; 4146 } 4147 eva = sva + NBPDP; /* can overflow */ 4148 if (sva < info->sva) 4149 sva = info->sva; 4150 if (eva < info->sva || eva > info->eva) 4151 eva = info->eva; 4152 4153 /* 4154 * NOTE: kernel mappings do not track page table pages, only 4155 * terminal pages. 4156 * 4157 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. 4158 * However, for the scan to be efficient we try to 4159 * cache items top-down. 4160 */ 4161 pd_pv = NULL; 4162 pt_pv = NULL; 4163 4164 for (; sva < eva; sva = va_next) { 4165 if (info->stop) 4166 break; 4167 if (sva >= VM_MAX_USER_ADDRESS) { 4168 if (pt_pv) { 4169 pv_put(pt_pv); 4170 pt_pv = NULL; 4171 } 4172 goto kernel_skip; 4173 } 4174 4175 /* 4176 * PD cache, scan shortcut if it doesn't exist. 4177 */ 4178 if (pd_pv == NULL) { 4179 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4180 } else if (pd_pv->pv_pmap != pmap || 4181 pd_pv->pv_pindex != pmap_pd_pindex(sva)) { 4182 pv_put(pd_pv); 4183 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4184 } 4185 if (pd_pv == NULL) { 4186 va_next = (sva + NBPDP) & ~PDPMASK; 4187 if (va_next < sva) 4188 va_next = eva; 4189 continue; 4190 } 4191 4192 /* 4193 * PT cache 4194 * 4195 * NOTE: The cached pt_pv can be removed from the pmap when 4196 * pmap_dynamic_delete is enabled. 4197 */ 4198 if (pt_pv && (pt_pv->pv_pmap != pmap || 4199 pt_pv->pv_pindex != pmap_pt_pindex(sva))) { 4200 pv_put(pt_pv); 4201 pt_pv = NULL; 4202 } 4203 if (pt_pv == NULL) { 4204 pt_pv = pv_get_try(pmap, pmap_pt_pindex(sva), 4205 &pt_placemark, &error); 4206 if (error) { 4207 pv_put(pd_pv); /* lock order */ 4208 pd_pv = NULL; 4209 if (pt_pv) { 4210 pv_lock(pt_pv); 4211 pv_put(pt_pv); 4212 pt_pv = NULL; 4213 } else { 4214 pv_placemarker_wait(pmap, pt_placemark); 4215 } 4216 va_next = sva; 4217 continue; 4218 } 4219 /* may have to re-check later if pt_pv is NULL here */ 4220 } 4221 4222 /* 4223 * If pt_pv is NULL we either have a shared page table 4224 * page (NOT IMPLEMENTED XXX) and must issue a callback 4225 * specific to that case, or there is no page table page. 4226 * 4227 * Either way we can skip the page table page. 4228 * 4229 * WARNING! pt_pv can also be NULL due to a pv creation 4230 * race where we find it to be NULL and then 4231 * later see a pte_pv. But its possible the pt_pv 4232 * got created inbetween the two operations, so 4233 * we must check. 4234 * 4235 * XXX This should no longer be the case because 4236 * we have pt_placemark. 4237 */ 4238 if (pt_pv == NULL) { 4239 #if 0 4240 /* XXX REMOVED */ 4241 /* 4242 * Possible unmanaged (shared from another pmap) 4243 * page table page. 4244 * 4245 * WARNING! We must hold pt_placemark across the 4246 * *ptep test to prevent misintepreting 4247 * a non-zero *ptep as a shared page 4248 * table page. Hold it across the function 4249 * callback as well for SMP safety. 4250 */ 4251 KKASSERT(0); 4252 ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); 4253 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 4254 info->func(pmap, info, pt_placemark, pd_pv, 4255 sva, ptep, info->arg); 4256 } else { 4257 pv_placemarker_wakeup(pmap, pt_placemark); 4258 } 4259 #else 4260 pv_placemarker_wakeup(pmap, pt_placemark); 4261 #endif 4262 4263 /* 4264 * Done, move to next page table page. 
4265 */ 4266 va_next = (sva + NBPDR) & ~PDRMASK; 4267 if (va_next < sva) 4268 va_next = eva; 4269 continue; 4270 } 4271 4272 /* 4273 * From this point in the loop testing pt_pv for non-NULL 4274 * means we are in UVM, else if it is NULL we are in KVM. 4275 * 4276 * Limit our scan to either the end of the va represented 4277 * by the current page table page, or to the end of the 4278 * range being removed. 4279 */ 4280 kernel_skip: 4281 va_next = (sva + NBPDR) & ~PDRMASK; 4282 if (va_next < sva) 4283 va_next = eva; 4284 if (va_next > eva) 4285 va_next = eva; 4286 4287 /* 4288 * Scan the page table for pages. Some pages may not be 4289 * managed (might not have a pv_entry). 4290 * 4291 * There is no page table management for kernel pages so 4292 * pt_pv will be NULL in that case, but otherwise pt_pv 4293 * is non-NULL, locked, and referenced. 4294 */ 4295 4296 /* 4297 * At this point a non-NULL pt_pv means a UVA, and a NULL 4298 * pt_pv means a KVA. 4299 */ 4300 if (pt_pv) 4301 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); 4302 else 4303 ptep = vtopte(sva); 4304 4305 while (sva < va_next) { 4306 vm_pindex_t *pte_placemark; 4307 pv_entry_t pte_pv; 4308 4309 /* 4310 * Yield every 64 pages, stop if requested. 4311 */ 4312 if ((++info->count & 63) == 0) 4313 lwkt_user_yield(); 4314 if (info->stop) 4315 break; 4316 4317 /* 4318 * We can shortcut our scan if *ptep == 0. This is 4319 * an unlocked check. 4320 */ 4321 if (*ptep == 0) { 4322 sva += PAGE_SIZE; 4323 ++ptep; 4324 continue; 4325 } 4326 cpu_ccfence(); 4327 4328 /* 4329 * Acquire the pte_placemark. pte_pv's won't exist 4330 * for leaf pages. 4331 * 4332 * A multitude of races are possible here so if we 4333 * cannot lock definite state we clean out our cache 4334 * and break the inner while() loop to force a loop 4335 * up to the top of the for(). 4336 * 4337 * XXX unlock/relock pd_pv, pt_pv, and re-test their 4338 * validity instead of looping up? 4339 */ 4340 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), 4341 &pte_placemark, &error); 4342 KKASSERT(pte_pv == NULL); 4343 if (error) { 4344 if (pd_pv) { 4345 pv_put(pd_pv); /* lock order */ 4346 pd_pv = NULL; 4347 } 4348 if (pt_pv) { 4349 pv_put(pt_pv); /* lock order */ 4350 pt_pv = NULL; 4351 } 4352 pv_placemarker_wait(pmap, pte_placemark); 4353 va_next = sva; /* retry */ 4354 break; 4355 } 4356 4357 /* 4358 * Reload *ptep after successfully locking the 4359 * pindex. 4360 */ 4361 cpu_ccfence(); 4362 oldpte = *ptep; 4363 if (oldpte == 0) { 4364 pv_placemarker_wakeup(pmap, pte_placemark); 4365 sva += PAGE_SIZE; 4366 ++ptep; 4367 continue; 4368 } 4369 4370 /* 4371 * We can't hold pd_pv across the callback (because 4372 * we don't pass it to the callback and the callback 4373 * might deadlock) 4374 */ 4375 if (pd_pv) { 4376 vm_page_wire_quick(pd_pv->pv_m); 4377 pv_unlock(pd_pv); 4378 } 4379 4380 /* 4381 * Ready for the callback. The locked placemarker 4382 * is consumed by the callback. 4383 */ 4384 if (oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4385 /* 4386 * Managed pte 4387 */ 4388 KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]), 4389 ("badC *ptep %016lx/%016lx sva %016lx", 4390 *ptep, oldpte, sva)); 4391 /* 4392 * We must unlock pd_pv across the callback 4393 * to avoid deadlocks on any recursive 4394 * disposal. Re-check that it still exists 4395 * after re-locking. 4396 * 4397 * Call target disposes of pte_placemark 4398 * and may destroy but will not dispose 4399 * of pt_pv. 
4400 */ 4401 info->func(pmap, info, pte_placemark, pt_pv, 4402 sva, ptep, info->arg); 4403 } else { 4404 /* 4405 * Unmanaged pte 4406 * 4407 * We must unlock pd_pv across the callback 4408 * to avoid deadlocks on any recursive 4409 * disposal. Re-check that it still exists 4410 * after re-locking. 4411 * 4412 * Call target disposes of pte_placemark 4413 * and may destroy but will not dispose 4414 * of pt_pv. 4415 */ 4416 KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]), 4417 ("badD *ptep %016lx/%016lx sva %016lx ", 4418 *ptep, oldpte, sva)); 4419 info->func(pmap, info, pte_placemark, pt_pv, 4420 sva, ptep, info->arg); 4421 } 4422 if (pd_pv) { 4423 pv_lock(pd_pv); 4424 if (vm_page_unwire_quick(pd_pv->pv_m)) { 4425 panic("pmap_scan_callback: " 4426 "bad wirecount on pd_pv"); 4427 } 4428 if (pd_pv->pv_pmap == NULL) { 4429 va_next = sva; /* retry */ 4430 break; 4431 } 4432 } 4433 4434 /* 4435 * NOTE: The cached pt_pv can be removed from the 4436 * pmap when pmap_dynamic_delete is enabled, 4437 * which will cause ptep to become stale. 4438 * 4439 * This also means that no pages remain under 4440 * the PT, so we can just break out of the inner 4441 * loop and let the outer loop clean everything 4442 * up. 4443 */ 4444 if (pt_pv && pt_pv->pv_pmap != pmap) 4445 break; 4446 sva += PAGE_SIZE; 4447 ++ptep; 4448 } 4449 } 4450 if (pd_pv) { 4451 pv_put(pd_pv); 4452 pd_pv = NULL; 4453 } 4454 if (pt_pv) { 4455 pv_put(pt_pv); 4456 pt_pv = NULL; 4457 } 4458 if ((++info->count & 7) == 0) 4459 lwkt_user_yield(); 4460 4461 /* 4462 * Relock before returning. 4463 */ 4464 spin_lock(&pmap->pm_spin); 4465 return (0); 4466 } 4467 4468 void 4469 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4470 { 4471 struct pmap_scan_info info; 4472 4473 info.pmap = pmap; 4474 info.sva = sva; 4475 info.eva = eva; 4476 info.func = pmap_remove_callback; 4477 info.arg = NULL; 4478 pmap_scan(&info, 1); 4479 #if 0 4480 cpu_invltlb(); 4481 if (eva - sva < 1024*1024) { 4482 while (sva < eva) { 4483 cpu_invlpg((void *)sva); 4484 sva += PAGE_SIZE; 4485 } 4486 } 4487 #endif 4488 } 4489 4490 static void 4491 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4492 { 4493 struct pmap_scan_info info; 4494 4495 info.pmap = pmap; 4496 info.sva = sva; 4497 info.eva = eva; 4498 info.func = pmap_remove_callback; 4499 info.arg = NULL; 4500 pmap_scan(&info, 0); 4501 } 4502 4503 static void 4504 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 4505 vm_pindex_t *pte_placemark, pv_entry_t pt_pv, 4506 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4507 { 4508 pt_entry_t pte; 4509 4510 /* 4511 * Managed or unmanaged pte (pte_placemark is non-NULL) 4512 * 4513 * pt_pv's wire_count is still bumped by unmanaged pages 4514 * so we must decrement it manually. 4515 * 4516 * We have to unwire the target page table page. 4517 */ 4518 pte = pmap_inval_bulk(info->bulk, va, ptep, 0); 4519 if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4520 vm_page_t p; 4521 4522 p = PHYS_TO_VM_PAGE(pte & PG_FRAME); 4523 KKASSERT(pte & pmap->pmap_bits[PG_V_IDX]); 4524 if (pte & pmap->pmap_bits[PG_M_IDX]) 4525 vm_page_dirty(p); 4526 if (pte & pmap->pmap_bits[PG_A_IDX]) 4527 vm_page_flag_set(p, PG_REFERENCED); 4528 4529 /* 4530 * NOTE: p is not hard-busied so it is not safe to 4531 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 4532 * transition against them being set in 4533 * pmap_enter(). 
4534 */ 4535 if (pte & pmap->pmap_bits[PG_RW_IDX]) 4536 atomic_add_long(&p->md.writeable_count, -1); 4537 pmap_page_stats_deleting( 4538 atomic_fetchadd_long(&p->md.pmap_count, -1)); 4539 } 4540 if (pte & pmap->pmap_bits[PG_V_IDX]) { 4541 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4542 if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m)) 4543 panic("pmap_remove: insufficient wirecount"); 4544 } 4545 if (pte & pmap->pmap_bits[PG_W_IDX]) 4546 atomic_add_long(&pmap->pm_stats.wired_count, -1); 4547 if (pte & pmap->pmap_bits[PG_G_IDX]) 4548 cpu_invlpg((void *)va); 4549 pv_placemarker_wakeup(pmap, pte_placemark); 4550 } 4551 4552 /* 4553 * Removes this physical page from all physical maps in which it resides. 4554 * Reflects back modify bits to the pager. 4555 * 4556 * This routine may not be called from an interrupt. 4557 * 4558 * The page must be busied by its caller, preventing new ptes from being 4559 * installed. This allows us to assert that pmap_count is zero and safely 4560 * clear the MAPPED and WRITEABLE bits upon completion. 4561 */ 4562 static 4563 void 4564 pmap_remove_all(vm_page_t m) 4565 { 4566 int retry; 4567 4568 if (!pmap_initialized) 4569 return; 4570 4571 /* 4572 * pmap_count doesn't cover fictitious pages, but PG_MAPPED does 4573 * (albeit without certain race protections). 4574 */ 4575 #if 0 4576 if (m->md.pmap_count == 0) 4577 return; 4578 #endif 4579 if ((m->flags & PG_MAPPED) == 0) 4580 return; 4581 4582 retry = ticks + hz * 60; 4583 again: 4584 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 4585 if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0)) 4586 PMAP_PAGE_BACKING_RETRY; 4587 if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) { 4588 if (ipte & ipmap->pmap_bits[PG_M_IDX]) 4589 vm_page_dirty(m); 4590 if (ipte & ipmap->pmap_bits[PG_A_IDX]) 4591 vm_page_flag_set(m, PG_REFERENCED); 4592 4593 /* 4594 * NOTE: m is not hard-busied so it is not safe to 4595 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 4596 * transition against them being set in 4597 * pmap_enter(). 4598 */ 4599 if (ipte & ipmap->pmap_bits[PG_RW_IDX]) 4600 atomic_add_long(&m->md.writeable_count, -1); 4601 pmap_page_stats_deleting( 4602 atomic_fetchadd_long(&m->md.pmap_count, -1)); 4603 } 4604 4605 /* 4606 * Cleanup various tracking counters. pt_pv can't go away 4607 * due to our wired ref. 4608 */ 4609 if (ipmap != &kernel_pmap) { 4610 pv_entry_t pt_pv; 4611 4612 spin_lock_shared(&ipmap->pm_spin); 4613 pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva)); 4614 spin_unlock_shared(&ipmap->pm_spin); 4615 4616 if (pt_pv) { 4617 if (vm_page_unwire_quick(pt_pv->pv_m)) { 4618 panic("pmap_remove_all: bad " 4619 "wire_count on pt_pv"); 4620 } 4621 atomic_add_long( 4622 &ipmap->pm_stats.resident_count, -1); 4623 } 4624 } 4625 if (ipte & ipmap->pmap_bits[PG_W_IDX]) 4626 atomic_add_long(&ipmap->pm_stats.wired_count, -1); 4627 if (ipte & ipmap->pmap_bits[PG_G_IDX]) 4628 cpu_invlpg((void *)iva); 4629 } PMAP_PAGE_BACKING_DONE; 4630 4631 /* 4632 * pmap_count should be zero but it is possible to race a pmap_enter() 4633 * replacement (see 'oldm'). Once it is zero it cannot become 4634 * non-zero because the page is hard-busied. 
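*
* The wait below is bounded: 'retry' was latched to ticks + hz * 60
* above, so the race is given roughly sixty seconds to resolve before
* the panic is taken.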
4635 */ 4636 if (m->md.pmap_count || m->md.writeable_count) { 4637 tsleep(&m->md.pmap_count, 0, "pgunm", 1); 4638 if (retry - ticks > 0) 4639 goto again; 4640 panic("pmap_remove_all: cannot return pmap_count " 4641 "to 0 (%p, %ld, %ld)", 4642 m, m->md.pmap_count, m->md.writeable_count); 4643 } 4644 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 4645 } 4646 4647 /* 4648 * Removes the page from a particular pmap. 4649 * 4650 * The page must be busied by the caller. 4651 */ 4652 void 4653 pmap_remove_specific(pmap_t pmap_match, vm_page_t m) 4654 { 4655 if (!pmap_initialized) 4656 return; 4657 4658 /* 4659 * PG_MAPPED test works for both non-fictitious and fictitious pages. 4660 */ 4661 if ((m->flags & PG_MAPPED) == 0) 4662 return; 4663 4664 PMAP_PAGE_BACKING_SCAN(m, pmap_match, ipmap, iptep, ipte, iva) { 4665 if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0)) 4666 PMAP_PAGE_BACKING_RETRY; 4667 if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) { 4668 if (ipte & ipmap->pmap_bits[PG_M_IDX]) 4669 vm_page_dirty(m); 4670 if (ipte & ipmap->pmap_bits[PG_A_IDX]) 4671 vm_page_flag_set(m, PG_REFERENCED); 4672 4673 /* 4674 * NOTE: m is not hard-busied so it is not safe to 4675 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 4676 * transition against them being set in 4677 * pmap_enter(). 4678 */ 4679 if (ipte & ipmap->pmap_bits[PG_RW_IDX]) 4680 atomic_add_long(&m->md.writeable_count, -1); 4681 pmap_page_stats_deleting( 4682 atomic_fetchadd_long(&m->md.pmap_count, -1)); 4683 } 4684 4685 /* 4686 * Cleanup various tracking counters. pt_pv can't go away 4687 * due to our wired ref. 4688 */ 4689 if (ipmap != &kernel_pmap) { 4690 pv_entry_t pt_pv; 4691 4692 spin_lock_shared(&ipmap->pm_spin); 4693 pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva)); 4694 spin_unlock_shared(&ipmap->pm_spin); 4695 4696 if (pt_pv) { 4697 atomic_add_long( 4698 &ipmap->pm_stats.resident_count, -1); 4699 if (vm_page_unwire_quick(pt_pv->pv_m)) { 4700 panic("pmap_remove_specific: bad " 4701 "wire_count on pt_pv"); 4702 } 4703 } 4704 } 4705 if (ipte & ipmap->pmap_bits[PG_W_IDX]) 4706 atomic_add_long(&ipmap->pm_stats.wired_count, -1); 4707 if (ipte & ipmap->pmap_bits[PG_G_IDX]) 4708 cpu_invlpg((void *)iva); 4709 } PMAP_PAGE_BACKING_DONE; 4710 } 4711 4712 /* 4713 * Set the physical protection on the specified range of this map 4714 * as requested. This function is typically only used for debug watchpoints 4715 * and COW pages. 4716 * 4717 * This function may not be called from an interrupt if the map is 4718 * not the kernel_pmap. 4719 * 4720 * NOTE! For shared page table pages we just unmap the page. 
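*
* Usage summary (this follows directly from the checks at the top of the
* function): a prot containing neither VM_PROT_READ nor VM_PROT_EXECUTE
* removes the mappings outright, a prot still containing VM_PROT_WRITE is
* a no-op, and anything else write-protects the range, e.g.
*
*	pmap_protect(pmap, sva, eva, VM_PROT_READ);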
4721 */ 4722 void 4723 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4724 { 4725 struct pmap_scan_info info; 4726 /* JG review for NX */ 4727 4728 if (pmap == NULL) 4729 return; 4730 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) { 4731 pmap_remove(pmap, sva, eva); 4732 return; 4733 } 4734 if (prot & VM_PROT_WRITE) 4735 return; 4736 info.pmap = pmap; 4737 info.sva = sva; 4738 info.eva = eva; 4739 info.func = pmap_protect_callback; 4740 info.arg = &prot; 4741 pmap_scan(&info, 1); 4742 } 4743 4744 static 4745 void 4746 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 4747 vm_pindex_t *pte_placemark, 4748 pv_entry_t pt_pv, vm_offset_t va, 4749 pt_entry_t *ptep, void *arg __unused) 4750 { 4751 pt_entry_t pbits; 4752 pt_entry_t cbits; 4753 vm_page_t m; 4754 4755 again: 4756 pbits = *ptep; 4757 cpu_ccfence(); 4758 cbits = pbits; 4759 if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) { 4760 cbits &= ~pmap->pmap_bits[PG_A_IDX]; 4761 cbits &= ~pmap->pmap_bits[PG_M_IDX]; 4762 } 4763 /* else unmanaged page, adjust bits, no wire changes */ 4764 4765 if (ptep) { 4766 cbits &= ~pmap->pmap_bits[PG_RW_IDX]; 4767 #ifdef PMAP_DEBUG2 4768 if (pmap_enter_debug > 0) { 4769 --pmap_enter_debug; 4770 kprintf("pmap_protect va=%lx ptep=%p " 4771 "pt_pv=%p cbits=%08lx\n", 4772 va, ptep, pt_pv, cbits 4773 ); 4774 } 4775 #endif 4776 if (pbits != cbits) { 4777 if (!pmap_inval_smp_cmpset(pmap, va, 4778 ptep, pbits, cbits)) { 4779 goto again; 4780 } 4781 } 4782 if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) { 4783 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4784 if (pbits & pmap->pmap_bits[PG_A_IDX]) 4785 vm_page_flag_set(m, PG_REFERENCED); 4786 if (pbits & pmap->pmap_bits[PG_M_IDX]) 4787 vm_page_dirty(m); 4788 if (pbits & pmap->pmap_bits[PG_RW_IDX]) 4789 atomic_add_long(&m->md.writeable_count, -1); 4790 4791 } 4792 } 4793 pv_placemarker_wakeup(pmap, pte_placemark); 4794 } 4795 4796 /* 4797 * Insert the vm_page (m) at the virtual address (va), replacing any prior 4798 * mapping at that address. Set protection and wiring as requested. 4799 * 4800 * If entry is non-NULL we check to see if the SEG_SIZE optimization is 4801 * possible. If it is we enter the page into the appropriate shared pmap 4802 * hanging off the related VM object instead of the passed pmap, then we 4803 * share the page table page from the VM object's pmap into the current pmap. 4804 * 4805 * NOTE: This routine MUST insert the page into the pmap now, it cannot 4806 * lazy-evaluate. 
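*
* Illustrative call only (not taken from a real caller): entering a busied
* page 'm' read/write and unwired into a previously empty slot, with no
* map-entry hint, would look like
*
*	pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE, FALSE, NULL);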
4807 */ 4808 void 4809 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4810 boolean_t wired, vm_map_entry_t entry) 4811 { 4812 pv_entry_t pt_pv; /* page table */ 4813 pv_entry_t pte_pv; /* page table entry */ 4814 vm_pindex_t *pte_placemark; 4815 pt_entry_t *ptep; 4816 pt_entry_t origpte; 4817 vm_paddr_t opa; 4818 vm_page_t oldm; 4819 pt_entry_t newpte; 4820 vm_paddr_t pa; 4821 4822 if (pmap == NULL) 4823 return; 4824 va = trunc_page(va); 4825 #ifdef PMAP_DIAGNOSTIC 4826 if (va >= KvaEnd) 4827 panic("pmap_enter: toobig"); 4828 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 4829 panic("pmap_enter: invalid to pmap_enter page table " 4830 "pages (va: 0x%lx)", va); 4831 #endif 4832 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 4833 kprintf("Warning: pmap_enter called on UVA with " 4834 "kernel_pmap\n"); 4835 #ifdef DDB 4836 db_print_backtrace(); 4837 #endif 4838 } 4839 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 4840 kprintf("Warning: pmap_enter called on KVA without" 4841 "kernel_pmap\n"); 4842 #ifdef DDB 4843 db_print_backtrace(); 4844 #endif 4845 } 4846 4847 /* 4848 * Get the locked page table page (pt_pv) for our new page table 4849 * entry, allocating it if necessary. 4850 * 4851 * There is no pte_pv for a terminal pte so the terminal pte will 4852 * be locked via pte_placemark. 4853 * 4854 * Only MMU actions by the CPU itself can modify the ptep out from 4855 * under us. 4856 * 4857 * If the pmap is still being initialized we assume existing 4858 * page tables. 4859 * 4860 * NOTE: Kernel mapppings do not track page table pages 4861 * (i.e. there is no pt_pv pt_pv structure). 4862 * 4863 * NOTE: origpte here is 'tentative', used only to check for 4864 * the degenerate case where the entry already exists and 4865 * matches. 4866 */ 4867 if (pmap_initialized == FALSE) { 4868 pte_pv = NULL; 4869 pt_pv = NULL; 4870 pte_placemark = NULL; 4871 ptep = vtopte(va); 4872 origpte = *ptep; 4873 } else { 4874 pte_pv = pv_get(pmap, pmap_pte_pindex(va), &pte_placemark); 4875 KKASSERT(pte_pv == NULL); 4876 if (va >= VM_MAX_USER_ADDRESS) { 4877 pt_pv = NULL; 4878 ptep = vtopte(va); 4879 } else { 4880 pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL); 4881 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4882 } 4883 origpte = *ptep; 4884 cpu_ccfence(); 4885 } 4886 4887 pa = VM_PAGE_TO_PHYS(m); 4888 4889 /* 4890 * Calculate the new PTE. 4891 */ 4892 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | 4893 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]); 4894 if (wired) 4895 newpte |= pmap->pmap_bits[PG_W_IDX]; 4896 if (va < VM_MAX_USER_ADDRESS) 4897 newpte |= pmap->pmap_bits[PG_U_IDX]; 4898 if ((m->flags & PG_FICTITIOUS) == 0) 4899 newpte |= pmap->pmap_bits[PG_MANAGED_IDX]; 4900 // if (pmap == &kernel_pmap) 4901 // newpte |= pgeflag; 4902 newpte |= pmap->pmap_cache_bits[m->pat_mode]; 4903 4904 /* 4905 * It is possible for multiple faults to occur in threaded 4906 * environments, the existing pte might be correct. 4907 */ 4908 if (((origpte ^ newpte) & 4909 ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] | 4910 pmap->pmap_bits[PG_A_IDX])) == 0) { 4911 goto done; 4912 } 4913 4914 /* 4915 * Adjust page flags. The page is soft-busied or hard-busied, we 4916 * should be able to safely set PG_* flag bits even with the (shared) 4917 * soft-busy. 4918 * 4919 * The pmap_count and writeable_count is only tracked for 4920 * non-fictitious pages. As a bit of a safety, bump pmap_count 4921 * and set the PG_* bits before mapping the page. 
If another part 4922 * of the system does not properly hard-busy the page (against our 4923 * soft-busy or hard-busy) in order to remove mappings it might not 4924 * see the pte that we are about to add and thus will not be able to 4925 * drop pmap_count to 0. 4926 * 4927 * The PG_MAPPED and PG_WRITEABLE flags are set for any type of page. 4928 * 4929 * NOTE! PG_MAPPED and PG_WRITEABLE can only be cleared when 4930 * the page is hard-busied AND pmap_count is 0. This 4931 * interlocks our setting of the flags here. 4932 */ 4933 /*vm_page_spin_lock(m);*/ 4934 if ((m->flags & PG_FICTITIOUS) == 0) { 4935 pmap_page_stats_adding( 4936 atomic_fetchadd_long(&m->md.pmap_count, 1)); 4937 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 4938 atomic_add_long(&m->md.writeable_count, 1); 4939 } 4940 if (newpte & pmap->pmap_bits[PG_RW_IDX]) { 4941 if ((m->flags & (PG_MAPPED | PG_WRITEABLE)) == 0) 4942 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 4943 } else { 4944 if ((m->flags & PG_MAPPED) == 0) 4945 vm_page_flag_set(m, PG_MAPPED); 4946 } 4947 /*vm_page_spin_unlock(m);*/ 4948 4949 /* 4950 * A race can develop when replacing an existing mapping. The new 4951 * page has been busied and the pte is placemark-locked, but the 4952 * old page is could be ripped out from under us at any time by 4953 * a backing scan. 4954 * 4955 * The race is handled by having the backing scans check pmap_count 4956 * writeable_count when doing operations that should ensure one 4957 * becomes 0. 4958 */ 4959 opa = origpte & PG_FRAME; 4960 if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) { 4961 oldm = PHYS_TO_VM_PAGE(opa); 4962 KKASSERT(opa == oldm->phys_addr); 4963 KKASSERT(entry != NULL); 4964 } else { 4965 oldm = NULL; 4966 } 4967 4968 /* 4969 * Swap the new and old PTEs and perform any necessary SMP 4970 * synchronization. 4971 */ 4972 if ((prot & VM_PROT_NOSYNC) || (opa == 0 && pt_pv != NULL)) { 4973 /* 4974 * Explicitly permitted to avoid pmap cpu mask synchronization 4975 * or the prior content of a non-kernel-related pmap was 4976 * invalid. 4977 */ 4978 origpte = atomic_swap_long(ptep, newpte); 4979 if (opa) 4980 cpu_invlpg((void *)va); 4981 } else { 4982 /* 4983 * Not permitted to avoid pmap cpu mask synchronization 4984 * or there prior content being replaced or this is a kernel 4985 * related pmap. 4986 * 4987 * Due to other kernel optimizations, we cannot assume a 4988 * 0->non_zero transition of *ptep can be done with a swap. 4989 */ 4990 origpte = pmap_inval_smp(pmap, va, 1, ptep, newpte); 4991 } 4992 opa = origpte & PG_FRAME; 4993 4994 #ifdef PMAP_DEBUG2 4995 if (pmap_enter_debug > 0) { 4996 --pmap_enter_debug; 4997 kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p" 4998 " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n", 4999 va, m, 5000 origpte, newpte, ptep, 5001 pte_pv, pt_pv, opa, prot); 5002 } 5003 #endif 5004 5005 /* 5006 * Account for the changes in the pt_pv and pmap. 5007 * 5008 * Retain the same wiring count due to replacing an existing page, 5009 * or bump the wiring count for a new page. 5010 */ 5011 if (pt_pv && opa == 0) { 5012 vm_page_wire_quick(pt_pv->pv_m); 5013 atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1); 5014 } 5015 if (wired && (origpte & pmap->pmap_bits[PG_W_IDX]) == 0) 5016 atomic_add_long(&pmap->pm_stats.wired_count, 1); 5017 5018 /* 5019 * Account for the removal of the old page. pmap and pt_pv stats 5020 * have already been fully adjusted for both. 5021 * 5022 * WARNING! oldm is not soft or hard-busied. 
The pte at worst can 5023 * only be removed out from under us since we hold the 5024 * placemarker. So if it is still there, it must not have 5025 * changed. 5026 */ 5027 if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) { 5028 KKASSERT(oldm == PHYS_TO_VM_PAGE(opa)); 5029 if (origpte & pmap->pmap_bits[PG_M_IDX]) 5030 vm_page_dirty(oldm); 5031 if (origpte & pmap->pmap_bits[PG_A_IDX]) 5032 vm_page_flag_set(oldm, PG_REFERENCED); 5033 5034 /* 5035 * NOTE: oldm is not hard-busied so it is not safe to 5036 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 5037 * transition against them being set in 5038 * pmap_enter(). 5039 */ 5040 if (origpte & pmap->pmap_bits[PG_RW_IDX]) 5041 atomic_add_long(&oldm->md.writeable_count, -1); 5042 pmap_page_stats_deleting( 5043 atomic_fetchadd_long(&oldm->md.pmap_count, -1)); 5044 } 5045 5046 done: 5047 KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 || 5048 (m->flags & PG_MAPPED)); 5049 5050 /* 5051 * Cleanup the pv entry, allowing other accessors. If the new page 5052 * is not managed but we have a pte_pv (which was locking our 5053 * operation), we can free it now. pte_pv->pv_m should be NULL. 5054 */ 5055 if (pte_placemark) 5056 pv_placemarker_wakeup(pmap, pte_placemark); 5057 if (pt_pv) 5058 pv_put(pt_pv); 5059 } 5060 5061 /* 5062 * Make a temporary mapping for a physical address. This is only intended 5063 * to be used for panic dumps. 5064 * 5065 * The caller is responsible for calling smp_invltlb(). 5066 */ 5067 void * 5068 pmap_kenter_temporary(vm_paddr_t pa, long i) 5069 { 5070 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 5071 return ((void *)crashdumpmap); 5072 } 5073 5074 #if 0 5075 #define MAX_INIT_PT (96) 5076 5077 /* 5078 * This routine preloads the ptes for a given object into the specified pmap. 5079 * This eliminates the blast of soft faults on process startup and 5080 * immediately after an mmap. 5081 */ 5082 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 5083 #endif 5084 5085 void 5086 pmap_object_init_pt(pmap_t pmap, vm_map_entry_t entry, 5087 vm_offset_t addr, vm_size_t size, int limit) 5088 { 5089 #if 0 5090 vm_prot_t prot = entry->protection; 5091 vm_object_t object = entry->ba.object; 5092 vm_pindex_t pindex = atop(entry->ba.offset + (addr - entry->ba.start)); 5093 struct rb_vm_page_scan_info info; 5094 struct lwp *lp; 5095 vm_size_t psize; 5096 5097 /* 5098 * We can't preinit if read access isn't set or there is no pmap 5099 * or object. 5100 */ 5101 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 5102 return; 5103 5104 /* 5105 * We can't preinit if the pmap is not the current pmap 5106 */ 5107 lp = curthread->td_lwp; 5108 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 5109 return; 5110 5111 /* 5112 * Misc additional checks 5113 */ 5114 psize = x86_64_btop(size); 5115 5116 if ((object->type != OBJT_VNODE) || 5117 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 5118 (object->resident_page_count > MAX_INIT_PT))) { 5119 return; 5120 } 5121 5122 if (pindex + psize > object->size) { 5123 if (object->size < pindex) 5124 return; 5125 psize = object->size - pindex; 5126 } 5127 5128 if (psize == 0) 5129 return; 5130 5131 /* 5132 * If everything is segment-aligned do not pre-init here. Instead 5133 * allow the normal vm_fault path to pass a segment hint to 5134 * pmap_enter() which will then use an object-referenced shared 5135 * page table page. 
5136 */ 5137 if ((addr & SEG_MASK) == 0 && 5138 (ctob(psize) & SEG_MASK) == 0 && 5139 (ctob(pindex) & SEG_MASK) == 0) { 5140 return; 5141 } 5142 5143 /* 5144 * Use a red-black scan to traverse the requested range and load 5145 * any valid pages found into the pmap. 5146 * 5147 * We cannot safely scan the object's memq without holding the 5148 * object token. 5149 */ 5150 info.start_pindex = pindex; 5151 info.end_pindex = pindex + psize - 1; 5152 info.limit = limit; 5153 info.mpte = NULL; 5154 info.addr = addr; 5155 info.pmap = pmap; 5156 info.object = object; 5157 info.entry = entry; 5158 5159 /* 5160 * By using the NOLK scan, the callback function must be sure 5161 * to return -1 if the VM page falls out of the object. 5162 */ 5163 vm_object_hold_shared(object); 5164 vm_page_rb_tree_RB_SCAN_NOLK(&object->rb_memq, rb_vm_page_scancmp, 5165 pmap_object_init_pt_callback, &info); 5166 vm_object_drop(object); 5167 #endif 5168 } 5169 5170 #if 0 5171 5172 static 5173 int 5174 pmap_object_init_pt_callback(vm_page_t p, void *data) 5175 { 5176 struct rb_vm_page_scan_info *info = data; 5177 vm_pindex_t rel_index; 5178 int hard_busy; 5179 5180 /* 5181 * don't allow an madvise to blow away our really 5182 * free pages allocating pv entries. 5183 */ 5184 if ((info->limit & MAP_PREFAULT_MADVISE) && 5185 vmstats.v_free_count < vmstats.v_free_reserved) { 5186 return(-1); 5187 } 5188 5189 /* 5190 * Ignore list markers and ignore pages we cannot instantly 5191 * busy (while holding the object token). 5192 */ 5193 if (p->flags & PG_MARKER) 5194 return 0; 5195 hard_busy = 0; 5196 again: 5197 if (hard_busy) { 5198 if (vm_page_busy_try(p, TRUE)) 5199 return 0; 5200 } else { 5201 if (vm_page_sbusy_try(p)) 5202 return 0; 5203 } 5204 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 5205 (p->flags & PG_FICTITIOUS) == 0) { 5206 if ((p->queue - p->pc) == PQ_CACHE) { 5207 if (hard_busy == 0) { 5208 vm_page_sbusy_drop(p); 5209 hard_busy = 1; 5210 goto again; 5211 } 5212 vm_page_deactivate(p); 5213 } 5214 rel_index = p->pindex - info->start_pindex; 5215 pmap_enter(info->pmap, info->addr + x86_64_ptob(rel_index), p, 5216 VM_PROT_READ, FALSE, info->entry); 5217 } 5218 if (hard_busy) 5219 vm_page_wakeup(p); 5220 else 5221 vm_page_sbusy_drop(p); 5222 5223 /* 5224 * We are using an unlocked scan (that is, the scan expects its 5225 * current element to remain in the tree on return). So we have 5226 * to check here and abort the scan if it isn't. 5227 */ 5228 if (p->object != info->object) 5229 return -1; 5230 lwkt_yield(); 5231 return(0); 5232 } 5233 5234 #endif 5235 5236 /* 5237 * Return TRUE if the pmap is in shape to trivially pre-fault the specified 5238 * address. 5239 * 5240 * Returns FALSE if it would be non-trivial or if a pte is already loaded 5241 * into the slot. 5242 * 5243 * The address must reside within a vm_map mapped range to ensure that the 5244 * page table doesn't get ripped out from under us. 5245 * 5246 * XXX This is safe only because page table pages are not freed. 5247 */ 5248 int 5249 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 5250 { 5251 pt_entry_t *pte; 5252 5253 /*spin_lock(&pmap->pm_spin);*/ 5254 if ((pte = pmap_pte(pmap, addr)) != NULL) { 5255 if (*pte & pmap->pmap_bits[PG_V_IDX]) { 5256 /*spin_unlock(&pmap->pm_spin);*/ 5257 return FALSE; 5258 } 5259 } 5260 /*spin_unlock(&pmap->pm_spin);*/ 5261 return TRUE; 5262 } 5263 5264 /* 5265 * Change the wiring attribute for a pmap/va pair. The mapping must already 5266 * exist in the pmap. The mapping may or may not be managed. 
The wiring in 5267 * the page is not changed, the page is returned so the caller can adjust 5268 * its wiring (the page is not locked in any way). 5269 * 5270 * Wiring is not a hardware characteristic so there is no need to invalidate 5271 * TLB. However, in an SMP environment we must use a locked bus cycle to 5272 * update the pte (if we are not using the pmap_inval_*() API that is)... 5273 * it's ok to do this for simple wiring changes. 5274 */ 5275 vm_page_t 5276 pmap_unwire(pmap_t pmap, vm_offset_t va) 5277 { 5278 pt_entry_t *ptep; 5279 pv_entry_t pt_pv; 5280 vm_paddr_t pa; 5281 vm_page_t m; 5282 5283 if (pmap == NULL) 5284 return NULL; 5285 5286 /* 5287 * Assume elements in the kernel pmap are stable 5288 */ 5289 if (pmap == &kernel_pmap) { 5290 if (pmap_pt(pmap, va) == 0) 5291 return NULL; 5292 ptep = pmap_pte_quick(pmap, va); 5293 if (pmap_pte_v(pmap, ptep)) { 5294 if (pmap_pte_w(pmap, ptep)) 5295 atomic_add_long(&pmap->pm_stats.wired_count,-1); 5296 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5297 pa = *ptep & PG_FRAME; 5298 m = PHYS_TO_VM_PAGE(pa); 5299 } else { 5300 m = NULL; 5301 } 5302 } else { 5303 /* 5304 * We can only [un]wire pmap-local pages (we cannot wire 5305 * shared pages) 5306 */ 5307 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 5308 if (pt_pv == NULL) 5309 return NULL; 5310 5311 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 5312 if ((*ptep & pmap->pmap_bits[PG_V_IDX]) == 0) { 5313 pv_put(pt_pv); 5314 return NULL; 5315 } 5316 5317 if (pmap_pte_w(pmap, ptep)) { 5318 atomic_add_long(&pt_pv->pv_pmap->pm_stats.wired_count, 5319 -1); 5320 } 5321 /* XXX else return NULL so caller doesn't unwire m ? */ 5322 5323 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5324 5325 pa = *ptep & PG_FRAME; 5326 m = PHYS_TO_VM_PAGE(pa); /* held by wired count */ 5327 pv_put(pt_pv); 5328 } 5329 return m; 5330 } 5331 5332 /* 5333 * Copy the range specified by src_addr/len from the source map to 5334 * the range dst_addr/len in the destination map. 5335 * 5336 * This routine is only advisory and need not do anything. 5337 */ 5338 void 5339 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 5340 vm_size_t len, vm_offset_t src_addr) 5341 { 5342 } 5343 5344 /* 5345 * pmap_zero_page: 5346 * 5347 * Zero the specified physical page. 5348 * 5349 * This function may be called from an interrupt and no locking is 5350 * required. 5351 */ 5352 void 5353 pmap_zero_page(vm_paddr_t phys) 5354 { 5355 vm_offset_t va = PHYS_TO_DMAP(phys); 5356 5357 pagezero((void *)va); 5358 } 5359 5360 /* 5361 * pmap_zero_page: 5362 * 5363 * Zero part of a physical page by mapping it into memory and clearing 5364 * its contents with bzero. 5365 * 5366 * off and size may not cover an area beyond a single hardware page. 5367 */ 5368 void 5369 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 5370 { 5371 vm_offset_t virt = PHYS_TO_DMAP(phys); 5372 5373 bzero((char *)virt + off, size); 5374 } 5375 5376 /* 5377 * pmap_copy_page: 5378 * 5379 * Copy the physical page from the source PA to the target PA. 5380 * This function may be called from an interrupt. No locking 5381 * is required. 5382 */ 5383 void 5384 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 5385 { 5386 vm_offset_t src_virt, dst_virt; 5387 5388 src_virt = PHYS_TO_DMAP(src); 5389 dst_virt = PHYS_TO_DMAP(dst); 5390 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 5391 } 5392 5393 /* 5394 * pmap_copy_page_frag: 5395 * 5396 * Copy the physical page from the source PA to the target PA. 
5397 * This function may be called from an interrupt. No locking 5398 * is required. 5399 */ 5400 void 5401 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 5402 { 5403 vm_offset_t src_virt, dst_virt; 5404 5405 src_virt = PHYS_TO_DMAP(src); 5406 dst_virt = PHYS_TO_DMAP(dst); 5407 5408 bcopy((char *)src_virt + (src & PAGE_MASK), 5409 (char *)dst_virt + (dst & PAGE_MASK), 5410 bytes); 5411 } 5412 5413 /* 5414 * Remove all pages from specified address space this aids process exit 5415 * speeds. Also, this code may be special cased for the current process 5416 * only. 5417 */ 5418 void 5419 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5420 { 5421 pmap_remove_noinval(pmap, sva, eva); 5422 cpu_invltlb(); 5423 } 5424 5425 /* 5426 * pmap_testbit tests bits in pte's note that the testbit/clearbit 5427 * routines are inline, and a lot of things compile-time evaluate. 5428 */ 5429 static 5430 boolean_t 5431 pmap_testbit(vm_page_t m, int bit) 5432 { 5433 int res = FALSE; 5434 5435 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5436 return FALSE; 5437 /* 5438 * Nothing to do if all the mappings are already read-only. 5439 * The page's [M]odify bits have already been synchronized 5440 * to the vm_page_t and cleaned out. 5441 */ 5442 if (bit == PG_M_IDX && m->md.writeable_count == 0) 5443 return FALSE; 5444 5445 /* 5446 * Iterate the mapping 5447 */ 5448 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5449 if (ipte & ipmap->pmap_bits[bit]) { 5450 res = TRUE; 5451 break; 5452 } 5453 } PMAP_PAGE_BACKING_DONE; 5454 return res; 5455 } 5456 5457 /* 5458 * This routine is used to modify bits in ptes. Only one bit should be 5459 * specified. PG_RW requires special handling. This call works with 5460 * any sort of mapped page. PG_FICTITIOUS pages might not be optimal. 5461 * 5462 * Caller must NOT hold any spin locks 5463 * Caller must hold (m) hard-busied 5464 * 5465 * NOTE: When clearing PG_M we could also (not implemented) drop 5466 * through to the PG_RW code and clear PG_RW too, forcing 5467 * a fault on write to redetect PG_M for virtual kernels, but 5468 * it isn't necessary since virtual kernels invalidate the 5469 * pte when they clear the VPTE_M bit in their virtual page 5470 * tables. 5471 * 5472 * NOTE: Does not re-dirty the page when clearing only PG_M. 5473 * 5474 * NOTE: Because we do not lock the pv, *pte can be in a state of 5475 * flux. Despite this the value of *pte is still somewhat 5476 * related while we hold the vm_page spin lock. 5477 * 5478 * *pte can be zero due to this race. Since we are clearing 5479 * bits we basically do no harm when this race occurs. 5480 */ 5481 static __inline 5482 void 5483 pmap_clearbit(vm_page_t m, int bit_index) 5484 { 5485 pt_entry_t npte; 5486 int retry; 5487 5488 /* 5489 * Too early in the boot 5490 */ 5491 if (!pmap_initialized) { 5492 if (bit_index == PG_RW_IDX) 5493 vm_page_flag_clear(m, PG_WRITEABLE); 5494 return; 5495 } 5496 5497 /* 5498 * Being asked to clear other random bits, we don't track them 5499 * so we have to iterate. 5500 */ 5501 if (bit_index != PG_RW_IDX) { 5502 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5503 if (ipte & ipmap->pmap_bits[bit_index]) { 5504 atomic_clear_long(iptep, 5505 ipmap->pmap_bits[bit_index]); 5506 } 5507 } PMAP_PAGE_BACKING_DONE; 5508 return; 5509 } 5510 5511 /* 5512 * Being asked to clear the RW bit. 
5513 * 5514 * Nothing to do if all the mappings are already read-only 5515 */ 5516 if (m->md.writeable_count == 0) 5517 return; 5518 5519 /* 5520 * Iterate the mappings and check. 5521 */ 5522 retry = ticks + hz * 60; 5523 again: 5524 /* 5525 * Clear PG_RW. This also clears PG_M and marks the page dirty if 5526 * PG_M was set. 5527 * 5528 * Since the caller holds the page hard-busied we can safely clear 5529 * PG_WRITEABLE, and callers expect us to for the PG_RW_IDX path. 5530 */ 5531 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5532 #if 0 5533 if ((ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) == 0) 5534 continue; 5535 #endif 5536 if ((ipte & ipmap->pmap_bits[PG_RW_IDX]) == 0) 5537 continue; 5538 npte = ipte & ~(ipmap->pmap_bits[PG_RW_IDX] | 5539 ipmap->pmap_bits[PG_M_IDX]); 5540 if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, npte)) 5541 PMAP_PAGE_BACKING_RETRY; 5542 if (ipte & ipmap->pmap_bits[PG_M_IDX]) 5543 vm_page_dirty(m); 5544 5545 /* 5546 * NOTE: m is not hard-busied so it is not safe to 5547 * clear PG_WRITEABLE on the 1->0 transition 5548 * against it being set in pmap_enter(). 5549 * 5550 * pmap_count and writeable_count are only applicable 5551 * to non-fictitious pages (PG_MANAGED_IDX from pte) 5552 */ 5553 if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) 5554 atomic_add_long(&m->md.writeable_count, -1); 5555 } PMAP_PAGE_BACKING_DONE; 5556 5557 /* 5558 * writeable_count should be zero but it is possible to race 5559 * a pmap_enter() replacement (see 'oldm'). Once it is zero 5560 * it cannot become non-zero because the page is hard-busied. 5561 */ 5562 if (m->md.writeable_count != 0) { 5563 tsleep(&m->md.writeable_count, 0, "pgwab", 1); 5564 if (retry - ticks > 0) 5565 goto again; 5566 panic("pmap_remove_all: cannot return writeable_count " 5567 "to 0 (%ld)", 5568 m->md.writeable_count); 5569 } 5570 vm_page_flag_clear(m, PG_WRITEABLE); 5571 } 5572 5573 /* 5574 * Lower the permission for all mappings to a given page. 5575 * 5576 * Page must be hard-busied by caller. Because the page is busied by the 5577 * caller, this should not be able to race a pmap_enter(). 5578 */ 5579 void 5580 pmap_page_protect(vm_page_t m, vm_prot_t prot) 5581 { 5582 /* JG NX support? */ 5583 if ((prot & VM_PROT_WRITE) == 0) { 5584 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 5585 /* 5586 * NOTE: pmap_clearbit(.. PG_RW) also clears 5587 * the PG_WRITEABLE flag in (m). 5588 */ 5589 pmap_clearbit(m, PG_RW_IDX); 5590 } else { 5591 pmap_remove_all(m); 5592 } 5593 } 5594 } 5595 5596 vm_paddr_t 5597 pmap_phys_address(vm_pindex_t ppn) 5598 { 5599 return (x86_64_ptob(ppn)); 5600 } 5601 5602 /* 5603 * Return a count of reference bits for a page, clearing those bits. 5604 * It is not necessary for every reference bit to be cleared, but it 5605 * is necessary that 0 only be returned when there are truly no 5606 * reference bits set. 5607 * 5608 * XXX: The exact number of bits to check and clear is a matter that 5609 * should be tested and standardized at some point in the future for 5610 * optimal aging of shared pages. 5611 * 5612 * This routine may not block. 
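*
* As implemented below, the scan stops once it has found and cleared the
* bit on five mappings, so the returned count saturates at five.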
5613 */ 5614 int 5615 pmap_ts_referenced(vm_page_t m) 5616 { 5617 int rval = 0; 5618 pt_entry_t npte; 5619 5620 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5621 return rval; 5622 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5623 if (ipte & ipmap->pmap_bits[PG_A_IDX]) { 5624 npte = ipte & ~ipmap->pmap_bits[PG_A_IDX]; 5625 if (!atomic_cmpset_long(iptep, ipte, npte)) 5626 PMAP_PAGE_BACKING_RETRY; 5627 ++rval; 5628 if (rval > 4) 5629 break; 5630 } 5631 } PMAP_PAGE_BACKING_DONE; 5632 return rval; 5633 } 5634 5635 /* 5636 * pmap_is_modified: 5637 * 5638 * Return whether or not the specified physical page was modified 5639 * in any physical maps. 5640 */ 5641 boolean_t 5642 pmap_is_modified(vm_page_t m) 5643 { 5644 boolean_t res; 5645 5646 res = pmap_testbit(m, PG_M_IDX); 5647 return (res); 5648 } 5649 5650 /* 5651 * Clear the modify bit on the vm_page. 5652 * 5653 * The page must be hard-busied. 5654 */ 5655 void 5656 pmap_clear_modify(vm_page_t m) 5657 { 5658 pmap_clearbit(m, PG_M_IDX); 5659 } 5660 5661 /* 5662 * pmap_clear_reference: 5663 * 5664 * Clear the reference bit on the specified physical page. 5665 */ 5666 void 5667 pmap_clear_reference(vm_page_t m) 5668 { 5669 pmap_clearbit(m, PG_A_IDX); 5670 } 5671 5672 /* 5673 * Miscellaneous support routines follow 5674 */ 5675 5676 static 5677 void 5678 x86_64_protection_init(void) 5679 { 5680 uint64_t *kp; 5681 int prot; 5682 5683 /* 5684 * NX supported? (boot time loader.conf override only) 5685 * 5686 * -1 Automatic (sets mode 1) 5687 * 0 Disabled 5688 * 1 NX implemented, differentiates PROT_READ vs PROT_READ|PROT_EXEC 5689 * 2 NX implemented for all cases 5690 */ 5691 TUNABLE_INT_FETCH("machdep.pmap_nx_enable", &pmap_nx_enable); 5692 if ((amd_feature & AMDID_NX) == 0) { 5693 pmap_bits_default[PG_NX_IDX] = 0; 5694 pmap_nx_enable = 0; 5695 } else if (pmap_nx_enable < 0) { 5696 pmap_nx_enable = 1; /* default to mode 1 (READ) */ 5697 } 5698 5699 /* 5700 * 0 is basically read-only access, but also set the NX (no-execute) 5701 * bit when VM_PROT_EXECUTE is not specified. 5702 */ 5703 kp = protection_codes; 5704 for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) { 5705 switch (prot) { 5706 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 5707 /* 5708 * This case handled elsewhere 5709 */ 5710 *kp = 0; 5711 break; 5712 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 5713 /* 5714 * Read-only is 0|NX (pmap_nx_enable mode >= 1) 5715 */ 5716 if (pmap_nx_enable >= 1) 5717 *kp = pmap_bits_default[PG_NX_IDX]; 5718 break; 5719 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 5720 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 5721 /* 5722 * Execute requires read access 5723 */ 5724 *kp = 0; 5725 break; 5726 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 5727 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 5728 /* 5729 * Write without execute is RW|NX 5730 * (pmap_nx_enable mode >= 2) 5731 */ 5732 *kp = pmap_bits_default[PG_RW_IDX]; 5733 if (pmap_nx_enable >= 2) 5734 *kp |= pmap_bits_default[PG_NX_IDX]; 5735 break; 5736 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 5737 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 5738 /* 5739 * Write with execute is RW 5740 */ 5741 *kp = pmap_bits_default[PG_RW_IDX]; 5742 break; 5743 } 5744 ++kp; 5745 } 5746 } 5747 5748 /* 5749 * Map a set of physical memory pages into the kernel virtual 5750 * address space. Return a pointer to where it is mapped. This 5751 * routine is intended to be used for mapping device memory, 5752 * NOT real memory. 
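*
* Hypothetical usage sketch (identifiers invented for illustration):
* mapping a device BAR uncached and tearing the mapping down again might
* look like
*
*	regs = pmap_mapdev_uncacheable(bar_pa, bar_size);
*	// ... access the device registers ...
*	pmap_unmapdev((vm_offset_t)regs, bar_size);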
5753 * 5754 * NOTE: We can't use pgeflag unless we invalidate the pages one at 5755 * a time. 5756 * 5757 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE} 5758 * work whether the cpu supports PAT or not. The remaining PAT 5759 * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu 5760 * supports PAT. 5761 */ 5762 void * 5763 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5764 { 5765 return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5766 } 5767 5768 void * 5769 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 5770 { 5771 return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5772 } 5773 5774 void * 5775 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5776 { 5777 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5778 } 5779 5780 /* 5781 * Map a set of physical memory pages into the kernel virtual 5782 * address space. Return a pointer to where it is mapped. This 5783 * routine is intended to be used for mapping device memory, 5784 * NOT real memory. 5785 */ 5786 void * 5787 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5788 { 5789 vm_offset_t va, tmpva, offset; 5790 pt_entry_t *pte; 5791 vm_size_t tmpsize; 5792 5793 offset = pa & PAGE_MASK; 5794 size = roundup(offset + size, PAGE_SIZE); 5795 5796 va = kmem_alloc_nofault(&kernel_map, size, VM_SUBSYS_MAPDEV, PAGE_SIZE); 5797 if (va == 0) 5798 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 5799 5800 pa = pa & ~PAGE_MASK; 5801 for (tmpva = va, tmpsize = size; tmpsize > 0;) { 5802 pte = vtopte(tmpva); 5803 *pte = pa | 5804 kernel_pmap.pmap_bits[PG_RW_IDX] | 5805 kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */ 5806 kernel_pmap.pmap_cache_bits[mode]; 5807 tmpsize -= PAGE_SIZE; 5808 tmpva += PAGE_SIZE; 5809 pa += PAGE_SIZE; 5810 } 5811 pmap_invalidate_range(&kernel_pmap, va, va + size); 5812 pmap_invalidate_cache_range(va, va + size); 5813 5814 return ((void *)(va + offset)); 5815 } 5816 5817 void 5818 pmap_unmapdev(vm_offset_t va, vm_size_t size) 5819 { 5820 vm_offset_t base, offset; 5821 5822 base = va & ~PAGE_MASK; 5823 offset = va & PAGE_MASK; 5824 size = roundup(offset + size, PAGE_SIZE); 5825 pmap_qremove(va, size >> PAGE_SHIFT); 5826 kmem_free(&kernel_map, base, size); 5827 } 5828 5829 /* 5830 * Sets the memory attribute for the specified page. 5831 */ 5832 void 5833 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5834 { 5835 5836 m->pat_mode = ma; 5837 5838 /* 5839 * If "m" is a normal page, update its direct mapping. This update 5840 * can be relied upon to perform any cache operations that are 5841 * required for data coherence. 5842 */ 5843 if ((m->flags & PG_FICTITIOUS) == 0) 5844 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode); 5845 } 5846 5847 /* 5848 * Change the PAT attribute on an existing kernel memory map. Caller 5849 * must ensure that the virtual memory in question is not accessed 5850 * during the adjustment. 5851 */ 5852 void 5853 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 5854 { 5855 pt_entry_t *pte; 5856 vm_offset_t base; 5857 int changed = 0; 5858 5859 if (va == 0) 5860 panic("pmap_change_attr: va is NULL"); 5861 base = trunc_page(va); 5862 5863 while (count) { 5864 pte = vtopte(va); 5865 *pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) | 5866 kernel_pmap.pmap_cache_bits[mode]; 5867 --count; 5868 va += PAGE_SIZE; 5869 } 5870 5871 changed = 1; /* XXX: not optimal */ 5872 5873 /* 5874 * Flush CPU caches if required to make sure any data isn't cached that 5875 * shouldn't be, etc. 
5876 */ 5877 if (changed) { 5878 pmap_invalidate_range(&kernel_pmap, base, va); 5879 pmap_invalidate_cache_range(base, va); 5880 } 5881 } 5882 5883 /* 5884 * perform the pmap work for mincore 5885 */ 5886 int 5887 pmap_mincore(pmap_t pmap, vm_offset_t addr) 5888 { 5889 pt_entry_t *ptep, pte; 5890 vm_page_t m; 5891 int val = 0; 5892 5893 ptep = pmap_pte(pmap, addr); 5894 5895 if (ptep && (pte = *ptep) != 0) { 5896 vm_offset_t pa; 5897 5898 val = MINCORE_INCORE; 5899 pa = pte & PG_FRAME; 5900 if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) 5901 m = PHYS_TO_VM_PAGE(pa); 5902 else 5903 m = NULL; 5904 5905 /* 5906 * Modified by us 5907 */ 5908 if (pte & pmap->pmap_bits[PG_M_IDX]) 5909 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 5910 5911 /* 5912 * Modified by someone 5913 */ 5914 else if (m && (m->dirty || pmap_is_modified(m))) 5915 val |= MINCORE_MODIFIED_OTHER; 5916 5917 /* 5918 * Referenced by us, or someone else. 5919 */ 5920 if (pte & pmap->pmap_bits[PG_A_IDX]) { 5921 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 5922 } else if (m && ((m->flags & PG_REFERENCED) || 5923 pmap_ts_referenced(m))) { 5924 val |= MINCORE_REFERENCED_OTHER; 5925 vm_page_flag_set(m, PG_REFERENCED); 5926 } 5927 } 5928 return val; 5929 } 5930 5931 /* 5932 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 5933 * vmspace will be ref'd and the old one will be deref'd. 5934 * 5935 * The vmspace for all lwps associated with the process will be adjusted 5936 * and cr3 will be reloaded if any lwp is the current lwp. 5937 * 5938 * The process must hold the vmspace->vm_map.token for oldvm and newvm 5939 */ 5940 void 5941 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 5942 { 5943 struct vmspace *oldvm; 5944 struct lwp *lp; 5945 5946 oldvm = p->p_vmspace; 5947 if (oldvm != newvm) { 5948 if (adjrefs) 5949 vmspace_ref(newvm); 5950 p->p_vmspace = newvm; 5951 KKASSERT(p->p_nthreads == 1); 5952 lp = RB_ROOT(&p->p_lwp_tree); 5953 pmap_setlwpvm(lp, newvm); 5954 if (adjrefs) 5955 vmspace_rel(oldvm); 5956 } 5957 } 5958 5959 /* 5960 * Set the vmspace for a LWP. The vmspace is almost universally set the 5961 * same as the process vmspace, but virtual kernels need to swap out contexts 5962 * on a per-lwp basis. 5963 * 5964 * Caller does not necessarily hold any vmspace tokens. Caller must control 5965 * the lwp (typically be in the context of the lwp). We use a critical 5966 * section to protect against statclock and hardclock (statistics collection). 
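*
* Note on the TYPE_IDX switch below: a REGULAR_PMAP supplies its own PML4
* (and, with meltdown mitigation, an isolated PML4) for %cr3, whereas an
* EPT_PMAP has no cr3-loadable page table of its own, so the kernel's
* KPML4phys is loaded instead, the guest mappings being handled by the
* EPT hardware. This reading is inferred from the code below.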
5967 */ 5968 void 5969 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 5970 { 5971 struct vmspace *oldvm; 5972 struct pmap *pmap; 5973 thread_t td; 5974 5975 oldvm = lp->lwp_vmspace; 5976 5977 if (oldvm != newvm) { 5978 crit_enter(); 5979 td = curthread; 5980 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 5981 lp->lwp_vmspace = newvm; 5982 if (td->td_lwp == lp) { 5983 pmap = vmspace_pmap(newvm); 5984 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 5985 if (pmap->pm_active_lock & CPULOCK_EXCL) 5986 pmap_interlock_wait(newvm); 5987 #if defined(SWTCH_OPTIM_STATS) 5988 tlb_flush_count++; 5989 #endif 5990 if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) { 5991 td->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 5992 if (meltdown_mitigation && pmap->pm_pmlpv_iso) { 5993 td->td_pcb->pcb_cr3_iso = 5994 vtophys(pmap->pm_pml4_iso); 5995 td->td_pcb->pcb_flags |= PCB_ISOMMU; 5996 } else { 5997 td->td_pcb->pcb_cr3_iso = 0; 5998 td->td_pcb->pcb_flags &= ~PCB_ISOMMU; 5999 } 6000 } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) { 6001 td->td_pcb->pcb_cr3 = KPML4phys; 6002 td->td_pcb->pcb_cr3_iso = 0; 6003 td->td_pcb->pcb_flags &= ~PCB_ISOMMU; 6004 } else { 6005 panic("pmap_setlwpvm: unknown pmap type\n"); 6006 } 6007 6008 /* 6009 * The MMU separation fields needs to be updated. 6010 * (it can't access the pcb directly from the 6011 * restricted user pmap). 6012 */ 6013 { 6014 struct trampframe *tramp; 6015 6016 tramp = &pscpu->trampoline; 6017 tramp->tr_pcb_cr3 = td->td_pcb->pcb_cr3; 6018 tramp->tr_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso; 6019 tramp->tr_pcb_flags = td->td_pcb->pcb_flags; 6020 tramp->tr_pcb_rsp = (register_t)td->td_pcb; 6021 /* tr_pcb_rsp doesn't change */ 6022 } 6023 6024 /* 6025 * In kernel-land we always use the normal PML4E 6026 * so the kernel is fully mapped and can also access 6027 * user memory. 6028 */ 6029 load_cr3(td->td_pcb->pcb_cr3); 6030 pmap = vmspace_pmap(oldvm); 6031 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 6032 mycpu->gd_cpuid); 6033 } 6034 crit_exit(); 6035 } 6036 } 6037 6038 /* 6039 * Called when switching to a locked pmap, used to interlock against pmaps 6040 * undergoing modifications to prevent us from activating the MMU for the 6041 * target pmap until all such modifications have completed. We have to do 6042 * this because the thread making the modifications has already set up its 6043 * SMP synchronization mask. 6044 * 6045 * This function cannot sleep! 6046 * 6047 * No requirements. 6048 */ 6049 void 6050 pmap_interlock_wait(struct vmspace *vm) 6051 { 6052 struct pmap *pmap = &vm->vm_pmap; 6053 6054 if (pmap->pm_active_lock & CPULOCK_EXCL) { 6055 crit_enter(); 6056 KKASSERT(curthread->td_critcount >= 2); 6057 DEBUG_PUSH_INFO("pmap_interlock_wait"); 6058 while (pmap->pm_active_lock & CPULOCK_EXCL) { 6059 cpu_ccfence(); 6060 lwkt_process_ipiq(); 6061 } 6062 DEBUG_POP_INFO(); 6063 crit_exit(); 6064 } 6065 } 6066 6067 vm_offset_t 6068 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 6069 { 6070 6071 if ((obj == NULL) || (size < NBPDR) || 6072 ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) { 6073 return addr; 6074 } 6075 6076 addr = roundup2(addr, NBPDR); 6077 return addr; 6078 } 6079 6080 /* 6081 * Used by kmalloc/kfree, page already exists at va 6082 */ 6083 vm_page_t 6084 pmap_kvtom(vm_offset_t va) 6085 { 6086 pt_entry_t *ptep = vtopte(va); 6087 6088 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); 6089 } 6090 6091 /* 6092 * Initialize machine-specific shared page directory support. 
This 6093 * is executed when a VM object is created. 6094 */ 6095 void 6096 pmap_object_init(vm_object_t object) 6097 { 6098 } 6099 6100 /* 6101 * Clean up machine-specific shared page directory support. This 6102 * is executed when a VM object is destroyed. 6103 */ 6104 void 6105 pmap_object_free(vm_object_t object) 6106 { 6107 } 6108 6109 /* 6110 * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related 6111 * VM page and issue a pginfo->callback. 6112 */ 6113 static 6114 void 6115 pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info, 6116 vm_pindex_t *pte_placemark, 6117 pv_entry_t pt_pv, vm_offset_t va, 6118 pt_entry_t *ptep, void *arg) 6119 { 6120 struct pmap_pgscan_info *pginfo = arg; 6121 vm_page_t m; 6122 pt_entry_t pte; 6123 6124 pte = *ptep; 6125 cpu_ccfence(); 6126 6127 if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) { 6128 /* 6129 * Try to busy the page while we hold the pte_placemark locked. 6130 */ 6131 m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME); 6132 if (vm_page_busy_try(m, TRUE) == 0) { 6133 if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) { 6134 /* 6135 * The callback is issued with the pt_pv 6136 * unlocked. 6137 */ 6138 pv_placemarker_wakeup(pmap, pte_placemark); 6139 if (pt_pv) { 6140 vm_page_wire_quick(pt_pv->pv_m); 6141 pv_unlock(pt_pv); 6142 } 6143 if (pginfo->callback(pginfo, va, m) < 0) 6144 info->stop = 1; 6145 if (pt_pv) { 6146 pv_lock(pt_pv); 6147 if (vm_page_unwire_quick(pt_pv->pv_m)) { 6148 panic("pmap_pgscan: bad wire_" 6149 "count on pt_pv"); 6150 } 6151 } 6152 } else { 6153 vm_page_wakeup(m); 6154 pv_placemarker_wakeup(pmap, pte_placemark); 6155 } 6156 } else { 6157 ++pginfo->busycount; 6158 pv_placemarker_wakeup(pmap, pte_placemark); 6159 } 6160 } else { 6161 /* 6162 * Shared page table or unmanaged page (sharept or !sharept); nothing to do here other than release the placemark. 6163 */ 6164 pv_placemarker_wakeup(pmap, pte_placemark); 6165 } 6166 } 6167 6168 void 6169 pmap_pgscan(struct pmap_pgscan_info *pginfo) 6170 { 6171 struct pmap_scan_info info; 6172 6173 pginfo->offset = pginfo->beg_addr; 6174 info.pmap = pginfo->pmap; 6175 info.sva = pginfo->beg_addr; 6176 info.eva = pginfo->end_addr; 6177 info.func = pmap_pgscan_callback; 6178 info.arg = pginfo; 6179 pmap_scan(&info, 0); 6180 if (info.stop == 0) 6181 pginfo->offset = pginfo->end_addr; 6182 } 6183 6184 /* 6185 * Wait for a placemarker that we do not own to clear. The placemarker 6186 * in question is not necessarily set to the pindex we want; we may have 6187 * to wait on the element because we want to reserve it ourselves. 6188 * 6189 * NOTE: PM_PLACEMARK_WAKEUP sets a bit which is already set in 6190 * PM_NOPLACEMARK, so it does not interfere with placemarks 6191 * which have already been woken up. 6192 * 6193 * NOTE: This routine is called without the pmap spin-lock and so can 6194 * race changes to *pmark. Due to the sensitivity of the routine 6195 * to possible MULTIPLE interactions from other cpus, and the 6196 * overloading of the WAKEUP bit on PM_NOPLACEMARK, we have to 6197 * use a cmpset loop to avoid a race that might cause the WAKEUP 6198 * bit to be lost. 6199 * 6200 * Caller is expected to retry its operation upon return.
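 *
 * Illustrative race (editor's note): with a plain read-modify-write this
 * cpu could load *pmark, the owner could then atomic_swap_long() the slot
 * back to PM_NOPLACEMARK (seeing no WAKEUP bit and therefore issuing no
 * wakeup()), and our stale store of (mark | PM_PLACEMARK_WAKEUP) would
 * re-mark the slot as busy while we sleep on it forever.  The fcmpset
 * only publishes the WAKEUP bit while *pmark still holds the value we
 * loaded; otherwise it reloads 'mark' and the loop re-tests against
 * PM_NOPLACEMARK.  The tsleep_interlock() issued before the fcmpset
 * ensures that a wakeup() arriving between the fcmpset and the tsleep()
 * is not lost either.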
6201 */ 6202 static 6203 void 6204 pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark) 6205 { 6206 vm_pindex_t mark; 6207 6208 mark = *pmark; 6209 cpu_ccfence(); 6210 while (mark != PM_NOPLACEMARK) { 6211 tsleep_interlock(pmark, 0); 6212 if (atomic_fcmpset_long(pmark, &mark, 6213 mark | PM_PLACEMARK_WAKEUP)) { 6214 tsleep(pmark, PINTERLOCKED, "pvplw", 0); 6215 break; 6216 } 6217 } 6218 } 6219 6220 /* 6221 * Wakeup a placemarker that we own. Replace the entry with 6222 * PM_NOPLACEMARK and issue a wakeup() if necessary. 6223 */ 6224 static 6225 void 6226 pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark) 6227 { 6228 vm_pindex_t pindex; 6229 6230 pindex = atomic_swap_long(pmark, PM_NOPLACEMARK); 6231 KKASSERT(pindex != PM_NOPLACEMARK); 6232 if (pindex & PM_PLACEMARK_WAKEUP) 6233 wakeup(pmark); 6234 } 6235
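/*
 * Editor's illustration, not part of the original file: a minimal sketch of
 * how the two placemarker primitives above are typically paired.  The helper
 * name pv_placemarker_example() and its arguments are hypothetical; the real
 * pmap code selects the placemark slot from the pmap and retries its entire
 * lookup after pv_placemarker_wait() returns, as noted above.
 */
#if 0	/* example sketch only, never compiled */
static void
pv_placemarker_example(pmap_t pmap, vm_pindex_t *pmark, vm_pindex_t pindex)
{
	/*
	 * Reserve the placemark for 'pindex'.  If the slot is currently
	 * owned, wait for the owner to release it and try again; the wait
	 * can return early, so the loop simply re-attempts the cmpset.
	 */
	while (atomic_cmpset_long(pmark, PM_NOPLACEMARK, pindex) == 0)
		pv_placemarker_wait(pmap, pmark);

	/* ... operate on the pte covered by 'pindex' here ... */

	/*
	 * Release the placemark.  pv_placemarker_wakeup() swaps the slot
	 * back to PM_NOPLACEMARK and calls wakeup() if a waiter set
	 * PM_PLACEMARK_WAKEUP in the meantime.
	 */
	pv_placemarker_wakeup(pmap, pmark);
}
#endif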