/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 * Copyright (c) 2018 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_pmap.h"
#include "opt_smp.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sf_buf.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#include <sys/vmem.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#ifdef DEV_APIC
#include <sys/bus.h>
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#endif
#include <x86/ifunc.h>
#include <machine/bootinfo.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/pmap_base.h>

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

/*
 * PTmap is recursive pagemap at top of virtual address space.
 * Within PTmap, the page directory can be found (third indirection).
 */
#define PTmap	((pt_entry_t *)(PTDPTDI << PDRSHIFT))
#define PTD	((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE)))
#define PTDpde	((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE) + \
    (PTDPTDI * PDESIZE)))

/*
 * Translate a virtual address to the kernel virtual address of its page table
 * entry (PTE).  This can be used recursively.  If the address of a PTE as
 * previously returned by this macro is itself given as the argument, then the
 * address of the page directory entry (PDE) that maps the PTE will be
 * returned.
 *
 * This macro may be used before pmap_bootstrap() is called.
 */
#define vtopte(va)	(PTmap + i386_btop(va))
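
/*
 * Illustrative sketch only (not used by the code in this file): the
 * recursive self-mapping makes the double application of vtopte()
 * described above possible.  "va" here is a hypothetical mapped kernel
 * virtual address.
 *
 *	pt_entry_t *pte = vtopte(va);		(PTE mapping "va")
 *	pd_entry_t *pde =
 *	    (pd_entry_t *)vtopte((vm_offset_t)pte);  (PDE mapping that PTE)
 */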

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define pdir_pde(m, v)	(m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)

#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
    atomic_clear_int((u_int *)(pte), PG_W))
#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))

static int pgeflag = 0;		/* PG_G or-in */
static int pseflag = 0;		/* PG_PS or-in */

static int nkpt = NKPT;

#ifdef PMAP_PAE_COMP
pt_entry_t pg_nx;
static uma_zone_t pdptzone;
#else
#define	pg_nx	0
#endif

_Static_assert(VM_MAXUSER_ADDRESS == VADDR(TRPTDI, 0), "VM_MAXUSER_ADDRESS");
_Static_assert(VM_MAX_KERNEL_ADDRESS <= VADDR(PTDPTDI, 0),
    "VM_MAX_KERNEL_ADDRESS");
_Static_assert(PMAP_MAP_LOW == VADDR(LOWPTDI, 0), "PMAP_MAP_LOW");
_Static_assert(KERNLOAD == (KERNPTDI << PDRSHIFT), "KERNLOAD");

extern int pat_works;
extern int pg_ps_enabled;

extern int elf32_nxstack;

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

/*
 * pmap_mapdev() support before pmap_init() has run (i.e., for the console)
 */
#define	PMAP_PREINIT_MAPPING_COUNT	8
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	sz;
	int		mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

static struct rwlock_padalign pvh_global_lock;

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
extern int pv_entry_max, pv_entry_count;
static int pv_entry_high_water = 0;
static struct md_page *pv_table;
extern int shpgperproc;

static struct pv_chunk *pv_chunkbase;	/* KVA block for pv_chunks */
static int pv_maxchunks;		/* How many chunks we have KVA for */
static vm_offset_t pv_vafree;		/* freelist stored in the PTE */

/*
 * All those kernel PT submaps that BSD is so fond of
 */
static pt_entry_t *CMAP3;
static pd_entry_t *KPTD;
static caddr_t CADDR3;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static pt_entry_t *PMAP1 = NULL, *PMAP2, *PMAP3;
static pt_entry_t *PADDR1 = NULL, *PADDR2, *PADDR3;
#ifdef SMP
static int PMAP1cpu, PMAP3cpu;
extern int PMAP1changedcpu;
#endif
extern int PMAP1changed;
extern int PMAP1unchanged;
static struct mtx PMAP2mutex;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
		    u_int flags);
#if VM_NRESERVLEVEL > 0
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
#endif
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);

static void	pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static bool	pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    vm_prot_t prot);
static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
		    u_int flags, vm_page_t m);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static int	pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
static void	pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
		    pd_entry_t pde);
static void	pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
static void	pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void	pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
static void	pmap_pde_attr(pd_entry_t *pde, int cache_bits);
#if VM_NRESERVLEVEL > 0
static void	pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
#endif
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
		    vm_prot_t prot);
static void	pmap_pte_attr(pt_entry_t *pte, int cache_bits);
static void	pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
		    struct spglist *free);
static int	pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
		    struct spglist *free);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void	pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free);
static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
		    struct spglist *free);
static void	pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va);
static void	pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
		    vm_page_t m);
static void	pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
		    pd_entry_t newpde);
static void	pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);

static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
#ifdef PMAP_PAE_COMP
static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain,
    uint8_t *flags, int wait);
#endif
static void pmap_init_trm(void);
static void pmap_invalidate_all_int(pmap_t pmap);

static __inline void pagezero(void *page);

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

extern char _end[];
extern u_long physfree;	/* phys addr of next free page */
extern u_long vm86phystk;/* PA of vm86/bios stack */
extern u_long vm86paddr;/* address of vm86 region */
extern int vm86pa;	/* phys addr of vm86 region */
extern u_long KERNend;	/* phys addr end of kernel (just after bss) */
#ifdef PMAP_PAE_COMP
pd_entry_t *IdlePTD_pae;	/* phys addr of kernel PTD */
pdpt_entry_t *IdlePDPT;	/* phys addr of kernel PDPT */
pt_entry_t *KPTmap_pae;	/* address of kernel page tables */
#define	IdlePTD	IdlePTD_pae
#define	KPTmap	KPTmap_pae
#else
pd_entry_t *IdlePTD_nopae;
pt_entry_t *KPTmap_nopae;
#define	IdlePTD	IdlePTD_nopae
#define	KPTmap	KPTmap_nopae
#endif
extern u_long KPTphys;	/* phys addr of kernel page tables */
extern u_long tramp_idleptd;

static u_long
allocpages(u_int cnt, u_long *physfree)
{
	u_long res;

	res = *physfree;
	*physfree += PAGE_SIZE * cnt;
	bzero((void *)res, PAGE_SIZE * cnt);
	return (res);
}

static void
pmap_cold_map(u_long pa, u_long va, u_long cnt)
{
	pt_entry_t *pt;

	for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0;
	    cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE)
		*pt = pa | PG_V | PG_RW | PG_A | PG_M;
}

static void
pmap_cold_mapident(u_long pa, u_long cnt)
{

	pmap_cold_map(pa, pa, cnt);
}

_Static_assert(LOWPTDI * 2 * NBPDR == KERNBASE,
    "Broken double-map of zero PTD");

static void
__CONCAT(PMTYPE, remap_lower)(bool enable)
{
	int i;

	for (i = 0; i < LOWPTDI; i++)
		IdlePTD[i] = enable ? IdlePTD[LOWPTDI + i] : 0;
	load_cr3(rcr3());		/* invalidate TLB */
}

/*
 * Called from locore.s before paging is enabled.  Sets up the first
 * kernel page table.  Since kernel is mapped with PA == VA, this code
 * does not require relocations.
 */
void
__CONCAT(PMTYPE, cold)(void)
{
	pt_entry_t *pt;
	u_long a;
	u_int cr3, ncr4;

	physfree = (u_long)&_end;
	if (bootinfo.bi_esymtab != 0)
		physfree = bootinfo.bi_esymtab;
	if (bootinfo.bi_kernend != 0)
		physfree = bootinfo.bi_kernend;
	physfree = roundup2(physfree, NBPDR);
	KERNend = physfree;

	/* Allocate Kernel Page Tables */
	KPTphys = allocpages(NKPT, &physfree);
	KPTmap = (pt_entry_t *)KPTphys;

	/* Allocate Page Table Directory */
#ifdef PMAP_PAE_COMP
	/* XXX only need 32 bytes (easier for now) */
	IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree);
#endif
	IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree);

	/*
	 * Allocate KSTACK.  Leave a guard page between IdlePTD and
	 * proc0kstack, to control stack overflow for thread0 and
	 * prevent corruption of the page table.  We leak the guard
	 * physical memory due to 1:1 mappings.
	 */
	allocpages(1, &physfree);
	proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree);

	/* vm86/bios stack */
	vm86phystk = allocpages(1, &physfree);

	/* pgtable + ext + IOPAGES */
	vm86paddr = vm86pa = allocpages(3, &physfree);

	/* Install page tables into PTD.  Page table page 1 is wasted. */
	for (a = 0; a < NKPT; a++)
		IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M;

#ifdef PMAP_PAE_COMP
	/* PAE install PTD pointers into PDPT */
	for (a = 0; a < NPGPTD; a++)
		IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V;
#endif

	/*
	 * Install recursive mapping for kernel page tables into
	 * itself.
	 */
	for (a = 0; a < NPGPTD; a++)
		IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V |
		    PG_RW;

	/*
	 * Initialize page table pages mapping physical address zero
	 * through the (physical) end of the kernel.  Many of these
	 * pages must be reserved, and we reserve them all and map
	 * them linearly for convenience.  We do this even if we've
	 * enabled PSE above; we'll just switch the corresponding
	 * kernel PDEs before we turn on paging.
	 *
	 * This and all other page table entries allow read and write
	 * access for various reasons.  Kernel mappings never have any
	 * access restrictions.
	 */
	pmap_cold_mapident(0, atop(NBPDR) * LOWPTDI);
	pmap_cold_map(0, NBPDR * LOWPTDI, atop(NBPDR) * LOWPTDI);
	pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE));

	/* Map page table directory */
#ifdef PMAP_PAE_COMP
	pmap_cold_mapident((u_long)IdlePDPT, 1);
#endif
	pmap_cold_mapident((u_long)IdlePTD, NPGPTD);

	/* Map early KPTmap.  It is really pmap_cold_mapident. */
	pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT);

	/* Map proc0kstack */
	pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES);
	/* ISA hole already mapped */

	pmap_cold_mapident(vm86phystk, 1);
	pmap_cold_mapident(vm86pa, 3);

	/* Map page 0 into the vm86 page table */
	*(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V;

	/* ...likewise for the ISA hole for vm86 */
	for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0;
	    a < atop(ISA_HOLE_LENGTH); a++, pt++)
		*pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A |
		    PG_M | PG_V;

	/* Enable PSE, PGE, VME, and PAE if configured. */
	ncr4 = 0;
	if ((cpu_feature & CPUID_PSE) != 0) {
		ncr4 |= CR4_PSE;
		pseflag = PG_PS;
		/*
		 * Superpage mapping of the kernel text.  Existing 4k
		 * page table pages are wasted.
		 */
		for (a = KERNBASE; a < KERNend; a += NBPDR)
			IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M |
			    PG_RW | PG_V;
	}
	if ((cpu_feature & CPUID_PGE) != 0) {
		ncr4 |= CR4_PGE;
		pgeflag = PG_G;
	}
	ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0;
#ifdef PMAP_PAE_COMP
	ncr4 |= CR4_PAE;
#endif
	if (ncr4 != 0)
		load_cr4(rcr4() | ncr4);

	/* Now enable paging */
#ifdef PMAP_PAE_COMP
	cr3 = (u_int)IdlePDPT;
	if ((cpu_feature & CPUID_PAT) == 0)
		wbinvd();
#else
	cr3 = (u_int)IdlePTD;
#endif
	tramp_idleptd = cr3;
	load_cr3(cr3);
	load_cr0(rcr0() | CR0_PG);

	/*
	 * Now running relocated at KERNBASE where the system is
	 * linked to run.
	 */

	/*
	 * Remove the lowest part of the double mapping of low memory
	 * to get some null pointer checks.
	 */
	__CONCAT(PMTYPE, remap_lower)(false);

	kernel_vm_end = /* 0 + */ NKPT * NBPDR;
#ifdef PMAP_PAE_COMP
	i386_pmap_VM_NFREEORDER = VM_NFREEORDER_PAE;
	i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_PAE;
	i386_pmap_PDRSHIFT = PDRSHIFT_PAE;
#else
	i386_pmap_VM_NFREEORDER = VM_NFREEORDER_NOPAE;
	i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_NOPAE;
	i386_pmap_PDRSHIFT = PDRSHIFT_NOPAE;
#endif
}

static void
__CONCAT(PMTYPE, set_nx)(void)
{

#ifdef PMAP_PAE_COMP
	if ((amd_feature & AMDID_NX) == 0)
		return;
	pg_nx = PG_NX;
	elf32_nxstack = 1;
	/* EFER.EFER_NXE is set in initializecpu(). */
#endif
}

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On the i386 this is called after pmap_cold() created initial
 * kernel page table and enabled paging, and just syncs the pmap
 * module with what has already been done.
 */
static void
__CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte, *unused;
	struct pcpu *pc;
	u_long res;
	int i;

	res = atop(firstaddr - (vm_paddr_t)KERNLOAD);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated kernel page table pages so that vm_page structures
	 * representing these pages will be created.  The vm_page structures
	 * are required for promotion of the corresponding kernel virtual
	 * addresses to superpage mappings.
	 */
	vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt));

	/*
	 * Initialize the first available kernel virtual address.
	 * However, using "firstaddr" may waste a few pages of the
	 * kernel virtual address space, because pmap_cold() may not
	 * have mapped every physical page that it allocated.
	 * Preferably, pmap_cold() would provide a first unused
	 * virtual address in addition to "firstaddr".
	 */
	virtual_avail = (vm_offset_t)firstaddr;
	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 * Count bootstrap data as being resident in case any of this data is
	 * later unmapped (using pmap_remove()) and freed.
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pdir = IdlePTD;
#ifdef PMAP_PAE_COMP
	kernel_pmap->pm_pdpt = IdlePDPT;
#endif
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	kernel_pmap->pm_stats.resident_count = res;
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);

	/*
	 * Initialize the global pv list lock.
	 */
	rw_init(&pvh_global_lock, "pmap pv global");

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
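
	/*
	 * For illustration only: given the definition above, a use such as
	 * SYSMAP(caddr_t, CMAP3, CADDR3, 1) (see below) expands roughly to
	 *
	 *	CADDR3 = (caddr_t)va;
	 *	va += 1 * PAGE_SIZE;
	 *	CMAP3 = pte;
	 *	pte += 1;
	 *
	 * i.e. it reserves one page of KVA in CADDR3 and remembers the PTE
	 * that maps it in CMAP3, advancing the running "va"/"pte" cursors.
	 */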

	va = virtual_avail;
	pte = vtopte(va);


	/*
	 * Initialize temporary map objects on the current CPU for use
	 * during early boot.
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 * CMAP3 is used for the boot-time memory test.
	 */
	pc = get_pcpu();
	mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
	SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
	SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
	SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)

	SYSMAP(caddr_t, CMAP3, CADDR3, 1);

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

	/*
	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, unused, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))

	/*
	 * KPTmap is used by pmap_kextract().
	 *
	 * KPTmap is first initialized by pmap_cold().  However, that initial
	 * KPTmap can only support NKPT page table pages.  Here, a larger
	 * KPTmap is created that can support KVA_PAGES page table pages.
	 */
	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)

	for (i = 0; i < NKPT; i++)
		KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;

	/*
	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
	 * respectively.
	 */
	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
	SYSMAP(pt_entry_t *, PMAP3, PADDR3, 1)

	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);

	virtual_avail = va;

	/*
	 * Initialize the PAT MSR if present.
	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
	 * side-effect, invalidates stale PG_G TLB entries that might
	 * have been created in our pre-boot environment.  We assume
	 * that PAT support implies PGE and, conversely, that PGE
	 * presence comes with PAT.  Both features were added for
	 * Pentium Pro.
	 */
	pmap_init_pat();
}

static void
pmap_init_reserved_pages(void)
{
	struct pcpu *pc;
	vm_offset_t pages;
	int i;

#ifdef PMAP_PAE_COMP
	if (!pae_mode)
		return;
#else
	if (pae_mode)
		return;
#endif
	CPU_FOREACH(i) {
		pc = pcpu_find(i);
		mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF |
		    MTX_NEW);
		pc->pc_copyout_maddr = kva_alloc(ptoa(2));
		if (pc->pc_copyout_maddr == 0)
			panic("unable to allocate non-sleepable copyout KVA");
		sx_init(&pc->pc_copyout_slock, "cpslk");
		pc->pc_copyout_saddr = kva_alloc(ptoa(2));
		if (pc->pc_copyout_saddr == 0)
			panic("unable to allocate sleepable copyout KVA");
		pc->pc_pmap_eh_va = kva_alloc(ptoa(1));
		if (pc->pc_pmap_eh_va == 0)
			panic("unable to allocate pmap_extract_and_hold KVA");
		pc->pc_pmap_eh_ptep = (char *)vtopte(pc->pc_pmap_eh_va);

		/*
		 * Skip if the mappings have already been initialized,
		 * i.e. this is the BSP.
		 */
		if (pc->pc_cmap_addr1 != 0)
			continue;

		mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
		pages = kva_alloc(PAGE_SIZE * 3);
		if (pages == 0)
			panic("unable to allocate CMAP KVA");
		pc->pc_cmap_pte1 = vtopte(pages);
		pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
		pc->pc_cmap_addr1 = (caddr_t)pages;
		pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
		pc->pc_qmap_addr = pages + ptoa(2);
	}
}

SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);

/*
 * Setup the PAT MSR.
 */
static void
__CONCAT(PMTYPE, init_pat)(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/*
	 * Bail if this CPU doesn't implement PAT.
	 * We assume that PAT support implies PGE.
	 */
	if ((cpu_feature & CPUID_PAT) == 0) {
		for (i = 0; i < PAT_INDEX_SIZE; i++)
			pat_index[i] = pat_table[i];
		pat_works = 0;
		return;
	}

	/*
	 * Due to some Intel errata, we can only safely use the lower 4
	 * PAT entries.
	 *
	 *   Intel Pentium III Processor Specification Update
	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
	 * or Mode C Paging)
	 *
	 *   Intel Pentium IV Processor Specification Update
	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
		pat_works = 0;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}

#ifdef PMAP_PAE_COMP
static void *
pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
    int wait)
{

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;
	return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
	    bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif

/*
 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
 * Requirements:
 *  - Must deal with pages in order to ensure that none of the PG_* bits
 *    are ever set, PG_V in particular.
 *  - Assumes we can write to ptes without pte_store() atomic ops, even
 *    on PAE systems.  This should be ok.
 *  - Assumes nothing will ever test these addresses for 0 to indicate
 *    no mapping instead of correctly checking PG_V.
 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
 * Because PG_V is never set, there can be no mappings to invalidate.
 */
static vm_offset_t
pmap_ptelist_alloc(vm_offset_t *head)
{
	pt_entry_t *pte;
	vm_offset_t va;

	va = *head;
	if (va == 0)
		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
	pte = vtopte(va);
	*head = *pte;
	if (*head & PG_V)
		panic("pmap_ptelist_alloc: va with PG_V set!");
	*pte = 0;
	return (va);
}

static void
pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
{
	pt_entry_t *pte;

	if (va & PG_V)
		panic("pmap_ptelist_free: freeing va with PG_V set!");
	pte = vtopte(va);
	*pte = *head;		/* virtual! PG_V is 0 though */
	*head = va;
}

static void
pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
{
	int i;
	vm_offset_t va;

	*head = 0;
	for (i = npages - 1; i >= 0; i--) {
		va = (vm_offset_t)base + i * PAGE_SIZE;
		pmap_ptelist_free(head, va);
	}
}
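
/*
 * Illustrative usage sketch of the freelist above; "head", "base" and
 * "npg" are hypothetical names.  pmap_init() below uses exactly this
 * pattern for the pv chunk KVA.
 *
 *	vm_offset_t head, va;
 *
 *	pmap_ptelist_init(&head, base, npg);	(thread npg pages onto list)
 *	va = pmap_ptelist_alloc(&head);		(take one page of KVA)
 *	...
 *	pmap_ptelist_free(&head, va);		(return it)
 */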


/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
static void
__CONCAT(PMTYPE, init)(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t mpte;
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	PMAP_LOCK(kernel_pmap);
	for (i = 0; i < NKPT; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = i + KPTDI;
		mpte->phys_addr = KPTphys + ptoa(i);
		mpte->ref_count = 1;

		/*
		 * Collect the page table pages that were replaced by a 2/4MB
		 * page.  They are filled with equivalent 4KB page mappings.
		 */
		if (pseflag != 0 &&
		    KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend &&
		    pmap_insert_pt_page(kernel_pmap, mpte, true))
			panic("pmap_init: pmap_insert_pt_page failed");
	}
	PMAP_UNLOCK(kernel_pmap);
	vm_wire_add(NKPT);

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_max = roundup(pv_entry_max, _NPCPV);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings supported and enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pseflag == 0)
		pg_ps_enabled = 0;
	else if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Calculate the size of the pv head table for superpages.
	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
	 */
	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
	    PAGE_SIZE) / NBPDR + 1;

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);

	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
	if (pv_chunkbase == NULL)
		panic("pmap_init: not enough kvm for pv chunks");
	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
#ifdef PMAP_PAE_COMP
	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
	    UMA_ZONE_CONTIG | UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif

	pmap_initialized = 1;
	pmap_init_trm();

	if (!bootverbose)
		return;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == 0)
			continue;
		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
	}

}
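
/*
 * A worked example (assumed numbers; non-PAE 4 MB superpages) for the
 * pv_npg computation above: if the last vm_phys segment ends at
 * 0x7fff0000, then trunc_4mpage(0x7fff0000 - PAGE_SIZE) = 0x7fc00000,
 * 0x7fc00000 / NBPDR = 511, and pv_npg = 512, i.e. one struct md_page
 * per 4 MB physical superpage frame below that address.
 */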

extern u_long pmap_pde_demotions;
extern u_long pmap_pde_mappings;
extern u_long pmap_pde_p_failures;
extern u_long pmap_pde_promotions;

/***************************************************
 * Low level helper routines.....
 ***************************************************/

static boolean_t
__CONCAT(PMTYPE, is_valid_memattr)(pmap_t pmap __unused, vm_memattr_t mode)
{

	return (mode >= 0 && mode < PAT_INDEX_SIZE &&
	    pat_index[(int)mode] >= 0);
}

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
static int
__CONCAT(PMTYPE, cache_bits)(pmap_t pmap, int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (!pmap_is_valid_memattr(pmap, mode))
		panic("Unknown caching mode %d\n", mode);

	/* The PAT bit is different for PTE's and PDE's. */
	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

	/* Map the caching mode to a PAT index. */
	pat_idx = pat_index[mode];

	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
	cache_bits = 0;
	if (pat_idx & 0x4)
		cache_bits |= pat_flag;
	if (pat_idx & 0x2)
		cache_bits |= PG_NC_PCD;
	if (pat_idx & 0x1)
		cache_bits |= PG_NC_PWT;
	return (cache_bits);
}
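
/*
 * Worked example for the mapping above, assuming pat_works (see
 * pmap_init_pat()): PAT_WRITE_COMBINING is assigned PAT index 6 (binary
 * 110), so pmap_cache_bits(kernel_pmap, PAT_WRITE_COMBINING, FALSE)
 * returns PG_PTE_PAT | PG_NC_PCD, while a PDE for the same mode would
 * get PG_PDE_PAT | PG_NC_PCD instead.
 */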

static int
pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
{
	int pat_flag, pat_idx;

	if ((cpu_feature & CPUID_PAT) == 0)
		return (0);

	pat_idx = 0;
	/* The PAT bit is different for PTE's and PDE's. */
	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

	if ((pte & pat_flag) != 0)
		pat_idx |= 0x4;
	if ((pte & PG_NC_PCD) != 0)
		pat_idx |= 0x2;
	if ((pte & PG_NC_PWT) != 0)
		pat_idx |= 0x1;

	/* See pmap_init_pat(). */
	if (pat_works) {
		if (pat_idx == 4)
			pat_idx = 0;
		if (pat_idx == 7)
			pat_idx = 3;
	} else {
		/* XXXKIB */
	}

	return (pat_idx);
}

static bool
__CONCAT(PMTYPE, ps_enabled)(pmap_t pmap __unused)
{

	return (pg_ps_enabled);
}

/*
 * The caller is responsible for maintaining TLB consistency.
 */
static void
pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
{
	pd_entry_t *pde;

	pde = pmap_pde(kernel_pmap, va);
	pde_store(pde, newpde);
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else /* if ((newpde & PG_G) == 0) */
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
}

#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
static void
pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va)
{
	cpuset_t *mask, other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap) {
		invlpg(va);
		mask = &all_cpus;
	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		CPU_AND(&other_cpus, &pmap->pm_active);
		mask = &other_cpus;
	}
	smp_masked_invlpg(*mask, va, pmap);
	sched_unpin();
}

/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)

static void
pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t *mask, other_cpus;
	vm_offset_t addr;
	u_int cpuid;

	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
		pmap_invalidate_all_int(pmap);
		return;
	}

	sched_pin();
	if (pmap == kernel_pmap) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		mask = &all_cpus;
	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		CPU_AND(&other_cpus, &pmap->pm_active);
		mask = &other_cpus;
	}
	smp_masked_invlpg_range(*mask, sva, eva, pmap);
	sched_unpin();
}

static void
pmap_invalidate_all_int(pmap_t pmap)
{
	cpuset_t *mask, other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap) {
		invltlb();
		mask = &all_cpus;
	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
		mask = &all_cpus;
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		CPU_AND(&other_cpus, &pmap->pm_active);
		mask = &other_cpus;
	}
	smp_masked_invltlb(*mask, pmap);
	sched_unpin();
}

static void
__CONCAT(PMTYPE, invalidate_cache)(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}

struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

static void
pmap_update_pde_kernel(void *arg)
{
	struct pde_action *act = arg;
	pd_entry_t *pde;

	if (act->store == PCPU_GET(cpuid)) {
		pde = pmap_pde(kernel_pmap, act->va);
		pde_store(pde, act->newpde);
	}
}

static void
pmap_update_pde_user(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pde_store(act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap)
		active = all_cpus;
	else
		active = pmap->pm_active;
	if (CPU_OVERLAP(&active, &other_cpus)) {
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendezvous_barrier, pmap == kernel_pmap ?
		    pmap_update_pde_kernel : pmap_update_pde_user,
		    pmap_update_pde_teardown, &act);
	} else {
		if (pmap == kernel_pmap)
			pmap_kenter_pde(va, newpde);
		else
			pde_store(pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, 486+ invalidation functions.
 * We inline these within pmap.c for speed.
 */
static void
pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap)
		invlpg(va);
}

static void
pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap)
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
}

static void
pmap_invalidate_all_int(pmap_t pmap)
{

	if (pmap == kernel_pmap)
		invltlb();
}

static void
__CONCAT(PMTYPE, invalidate_cache)(void)
{

	wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

	if (pmap == kernel_pmap)
		pmap_kenter_pde(va, newpde);
	else
		pde_store(pde, newpde);
	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		pmap_update_pde_invalidate(va, newpde);
}
#endif /* !SMP */

static void
__CONCAT(PMTYPE, invalidate_page)(pmap_t pmap, vm_offset_t va)
{

	pmap_invalidate_page_int(pmap, va);
}

static void
__CONCAT(PMTYPE, invalidate_range)(pmap_t pmap, vm_offset_t sva,
    vm_offset_t eva)
{

	pmap_invalidate_range_int(pmap, sva, eva);
}

static void
__CONCAT(PMTYPE, invalidate_all)(pmap_t pmap)
{

	pmap_invalidate_all_int(pmap);
}

static void
pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
{

	/*
	 * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was
	 * created by a promotion that did not invalidate the 512 or 1024 4KB
	 * page mappings that might exist in the TLB.  Consequently, at this
	 * point, the TLB may hold both 4KB and 2- or 4MB page mappings for
	 * the address range [va, va + NBPDR).  Therefore, the entire range
	 * must be invalidated here.  In contrast, when PG_PROMOTED is clear,
	 * the TLB will not hold any 4KB page mappings for the address range
	 * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the
	 * 2- or 4MB page mapping from the TLB.
	 */
	if ((pde & PG_PROMOTED) != 0)
		pmap_invalidate_range_int(pmap, va, va + NBPDR - 1);
	else
		pmap_invalidate_page_int(pmap, va);
}

/*
 * Are we current address space or kernel?
 */
static __inline int
pmap_is_current(pmap_t pmap)
{

	return (pmap == kernel_pmap);
}

/*
 * If the given pmap is not the current or kernel pmap, the returned pte must
 * be released by passing it to pmap_pte_release().
 */
static pt_entry_t *
__CONCAT(PMTYPE, pte)(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		mtx_lock(&PMAP2mutex);
		newpf = *pde & PG_FRAME;
		if ((*PMAP2 & PG_FRAME) != newpf) {
			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
			pmap_invalidate_page_int(kernel_pmap,
			    (vm_offset_t)PADDR2);
		}
		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (NULL);
}

/*
 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 * being NULL.
 */
static __inline void
pmap_pte_release(pt_entry_t *pte)
{

	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
		mtx_unlock(&PMAP2mutex);
}
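
/*
 * Typical (illustrative) pairing of the two routines above when the
 * target pmap may be neither current nor the kernel pmap; "pmap", "va"
 * and the use made of the PTE are hypothetical.
 *
 *	pt_entry_t *pte;
 *
 *	pte = pmap_pte(pmap, va);
 *	if (pte != NULL) {
 *		... examine or modify *pte ...
 *		pmap_pte_release(pte);
 *	}
 */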

/*
 * NB:  The sequence of updating a page table followed by accesses to the
 * corresponding pages is subject to the situation described in the "AMD64
 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
 * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
 * right after modifying the PTE bits is crucial.
 */
static __inline void
invlcaddr(void *caddr)
{

	invlpg((u_int)caddr);
}

/*
 * Super fast pmap_pte routine best used when scanning
 * the pv lists.  This eliminates many coarse-grained
 * invltlb calls.  Note that many of the pv list
 * scans are across different pmaps.  It is very wasteful
 * to do an entire invltlb for checking a single mapping.
 *
 * If the given pmap is not the current pmap, pvh_global_lock
 * must be held and curthread pinned to a CPU.
 */
static pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		rw_assert(&pvh_global_lock, RA_WLOCKED);
		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
		newpf = *pde & PG_FRAME;
		if ((*PMAP1 & PG_FRAME) != newpf) {
			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
			PMAP1cpu = PCPU_GET(cpuid);
#endif
			invlcaddr(PADDR1);
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP1cpu != PCPU_GET(cpuid)) {
			PMAP1cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR1);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (0);
}

static pt_entry_t *
pmap_pte_quick3(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		rw_assert(&pvh_global_lock, RA_WLOCKED);
		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
		newpf = *pde & PG_FRAME;
		if ((*PMAP3 & PG_FRAME) != newpf) {
			*PMAP3 = newpf | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
			PMAP3cpu = PCPU_GET(cpuid);
#endif
			invlcaddr(PADDR3);
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP3cpu != PCPU_GET(cpuid)) {
			PMAP3cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR3);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		return (PADDR3 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (0);
}

static pt_entry_t
pmap_pte_ufast(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
{
	pt_entry_t *eh_ptep, pte, *ptep;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pde &= PG_FRAME;
	critical_enter();
	eh_ptep = (pt_entry_t *)PCPU_GET(pmap_eh_ptep);
	if ((*eh_ptep & PG_FRAME) != pde) {
		*eh_ptep = pde | PG_RW | PG_V | PG_A | PG_M;
		invlcaddr((void *)PCPU_GET(pmap_eh_va));
	}
	ptep = (pt_entry_t *)PCPU_GET(pmap_eh_va) + (i386_btop(va) &
	    (NPTEPG - 1));
	pte = *ptep;
	critical_exit();
	return (pte);
}

/*
 * Extract from the kernel page table the physical address that is mapped by
 * the given virtual address "va".
 *
 * This function may be used before pmap_bootstrap() is called.
 */
static vm_paddr_t
__CONCAT(PMTYPE, kextract)(vm_offset_t va)
{
	vm_paddr_t pa;

	if ((pa = pte_load(&PTD[va >> PDRSHIFT])) & PG_PS) {
		pa = (pa & PG_PS_FRAME) | (va & PDRMASK);
	} else {
		/*
		 * Beware of a concurrent promotion that changes the PDE at
		 * this point!  For example, vtopte() must not be used to
		 * access the PTE because it would use the new PDE.  It is,
		 * however, safe to use the old PDE because the page table
		 * page is preserved by the promotion.
		 */
		pa = KPTmap[i386_btop(va)];
		pa = (pa & PG_FRAME) | (va & PAGE_MASK);
	}
	return (pa);
}

/*
 * Routine:	pmap_extract
 * Function:
 *	Extract the physical page address associated
 *	with the given map/virtual_address pair.
 */
static vm_paddr_t
__CONCAT(PMTYPE, extract)(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t pte;
	pd_entry_t pde;

	rtval = 0;
	PMAP_LOCK(pmap);
	pde = pmap->pm_pdir[va >> PDRSHIFT];
	if (pde != 0) {
		if ((pde & PG_PS) != 0)
			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
		else {
			pte = pmap_pte_ufast(pmap, va, pde);
			rtval = (pte & PG_FRAME) | (va & PAGE_MASK);
		}
	}
	PMAP_UNLOCK(pmap);
	return (rtval);
}

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
 */
static vm_page_t
__CONCAT(PMTYPE, extract_and_hold)(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pd_entry_t pde;
	pt_entry_t pte;
	vm_page_t m;

	m = NULL;
	PMAP_LOCK(pmap);
	pde = *pmap_pde(pmap, va);
	if (pde != 0) {
		if (pde & PG_PS) {
			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0)
				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
				    (va & PDRMASK));
		} else {
			pte = pmap_pte_ufast(pmap, va, pde);
			if (pte != 0 &&
			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0))
				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
		}
		if (m != NULL && !vm_page_wire_mapped(m))
			m = NULL;
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
static void
__CONCAT(PMTYPE, kenter)(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V);
}

static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(kernel_pmap,
	    mode, 0));
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
static void
__CONCAT(PMTYPE, kremove)(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_clear(pte);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping. Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged. Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
static vm_offset_t
__CONCAT(PMTYPE, map)(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end,
    int prot)
{
	vm_offset_t va, sva;
	vm_paddr_t superpage_offset;
	pd_entry_t newpde;

	va = *virt;
	/*
	 * Does the physical address range's size and alignment permit at
	 * least one superpage mapping to be created?
	 */
	superpage_offset = start & PDRMASK;
	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
		/*
		 * Increase the starting virtual address so that its alignment
		 * does not preclude the use of superpage mappings.
		 */
		if ((va & PDRMASK) < superpage_offset)
			va = (va & ~PDRMASK) + superpage_offset;
		else if ((va & PDRMASK) > superpage_offset)
			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
	}
	sva = va;
	while (start < end) {
		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
		    pseflag != 0) {
			KASSERT((va & PDRMASK) == 0,
			    ("pmap_map: misaligned va %#x", va));
			newpde = start | PG_PS | PG_RW | PG_V;
			pmap_kenter_pde(va, newpde);
			va += NBPDR;
			start += NBPDR;
		} else {
			pmap_kenter(va, start);
			va += PAGE_SIZE;
			start += PAGE_SIZE;
		}
	}
	pmap_invalidate_range_int(kernel_pmap, sva, va);
	*virt = va;
	return (sva);
}
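
/*
 * Worked example (assumed numbers; non-PAE 4 MB superpages, pseflag set)
 * for the alignment logic above: with start = 0x00500000,
 * end = 0x02000000 and *virt = 0xc0a00000, superpage_offset is 0x100000.
 * Since (va & PDRMASK) == 0x200000 > superpage_offset, va is advanced to
 * 0xc0d00000, so after the first 3 MB are mapped with 4 KB pages both
 * "start" and "va" reach a 4 MB boundary together and the remaining
 * range can be mapped with superpages.
 */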
1814 */ 1815 superpage_offset = start & PDRMASK; 1816 if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { 1817 /* 1818 * Increase the starting virtual address so that its alignment 1819 * does not preclude the use of superpage mappings. 1820 */ 1821 if ((va & PDRMASK) < superpage_offset) 1822 va = (va & ~PDRMASK) + superpage_offset; 1823 else if ((va & PDRMASK) > superpage_offset) 1824 va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; 1825 } 1826 sva = va; 1827 while (start < end) { 1828 if ((start & PDRMASK) == 0 && end - start >= NBPDR && 1829 pseflag != 0) { 1830 KASSERT((va & PDRMASK) == 0, 1831 ("pmap_map: misaligned va %#x", va)); 1832 newpde = start | PG_PS | PG_RW | PG_V; 1833 pmap_kenter_pde(va, newpde); 1834 va += NBPDR; 1835 start += NBPDR; 1836 } else { 1837 pmap_kenter(va, start); 1838 va += PAGE_SIZE; 1839 start += PAGE_SIZE; 1840 } 1841 } 1842 pmap_invalidate_range_int(kernel_pmap, sva, va); 1843 *virt = va; 1844 return (sva); 1845 } 1846 1847 1848 /* 1849 * Add a list of wired pages to the kva 1850 * this routine is only used for temporary 1851 * kernel mappings that do not need to have 1852 * page modification or references recorded. 1853 * Note that old mappings are simply written 1854 * over. The page *must* be wired. 1855 * Note: SMP coherent. Uses a ranged shootdown IPI. 1856 */ 1857 static void 1858 __CONCAT(PMTYPE, qenter)(vm_offset_t sva, vm_page_t *ma, int count) 1859 { 1860 pt_entry_t *endpte, oldpte, pa, *pte; 1861 vm_page_t m; 1862 1863 oldpte = 0; 1864 pte = vtopte(sva); 1865 endpte = pte + count; 1866 while (pte < endpte) { 1867 m = *ma++; 1868 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(kernel_pmap, 1869 m->md.pat_mode, 0); 1870 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { 1871 oldpte |= *pte; 1872 pte_store(pte, pa | pg_nx | PG_RW | PG_V); 1873 } 1874 pte++; 1875 } 1876 if (__predict_false((oldpte & PG_V) != 0)) 1877 pmap_invalidate_range_int(kernel_pmap, sva, sva + count * 1878 PAGE_SIZE); 1879 } 1880 1881 /* 1882 * This routine tears out page mappings from the 1883 * kernel -- it is meant only for temporary mappings. 1884 * Note: SMP coherent. Uses a ranged shootdown IPI. 1885 */ 1886 static void 1887 __CONCAT(PMTYPE, qremove)(vm_offset_t sva, int count) 1888 { 1889 vm_offset_t va; 1890 1891 va = sva; 1892 while (count-- > 0) { 1893 pmap_kremove(va); 1894 va += PAGE_SIZE; 1895 } 1896 pmap_invalidate_range_int(kernel_pmap, sva, va); 1897 } 1898 1899 /*************************************************** 1900 * Page table page management routines..... 1901 ***************************************************/ 1902 /* 1903 * Schedule the specified unused page table page to be freed. Specifically, 1904 * add the page to the specified list of pages that will be released to the 1905 * physical memory manager after the TLB has been updated. 1906 */ 1907 static __inline void 1908 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1909 boolean_t set_PG_ZERO) 1910 { 1911 1912 if (set_PG_ZERO) 1913 m->flags |= PG_ZERO; 1914 else 1915 m->flags &= ~PG_ZERO; 1916 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1917 } 1918 1919 /* 1920 * Inserts the specified page table page into the specified pmap's collection 1921 * of idle page table pages. Each of a pmap's page table pages is responsible 1922 * for mapping a distinct range of virtual addresses. The pmap's collection is 1923 * ordered by this virtual address range. 1924 * 1925 * If "promoted" is false, then the page table page "mpte" must be zero filled. 
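 *
 * Implementation note: the collection is the radix trie "pm_root", keyed by
 * the page table page's pindex (va >> PDRSHIFT).  "mpte->valid" is set to
 * VM_PAGE_BITS_ALL only for pages saved by a promotion, so that a later
 * demotion or removal can tell whether the page still holds valid PTEs or
 * must be (re)filled.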
 */
1927 static __inline int
1928 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
1929 {
1930
1931 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1932 	mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
1933 	return (vm_radix_insert(&pmap->pm_root, mpte));
1934 }
1935
1936 /*
1937  * Removes the page table page mapping the specified virtual address from the
1938  * specified pmap's collection of idle page table pages, and returns it.
1939  * Returns NULL if there is no page table page corresponding to the
1940  * specified virtual address.
1941  */
1942 static __inline vm_page_t
1943 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1944 {
1945
1946 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1947 	return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
1948 }
1949
1950 /*
1951  * Decrements a page table page's reference count, which is used to record the
1952  * number of valid page table entries within the page. If the reference count
1953  * drops to zero, then the page table page is unmapped. Returns TRUE if the
1954  * page table page was unmapped and FALSE otherwise.
1955  */
1956 static inline boolean_t
1957 pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1958 {
1959
1960 	--m->ref_count;
1961 	if (m->ref_count == 0) {
1962 		_pmap_unwire_ptp(pmap, m, free);
1963 		return (TRUE);
1964 	} else
1965 		return (FALSE);
1966 }
1967
1968 static void
1969 _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1970 {
1971
1972 	/*
1973 	 * unmap the page table page
1974 	 */
1975 	pmap->pm_pdir[m->pindex] = 0;
1976 	--pmap->pm_stats.resident_count;
1977
1978 	/*
1979 	 * There is no need to invalidate the recursive mapping since
1980 	 * we never instantiate such a mapping for the usermode pmaps,
1981 	 * and never remove page table pages from the kernel pmap.
1982 	 * Put the page on a list so that it is released after all TLB
1983 	 * shootdowns are done.
1984 	 */
1985 	MPASS(pmap != kernel_pmap);
1986 	pmap_add_delayed_free_list(m, free, TRUE);
1987 }
1988
1989 /*
1990  * After removing a page table entry, this routine is used to
1991  * conditionally free the page, and manage the reference count.
1992  */
1993 static int
1994 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
1995 {
1996 	pd_entry_t ptepde;
1997 	vm_page_t mpte;
1998
1999 	if (pmap == kernel_pmap)
2000 		return (0);
2001 	ptepde = *pmap_pde(pmap, va);
2002 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2003 	return (pmap_unwire_ptp(pmap, mpte, free));
2004 }
2005
2006 /*
2007  * Release a page table page reference after a failed attempt to create a
2008  * mapping.
2009  */
2010 static void
2011 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2012 {
2013 	struct spglist free;
2014
2015 	SLIST_INIT(&free);
2016 	if (pmap_unwire_ptp(pmap, mpte, &free)) {
2017 		/*
2018 		 * Although "va" was never mapped, paging-structure caches
2019 		 * could nonetheless have entries that refer to the freed
2020 		 * page table pages. Invalidate those entries.
2021 		 */
2022 		pmap_invalidate_page_int(pmap, va);
2023 		vm_page_free_pages_toq(&free, true);
2024 	}
2025 }
2026
2027 /*
2028  * Initialize the pmap for the swapper process.
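 *
 * Unlike pmap_pinit() below, this routine allocates nothing: it adopts the
 * bootstrap page directory (IdlePTD, plus IdlePDPT under PAE) rather than
 * allocating one, and finishes by calling pmap_activate_boot().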
2029 */ 2030 static void 2031 __CONCAT(PMTYPE, pinit0)(pmap_t pmap) 2032 { 2033 2034 PMAP_LOCK_INIT(pmap); 2035 pmap->pm_pdir = IdlePTD; 2036 #ifdef PMAP_PAE_COMP 2037 pmap->pm_pdpt = IdlePDPT; 2038 #endif 2039 pmap->pm_root.rt_root = 0; 2040 CPU_ZERO(&pmap->pm_active); 2041 TAILQ_INIT(&pmap->pm_pvchunk); 2042 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2043 pmap_activate_boot(pmap); 2044 } 2045 2046 /* 2047 * Initialize a preallocated and zeroed pmap structure, 2048 * such as one in a vmspace structure. 2049 */ 2050 static int 2051 __CONCAT(PMTYPE, pinit)(pmap_t pmap) 2052 { 2053 vm_page_t m; 2054 int i; 2055 2056 /* 2057 * No need to allocate page table space yet but we do need a valid 2058 * page directory table. 2059 */ 2060 if (pmap->pm_pdir == NULL) { 2061 pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); 2062 if (pmap->pm_pdir == NULL) 2063 return (0); 2064 #ifdef PMAP_PAE_COMP 2065 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 2066 KASSERT(((vm_offset_t)pmap->pm_pdpt & 2067 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 2068 ("pmap_pinit: pdpt misaligned")); 2069 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 2070 ("pmap_pinit: pdpt above 4g")); 2071 #endif 2072 pmap->pm_root.rt_root = 0; 2073 } 2074 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2075 ("pmap_pinit: pmap has reserved page table page(s)")); 2076 2077 /* 2078 * allocate the page directory page(s) 2079 */ 2080 for (i = 0; i < NPGPTD; i++) { 2081 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2082 VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); 2083 pmap->pm_ptdpg[i] = m; 2084 #ifdef PMAP_PAE_COMP 2085 pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V; 2086 #endif 2087 } 2088 2089 pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD); 2090 #ifdef PMAP_PAE_COMP 2091 if ((cpu_feature & CPUID_PAT) == 0) { 2092 pmap_invalidate_cache_range( 2093 trunc_page((vm_offset_t)pmap->pm_pdpt), 2094 round_page((vm_offset_t)pmap->pm_pdpt + 2095 NPGPTD * sizeof(pdpt_entry_t))); 2096 } 2097 #endif 2098 2099 for (i = 0; i < NPGPTD; i++) 2100 if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0) 2101 pagezero(pmap->pm_pdir + (i * NPDEPG)); 2102 2103 /* Install the trampoline mapping. */ 2104 pmap->pm_pdir[TRPTDI] = PTD[TRPTDI]; 2105 2106 CPU_ZERO(&pmap->pm_active); 2107 TAILQ_INIT(&pmap->pm_pvchunk); 2108 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2109 2110 return (1); 2111 } 2112 2113 /* 2114 * this routine is called if the page table page is not 2115 * mapped correctly. 2116 */ 2117 static vm_page_t 2118 _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) 2119 { 2120 vm_paddr_t ptepa; 2121 vm_page_t m; 2122 2123 /* 2124 * Allocate a page table page. 2125 */ 2126 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 2127 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2128 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 2129 PMAP_UNLOCK(pmap); 2130 rw_wunlock(&pvh_global_lock); 2131 vm_wait(NULL); 2132 rw_wlock(&pvh_global_lock); 2133 PMAP_LOCK(pmap); 2134 } 2135 2136 /* 2137 * Indicate the need to retry. While waiting, the page table 2138 * page may have been allocated. 2139 */ 2140 return (NULL); 2141 } 2142 if ((m->flags & PG_ZERO) == 0) 2143 pmap_zero_page(m); 2144 2145 /* 2146 * Map the pagetable page into the process address space, if 2147 * it isn't already there. 
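	 *
	 * The PDE below is installed with PG_U | PG_RW | PG_V and with PG_A
	 * and PG_M already set; presumably presetting the accessed and
	 * modified bits spares the MMU from updating them on first use of
	 * the page table page.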
2148 */ 2149 2150 pmap->pm_stats.resident_count++; 2151 2152 ptepa = VM_PAGE_TO_PHYS(m); 2153 pmap->pm_pdir[ptepindex] = 2154 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 2155 2156 return (m); 2157 } 2158 2159 static vm_page_t 2160 pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) 2161 { 2162 u_int ptepindex; 2163 pd_entry_t ptepa; 2164 vm_page_t m; 2165 2166 /* 2167 * Calculate pagetable page index 2168 */ 2169 ptepindex = va >> PDRSHIFT; 2170 retry: 2171 /* 2172 * Get the page directory entry 2173 */ 2174 ptepa = pmap->pm_pdir[ptepindex]; 2175 2176 /* 2177 * This supports switching from a 4MB page to a 2178 * normal 4K page. 2179 */ 2180 if (ptepa & PG_PS) { 2181 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); 2182 ptepa = pmap->pm_pdir[ptepindex]; 2183 } 2184 2185 /* 2186 * If the page table page is mapped, we just increment the 2187 * hold count, and activate it. 2188 */ 2189 if (ptepa) { 2190 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 2191 m->ref_count++; 2192 } else { 2193 /* 2194 * Here if the pte page isn't mapped, or if it has 2195 * been deallocated. 2196 */ 2197 m = _pmap_allocpte(pmap, ptepindex, flags); 2198 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2199 goto retry; 2200 } 2201 return (m); 2202 } 2203 2204 2205 /*************************************************** 2206 * Pmap allocation/deallocation routines. 2207 ***************************************************/ 2208 2209 /* 2210 * Release any resources held by the given physical map. 2211 * Called when a pmap initialized by pmap_pinit is being released. 2212 * Should only be called if the map contains no valid mappings. 2213 */ 2214 static void 2215 __CONCAT(PMTYPE, release)(pmap_t pmap) 2216 { 2217 vm_page_t m; 2218 int i; 2219 2220 KASSERT(pmap->pm_stats.resident_count == 0, 2221 ("pmap_release: pmap resident count %ld != 0", 2222 pmap->pm_stats.resident_count)); 2223 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2224 ("pmap_release: pmap has reserved page table page(s)")); 2225 KASSERT(CPU_EMPTY(&pmap->pm_active), 2226 ("releasing active pmap %p", pmap)); 2227 2228 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 2229 2230 for (i = 0; i < NPGPTD; i++) { 2231 m = pmap->pm_ptdpg[i]; 2232 #ifdef PMAP_PAE_COMP 2233 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 2234 ("pmap_release: got wrong ptd page")); 2235 #endif 2236 vm_page_unwire_noq(m); 2237 vm_page_free(m); 2238 } 2239 } 2240 2241 /* 2242 * grow the number of kernel page table entries, if needed 2243 */ 2244 static void 2245 __CONCAT(PMTYPE, growkernel)(vm_offset_t addr) 2246 { 2247 vm_paddr_t ptppaddr; 2248 vm_page_t nkpg; 2249 pd_entry_t newpdir; 2250 2251 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2252 addr = roundup2(addr, NBPDR); 2253 if (addr - 1 >= vm_map_max(kernel_map)) 2254 addr = vm_map_max(kernel_map); 2255 while (kernel_vm_end < addr) { 2256 if (pdir_pde(PTD, kernel_vm_end)) { 2257 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2258 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2259 kernel_vm_end = vm_map_max(kernel_map); 2260 break; 2261 } 2262 continue; 2263 } 2264 2265 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, 2266 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2267 VM_ALLOC_ZERO); 2268 if (nkpg == NULL) 2269 panic("pmap_growkernel: no memory to grow kernel"); 2270 2271 nkpt++; 2272 2273 if ((nkpg->flags & PG_ZERO) == 0) 2274 pmap_zero_page(nkpg); 2275 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2276 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2277 
pdir_pde(KPTD, kernel_vm_end) = newpdir; 2278 2279 pmap_kenter_pde(kernel_vm_end, newpdir); 2280 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2281 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2282 kernel_vm_end = vm_map_max(kernel_map); 2283 break; 2284 } 2285 } 2286 } 2287 2288 2289 /*************************************************** 2290 * page management routines. 2291 ***************************************************/ 2292 2293 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2294 CTASSERT(_NPCM == 11); 2295 CTASSERT(_NPCPV == 336); 2296 2297 static __inline struct pv_chunk * 2298 pv_to_chunk(pv_entry_t pv) 2299 { 2300 2301 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2302 } 2303 2304 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2305 2306 #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2307 #define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2308 2309 static const uint32_t pc_freemask[_NPCM] = { 2310 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2311 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2312 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2313 PC_FREE0_9, PC_FREE10 2314 }; 2315 2316 #ifdef PV_STATS 2317 extern int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2318 extern long pv_entry_frees, pv_entry_allocs; 2319 extern int pv_entry_spare; 2320 #endif 2321 2322 /* 2323 * We are in a serious low memory condition. Resort to 2324 * drastic measures to free some pages so we can allocate 2325 * another pv entry chunk. 2326 */ 2327 static vm_page_t 2328 pmap_pv_reclaim(pmap_t locked_pmap) 2329 { 2330 struct pch newtail; 2331 struct pv_chunk *pc; 2332 struct md_page *pvh; 2333 pd_entry_t *pde; 2334 pmap_t pmap; 2335 pt_entry_t *pte, tpte; 2336 pv_entry_t pv; 2337 vm_offset_t va; 2338 vm_page_t m, m_pc; 2339 struct spglist free; 2340 uint32_t inuse; 2341 int bit, field, freed; 2342 2343 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2344 pmap = NULL; 2345 m_pc = NULL; 2346 SLIST_INIT(&free); 2347 TAILQ_INIT(&newtail); 2348 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2349 SLIST_EMPTY(&free))) { 2350 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2351 if (pmap != pc->pc_pmap) { 2352 if (pmap != NULL) { 2353 pmap_invalidate_all_int(pmap); 2354 if (pmap != locked_pmap) 2355 PMAP_UNLOCK(pmap); 2356 } 2357 pmap = pc->pc_pmap; 2358 /* Avoid deadlock and lock recursion. */ 2359 if (pmap > locked_pmap) 2360 PMAP_LOCK(pmap); 2361 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2362 pmap = NULL; 2363 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2364 continue; 2365 } 2366 } 2367 2368 /* 2369 * Destroy every non-wired, 4 KB page mapping in the chunk. 
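		 *
		 * Each pv_chunk tracks its free slots in the pc_map[] bitmap;
		 * inverting a word against pc_freemask[] yields the in-use
		 * entries, which bsfl() then peels off one bit at a time in
		 * the loop below.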
2370 */ 2371 freed = 0; 2372 for (field = 0; field < _NPCM; field++) { 2373 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2374 inuse != 0; inuse &= ~(1UL << bit)) { 2375 bit = bsfl(inuse); 2376 pv = &pc->pc_pventry[field * 32 + bit]; 2377 va = pv->pv_va; 2378 pde = pmap_pde(pmap, va); 2379 if ((*pde & PG_PS) != 0) 2380 continue; 2381 pte = __CONCAT(PMTYPE, pte)(pmap, va); 2382 tpte = *pte; 2383 if ((tpte & PG_W) == 0) 2384 tpte = pte_load_clear(pte); 2385 pmap_pte_release(pte); 2386 if ((tpte & PG_W) != 0) 2387 continue; 2388 KASSERT(tpte != 0, 2389 ("pmap_pv_reclaim: pmap %p va %x zero pte", 2390 pmap, va)); 2391 if ((tpte & PG_G) != 0) 2392 pmap_invalidate_page_int(pmap, va); 2393 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2394 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2395 vm_page_dirty(m); 2396 if ((tpte & PG_A) != 0) 2397 vm_page_aflag_set(m, PGA_REFERENCED); 2398 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2399 if (TAILQ_EMPTY(&m->md.pv_list) && 2400 (m->flags & PG_FICTITIOUS) == 0) { 2401 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2402 if (TAILQ_EMPTY(&pvh->pv_list)) { 2403 vm_page_aflag_clear(m, 2404 PGA_WRITEABLE); 2405 } 2406 } 2407 pc->pc_map[field] |= 1UL << bit; 2408 pmap_unuse_pt(pmap, va, &free); 2409 freed++; 2410 } 2411 } 2412 if (freed == 0) { 2413 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2414 continue; 2415 } 2416 /* Every freed mapping is for a 4 KB page. */ 2417 pmap->pm_stats.resident_count -= freed; 2418 PV_STAT(pv_entry_frees += freed); 2419 PV_STAT(pv_entry_spare += freed); 2420 pv_entry_count -= freed; 2421 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2422 for (field = 0; field < _NPCM; field++) 2423 if (pc->pc_map[field] != pc_freemask[field]) { 2424 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2425 pc_list); 2426 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2427 2428 /* 2429 * One freed pv entry in locked_pmap is 2430 * sufficient. 2431 */ 2432 if (pmap == locked_pmap) 2433 goto out; 2434 break; 2435 } 2436 if (field == _NPCM) { 2437 PV_STAT(pv_entry_spare -= _NPCPV); 2438 PV_STAT(pc_chunk_count--); 2439 PV_STAT(pc_chunk_frees++); 2440 /* Entire chunk is free; return it. */ 2441 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2442 pmap_qremove((vm_offset_t)pc, 1); 2443 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2444 break; 2445 } 2446 } 2447 out: 2448 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2449 if (pmap != NULL) { 2450 pmap_invalidate_all_int(pmap); 2451 if (pmap != locked_pmap) 2452 PMAP_UNLOCK(pmap); 2453 } 2454 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2455 m_pc = SLIST_FIRST(&free); 2456 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2457 /* Recycle a freed page table page. */ 2458 m_pc->ref_count = 1; 2459 } 2460 vm_page_free_pages_toq(&free, true); 2461 return (m_pc); 2462 } 2463 2464 /* 2465 * free the pv_entry back to the free list 2466 */ 2467 static void 2468 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2469 { 2470 struct pv_chunk *pc; 2471 int idx, field, bit; 2472 2473 rw_assert(&pvh_global_lock, RA_WLOCKED); 2474 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2475 PV_STAT(pv_entry_frees++); 2476 PV_STAT(pv_entry_spare++); 2477 pv_entry_count--; 2478 pc = pv_to_chunk(pv); 2479 idx = pv - &pc->pc_pventry[0]; 2480 field = idx / 32; 2481 bit = idx % 32; 2482 pc->pc_map[field] |= 1ul << bit; 2483 for (idx = 0; idx < _NPCM; idx++) 2484 if (pc->pc_map[idx] != pc_freemask[idx]) { 2485 /* 2486 * 98% of the time, pc is already at the head of the 2487 * list. If it isn't already, move it to the head. 
2488 */ 2489 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2490 pc)) { 2491 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2492 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2493 pc_list); 2494 } 2495 return; 2496 } 2497 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2498 free_pv_chunk(pc); 2499 } 2500 2501 static void 2502 free_pv_chunk(struct pv_chunk *pc) 2503 { 2504 vm_page_t m; 2505 2506 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2507 PV_STAT(pv_entry_spare -= _NPCPV); 2508 PV_STAT(pc_chunk_count--); 2509 PV_STAT(pc_chunk_frees++); 2510 /* entire chunk is free, return it */ 2511 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2512 pmap_qremove((vm_offset_t)pc, 1); 2513 vm_page_unwire_noq(m); 2514 vm_page_free(m); 2515 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2516 } 2517 2518 /* 2519 * get a new pv_entry, allocating a block from the system 2520 * when needed. 2521 */ 2522 static pv_entry_t 2523 get_pv_entry(pmap_t pmap, boolean_t try) 2524 { 2525 static const struct timeval printinterval = { 60, 0 }; 2526 static struct timeval lastprint; 2527 int bit, field; 2528 pv_entry_t pv; 2529 struct pv_chunk *pc; 2530 vm_page_t m; 2531 2532 rw_assert(&pvh_global_lock, RA_WLOCKED); 2533 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2534 PV_STAT(pv_entry_allocs++); 2535 pv_entry_count++; 2536 if (pv_entry_count > pv_entry_high_water) 2537 if (ratecheck(&lastprint, &printinterval)) 2538 printf("Approaching the limit on PV entries, consider " 2539 "increasing either the vm.pmap.shpgperproc or the " 2540 "vm.pmap.pv_entries tunable.\n"); 2541 retry: 2542 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2543 if (pc != NULL) { 2544 for (field = 0; field < _NPCM; field++) { 2545 if (pc->pc_map[field]) { 2546 bit = bsfl(pc->pc_map[field]); 2547 break; 2548 } 2549 } 2550 if (field < _NPCM) { 2551 pv = &pc->pc_pventry[field * 32 + bit]; 2552 pc->pc_map[field] &= ~(1ul << bit); 2553 /* If this was the last item, move it to tail */ 2554 for (field = 0; field < _NPCM; field++) 2555 if (pc->pc_map[field] != 0) { 2556 PV_STAT(pv_entry_spare--); 2557 return (pv); /* not full, return */ 2558 } 2559 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2560 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2561 PV_STAT(pv_entry_spare--); 2562 return (pv); 2563 } 2564 } 2565 /* 2566 * Access to the ptelist "pv_vafree" is synchronized by the pvh 2567 * global lock. If "pv_vafree" is currently non-empty, it will 2568 * remain non-empty until pmap_ptelist_alloc() completes. 
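	 *
	 * A new chunk consumes one physical page plus one page of KVA from
	 * "pv_vafree". Per the CTASSERTs above, each chunk provides _NPCPV
	 * (336) pv entries tracked by _NPCM (11) bitmap words; bit 0 of
	 * pc_map[0] is consumed immediately for the entry returned here.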
2569 */ 2570 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2571 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2572 if (try) { 2573 pv_entry_count--; 2574 PV_STAT(pc_chunk_tryfail++); 2575 return (NULL); 2576 } 2577 m = pmap_pv_reclaim(pmap); 2578 if (m == NULL) 2579 goto retry; 2580 } 2581 PV_STAT(pc_chunk_count++); 2582 PV_STAT(pc_chunk_allocs++); 2583 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2584 pmap_qenter((vm_offset_t)pc, &m, 1); 2585 pc->pc_pmap = pmap; 2586 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2587 for (field = 1; field < _NPCM; field++) 2588 pc->pc_map[field] = pc_freemask[field]; 2589 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2590 pv = &pc->pc_pventry[0]; 2591 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2592 PV_STAT(pv_entry_spare += _NPCPV - 1); 2593 return (pv); 2594 } 2595 2596 static __inline pv_entry_t 2597 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2598 { 2599 pv_entry_t pv; 2600 2601 rw_assert(&pvh_global_lock, RA_WLOCKED); 2602 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2603 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2604 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2605 break; 2606 } 2607 } 2608 return (pv); 2609 } 2610 2611 static void 2612 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2613 { 2614 struct md_page *pvh; 2615 pv_entry_t pv; 2616 vm_offset_t va_last; 2617 vm_page_t m; 2618 2619 rw_assert(&pvh_global_lock, RA_WLOCKED); 2620 KASSERT((pa & PDRMASK) == 0, 2621 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2622 2623 /* 2624 * Transfer the 4mpage's pv entry for this mapping to the first 2625 * page's pv list. 2626 */ 2627 pvh = pa_to_pvh(pa); 2628 va = trunc_4mpage(va); 2629 pv = pmap_pvh_remove(pvh, pmap, va); 2630 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2631 m = PHYS_TO_VM_PAGE(pa); 2632 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2633 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2634 va_last = va + NBPDR - PAGE_SIZE; 2635 do { 2636 m++; 2637 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2638 ("pmap_pv_demote_pde: page %p is not managed", m)); 2639 va += PAGE_SIZE; 2640 pmap_insert_entry(pmap, va, m); 2641 } while (va < va_last); 2642 } 2643 2644 #if VM_NRESERVLEVEL > 0 2645 static void 2646 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2647 { 2648 struct md_page *pvh; 2649 pv_entry_t pv; 2650 vm_offset_t va_last; 2651 vm_page_t m; 2652 2653 rw_assert(&pvh_global_lock, RA_WLOCKED); 2654 KASSERT((pa & PDRMASK) == 0, 2655 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2656 2657 /* 2658 * Transfer the first page's pv entry for this mapping to the 2659 * 4mpage's pv list. Aside from avoiding the cost of a call 2660 * to get_pv_entry(), a transfer avoids the possibility that 2661 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2662 * removes one of the mappings that is being promoted. 2663 */ 2664 m = PHYS_TO_VM_PAGE(pa); 2665 va = trunc_4mpage(va); 2666 pv = pmap_pvh_remove(&m->md, pmap, va); 2667 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2668 pvh = pa_to_pvh(pa); 2669 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2670 /* Free the remaining NPTEPG - 1 pv entries. 
*/ 2671 va_last = va + NBPDR - PAGE_SIZE; 2672 do { 2673 m++; 2674 va += PAGE_SIZE; 2675 pmap_pvh_free(&m->md, pmap, va); 2676 } while (va < va_last); 2677 } 2678 #endif /* VM_NRESERVLEVEL > 0 */ 2679 2680 static void 2681 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2682 { 2683 pv_entry_t pv; 2684 2685 pv = pmap_pvh_remove(pvh, pmap, va); 2686 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2687 free_pv_entry(pmap, pv); 2688 } 2689 2690 static void 2691 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2692 { 2693 struct md_page *pvh; 2694 2695 rw_assert(&pvh_global_lock, RA_WLOCKED); 2696 pmap_pvh_free(&m->md, pmap, va); 2697 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 2698 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2699 if (TAILQ_EMPTY(&pvh->pv_list)) 2700 vm_page_aflag_clear(m, PGA_WRITEABLE); 2701 } 2702 } 2703 2704 /* 2705 * Create a pv entry for page at pa for 2706 * (pmap, va). 2707 */ 2708 static void 2709 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2710 { 2711 pv_entry_t pv; 2712 2713 rw_assert(&pvh_global_lock, RA_WLOCKED); 2714 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2715 pv = get_pv_entry(pmap, FALSE); 2716 pv->pv_va = va; 2717 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2718 } 2719 2720 /* 2721 * Conditionally create a pv entry. 2722 */ 2723 static boolean_t 2724 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2725 { 2726 pv_entry_t pv; 2727 2728 rw_assert(&pvh_global_lock, RA_WLOCKED); 2729 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2730 if (pv_entry_count < pv_entry_high_water && 2731 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2732 pv->pv_va = va; 2733 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2734 return (TRUE); 2735 } else 2736 return (FALSE); 2737 } 2738 2739 /* 2740 * Create the pv entries for each of the pages within a superpage. 2741 */ 2742 static bool 2743 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags) 2744 { 2745 struct md_page *pvh; 2746 pv_entry_t pv; 2747 bool noreclaim; 2748 2749 rw_assert(&pvh_global_lock, RA_WLOCKED); 2750 noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; 2751 if ((noreclaim && pv_entry_count >= pv_entry_high_water) || 2752 (pv = get_pv_entry(pmap, noreclaim)) == NULL) 2753 return (false); 2754 pv->pv_va = va; 2755 pvh = pa_to_pvh(pde & PG_PS_FRAME); 2756 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2757 return (true); 2758 } 2759 2760 /* 2761 * Fills a page table page with mappings to consecutive physical pages. 2762 */ 2763 static void 2764 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2765 { 2766 pt_entry_t *pte; 2767 2768 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2769 *pte = newpte; 2770 newpte += PAGE_SIZE; 2771 } 2772 } 2773 2774 /* 2775 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2776 * 2- or 4MB page mapping is invalidated. 
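 *
 * Roughly, demotion replaces the single PG_PS PDE with a reference to a page
 * table page holding NPTEPG 4KB PTEs that reproduce the superpage's frame,
 * protection, and cache attributes. The page table page is either recovered
 * from the pmap's collection of idle page table pages (where a promotion
 * saved it) or freshly allocated; if the mapping was never accessed or the
 * allocation fails, the whole mapping is removed instead.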
2777 */ 2778 static boolean_t 2779 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2780 { 2781 pd_entry_t newpde, oldpde; 2782 pt_entry_t *firstpte, newpte; 2783 vm_paddr_t mptepa; 2784 vm_page_t mpte; 2785 struct spglist free; 2786 vm_offset_t sva; 2787 2788 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2789 oldpde = *pde; 2790 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2791 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2792 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2793 NULL) { 2794 KASSERT((oldpde & PG_W) == 0, 2795 ("pmap_demote_pde: page table page for a wired mapping" 2796 " is missing")); 2797 2798 /* 2799 * Invalidate the 2- or 4MB page mapping and return 2800 * "failure" if the mapping was never accessed or the 2801 * allocation of the new page table page fails. 2802 */ 2803 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2804 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | 2805 VM_ALLOC_WIRED)) == NULL) { 2806 SLIST_INIT(&free); 2807 sva = trunc_4mpage(va); 2808 pmap_remove_pde(pmap, pde, sva, &free); 2809 if ((oldpde & PG_G) == 0) 2810 pmap_invalidate_pde_page(pmap, sva, oldpde); 2811 vm_page_free_pages_toq(&free, true); 2812 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2813 " in pmap %p", va, pmap); 2814 return (FALSE); 2815 } 2816 if (pmap != kernel_pmap) { 2817 mpte->ref_count = NPTEPG; 2818 pmap->pm_stats.resident_count++; 2819 } 2820 } 2821 mptepa = VM_PAGE_TO_PHYS(mpte); 2822 2823 /* 2824 * If the page mapping is in the kernel's address space, then the 2825 * KPTmap can provide access to the page table page. Otherwise, 2826 * temporarily map the page table page (mpte) into the kernel's 2827 * address space at either PADDR1 or PADDR2. 2828 */ 2829 if (pmap == kernel_pmap) 2830 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2831 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 2832 if ((*PMAP1 & PG_FRAME) != mptepa) { 2833 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2834 #ifdef SMP 2835 PMAP1cpu = PCPU_GET(cpuid); 2836 #endif 2837 invlcaddr(PADDR1); 2838 PMAP1changed++; 2839 } else 2840 #ifdef SMP 2841 if (PMAP1cpu != PCPU_GET(cpuid)) { 2842 PMAP1cpu = PCPU_GET(cpuid); 2843 invlcaddr(PADDR1); 2844 PMAP1changedcpu++; 2845 } else 2846 #endif 2847 PMAP1unchanged++; 2848 firstpte = PADDR1; 2849 } else { 2850 mtx_lock(&PMAP2mutex); 2851 if ((*PMAP2 & PG_FRAME) != mptepa) { 2852 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2853 pmap_invalidate_page_int(kernel_pmap, 2854 (vm_offset_t)PADDR2); 2855 } 2856 firstpte = PADDR2; 2857 } 2858 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2859 KASSERT((oldpde & PG_A) != 0, 2860 ("pmap_demote_pde: oldpde is missing PG_A")); 2861 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2862 ("pmap_demote_pde: oldpde is missing PG_M")); 2863 newpte = oldpde & ~PG_PS; 2864 if ((newpte & PG_PDE_PAT) != 0) 2865 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2866 2867 /* 2868 * If the page table page is not leftover from an earlier promotion, 2869 * initialize it. 2870 */ 2871 if (mpte->valid == 0) 2872 pmap_fill_ptp(firstpte, newpte); 2873 2874 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2875 ("pmap_demote_pde: firstpte and newpte map different physical" 2876 " addresses")); 2877 2878 /* 2879 * If the mapping has changed attributes, update the page table 2880 * entries. 2881 */ 2882 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2883 pmap_fill_ptp(firstpte, newpte); 2884 2885 /* 2886 * Demote the mapping. This pmap is locked. 
The old PDE has 2887 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2888 * set. Thus, there is no danger of a race with another 2889 * processor changing the setting of PG_A and/or PG_M between 2890 * the read above and the store below. 2891 */ 2892 if (workaround_erratum383) 2893 pmap_update_pde(pmap, va, pde, newpde); 2894 else if (pmap == kernel_pmap) 2895 pmap_kenter_pde(va, newpde); 2896 else 2897 pde_store(pde, newpde); 2898 if (firstpte == PADDR2) 2899 mtx_unlock(&PMAP2mutex); 2900 2901 /* 2902 * Invalidate the recursive mapping of the page table page. 2903 */ 2904 pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); 2905 2906 /* 2907 * Demote the pv entry. This depends on the earlier demotion 2908 * of the mapping. Specifically, the (re)creation of a per- 2909 * page pv entry might trigger the execution of pmap_collect(), 2910 * which might reclaim a newly (re)created per-page pv entry 2911 * and destroy the associated mapping. In order to destroy 2912 * the mapping, the PDE must have already changed from mapping 2913 * the 2mpage to referencing the page table page. 2914 */ 2915 if ((oldpde & PG_MANAGED) != 0) 2916 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2917 2918 pmap_pde_demotions++; 2919 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" 2920 " in pmap %p", va, pmap); 2921 return (TRUE); 2922 } 2923 2924 /* 2925 * Removes a 2- or 4MB page mapping from the kernel pmap. 2926 */ 2927 static void 2928 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2929 { 2930 pd_entry_t newpde; 2931 vm_paddr_t mptepa; 2932 vm_page_t mpte; 2933 2934 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2935 mpte = pmap_remove_pt_page(pmap, va); 2936 if (mpte == NULL) 2937 panic("pmap_remove_kernel_pde: Missing pt page."); 2938 2939 mptepa = VM_PAGE_TO_PHYS(mpte); 2940 newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; 2941 2942 /* 2943 * If this page table page was unmapped by a promotion, then it 2944 * contains valid mappings. Zero it to invalidate those mappings. 2945 */ 2946 if (mpte->valid != 0) 2947 pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); 2948 2949 /* 2950 * Remove the mapping. 2951 */ 2952 if (workaround_erratum383) 2953 pmap_update_pde(pmap, va, pde, newpde); 2954 else 2955 pmap_kenter_pde(va, newpde); 2956 2957 /* 2958 * Invalidate the recursive mapping of the page table page. 2959 */ 2960 pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); 2961 } 2962 2963 /* 2964 * pmap_remove_pde: do the things to unmap a superpage in a process 2965 */ 2966 static void 2967 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2968 struct spglist *free) 2969 { 2970 struct md_page *pvh; 2971 pd_entry_t oldpde; 2972 vm_offset_t eva, va; 2973 vm_page_t m, mpte; 2974 2975 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2976 KASSERT((sva & PDRMASK) == 0, 2977 ("pmap_remove_pde: sva is not 4mpage aligned")); 2978 oldpde = pte_load_clear(pdq); 2979 if (oldpde & PG_W) 2980 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2981 2982 /* 2983 * Machines that don't support invlpg, also don't support 2984 * PG_G. 
2985 */ 2986 if ((oldpde & PG_G) != 0) 2987 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 2988 2989 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2990 if (oldpde & PG_MANAGED) { 2991 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2992 pmap_pvh_free(pvh, pmap, sva); 2993 eva = sva + NBPDR; 2994 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2995 va < eva; va += PAGE_SIZE, m++) { 2996 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2997 vm_page_dirty(m); 2998 if (oldpde & PG_A) 2999 vm_page_aflag_set(m, PGA_REFERENCED); 3000 if (TAILQ_EMPTY(&m->md.pv_list) && 3001 TAILQ_EMPTY(&pvh->pv_list)) 3002 vm_page_aflag_clear(m, PGA_WRITEABLE); 3003 } 3004 } 3005 if (pmap == kernel_pmap) { 3006 pmap_remove_kernel_pde(pmap, pdq, sva); 3007 } else { 3008 mpte = pmap_remove_pt_page(pmap, sva); 3009 if (mpte != NULL) { 3010 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 3011 ("pmap_remove_pde: pte page not promoted")); 3012 pmap->pm_stats.resident_count--; 3013 KASSERT(mpte->ref_count == NPTEPG, 3014 ("pmap_remove_pde: pte page ref count error")); 3015 mpte->ref_count = 0; 3016 pmap_add_delayed_free_list(mpte, free, FALSE); 3017 } 3018 } 3019 } 3020 3021 /* 3022 * pmap_remove_pte: do the things to unmap a page in a process 3023 */ 3024 static int 3025 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 3026 struct spglist *free) 3027 { 3028 pt_entry_t oldpte; 3029 vm_page_t m; 3030 3031 rw_assert(&pvh_global_lock, RA_WLOCKED); 3032 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3033 oldpte = pte_load_clear(ptq); 3034 KASSERT(oldpte != 0, 3035 ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); 3036 if (oldpte & PG_W) 3037 pmap->pm_stats.wired_count -= 1; 3038 /* 3039 * Machines that don't support invlpg, also don't support 3040 * PG_G. 3041 */ 3042 if (oldpte & PG_G) 3043 pmap_invalidate_page_int(kernel_pmap, va); 3044 pmap->pm_stats.resident_count -= 1; 3045 if (oldpte & PG_MANAGED) { 3046 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 3047 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3048 vm_page_dirty(m); 3049 if (oldpte & PG_A) 3050 vm_page_aflag_set(m, PGA_REFERENCED); 3051 pmap_remove_entry(pmap, m, va); 3052 } 3053 return (pmap_unuse_pt(pmap, va, free)); 3054 } 3055 3056 /* 3057 * Remove a single page from a process address space 3058 */ 3059 static void 3060 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 3061 { 3062 pt_entry_t *pte; 3063 3064 rw_assert(&pvh_global_lock, RA_WLOCKED); 3065 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 3066 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3067 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 3068 return; 3069 pmap_remove_pte(pmap, pte, va, free); 3070 pmap_invalidate_page_int(pmap, va); 3071 } 3072 3073 /* 3074 * Removes the specified range of addresses from the page table page. 3075 */ 3076 static bool 3077 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3078 struct spglist *free) 3079 { 3080 pt_entry_t *pte; 3081 bool anyvalid; 3082 3083 rw_assert(&pvh_global_lock, RA_WLOCKED); 3084 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 3085 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3086 anyvalid = false; 3087 for (pte = pmap_pte_quick(pmap, sva); sva != eva; pte++, 3088 sva += PAGE_SIZE) { 3089 if (*pte == 0) 3090 continue; 3091 3092 /* 3093 * The TLB entry for a PG_G mapping is invalidated by 3094 * pmap_remove_pte(). 
3095 */ 3096 if ((*pte & PG_G) == 0) 3097 anyvalid = true; 3098 3099 if (pmap_remove_pte(pmap, pte, sva, free)) 3100 break; 3101 } 3102 return (anyvalid); 3103 } 3104 3105 /* 3106 * Remove the given range of addresses from the specified map. 3107 * 3108 * It is assumed that the start and end are properly 3109 * rounded to the page size. 3110 */ 3111 static void 3112 __CONCAT(PMTYPE, remove)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3113 { 3114 vm_offset_t pdnxt; 3115 pd_entry_t ptpaddr; 3116 struct spglist free; 3117 int anyvalid; 3118 3119 /* 3120 * Perform an unsynchronized read. This is, however, safe. 3121 */ 3122 if (pmap->pm_stats.resident_count == 0) 3123 return; 3124 3125 anyvalid = 0; 3126 SLIST_INIT(&free); 3127 3128 rw_wlock(&pvh_global_lock); 3129 sched_pin(); 3130 PMAP_LOCK(pmap); 3131 3132 /* 3133 * special handling of removing one page. a very 3134 * common operation and easy to short circuit some 3135 * code. 3136 */ 3137 if ((sva + PAGE_SIZE == eva) && 3138 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 3139 pmap_remove_page(pmap, sva, &free); 3140 goto out; 3141 } 3142 3143 for (; sva < eva; sva = pdnxt) { 3144 u_int pdirindex; 3145 3146 /* 3147 * Calculate index for next page table. 3148 */ 3149 pdnxt = (sva + NBPDR) & ~PDRMASK; 3150 if (pdnxt < sva) 3151 pdnxt = eva; 3152 if (pmap->pm_stats.resident_count == 0) 3153 break; 3154 3155 pdirindex = sva >> PDRSHIFT; 3156 ptpaddr = pmap->pm_pdir[pdirindex]; 3157 3158 /* 3159 * Weed out invalid mappings. Note: we assume that the page 3160 * directory table is always allocated, and in kernel virtual. 3161 */ 3162 if (ptpaddr == 0) 3163 continue; 3164 3165 /* 3166 * Check for large page. 3167 */ 3168 if ((ptpaddr & PG_PS) != 0) { 3169 /* 3170 * Are we removing the entire large page? If not, 3171 * demote the mapping and fall through. 3172 */ 3173 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3174 /* 3175 * The TLB entry for a PG_G mapping is 3176 * invalidated by pmap_remove_pde(). 3177 */ 3178 if ((ptpaddr & PG_G) == 0) 3179 anyvalid = 1; 3180 pmap_remove_pde(pmap, 3181 &pmap->pm_pdir[pdirindex], sva, &free); 3182 continue; 3183 } else if (!pmap_demote_pde(pmap, 3184 &pmap->pm_pdir[pdirindex], sva)) { 3185 /* The large page mapping was destroyed. */ 3186 continue; 3187 } 3188 } 3189 3190 /* 3191 * Limit our scan to either the end of the va represented 3192 * by the current page table page, or to the end of the 3193 * range being removed. 3194 */ 3195 if (pdnxt > eva) 3196 pdnxt = eva; 3197 3198 if (pmap_remove_ptes(pmap, sva, pdnxt, &free)) 3199 anyvalid = 1; 3200 } 3201 out: 3202 sched_unpin(); 3203 if (anyvalid) 3204 pmap_invalidate_all_int(pmap); 3205 rw_wunlock(&pvh_global_lock); 3206 PMAP_UNLOCK(pmap); 3207 vm_page_free_pages_toq(&free, true); 3208 } 3209 3210 /* 3211 * Routine: pmap_remove_all 3212 * Function: 3213 * Removes this physical page from 3214 * all physical maps in which it resides. 3215 * Reflects back modify bits to the pager. 3216 * 3217 * Notes: 3218 * Original versions of this routine were very 3219 * inefficient because they iteratively called 3220 * pmap_remove (slow...) 
3221 */ 3222 3223 static void 3224 __CONCAT(PMTYPE, remove_all)(vm_page_t m) 3225 { 3226 struct md_page *pvh; 3227 pv_entry_t pv; 3228 pmap_t pmap; 3229 pt_entry_t *pte, tpte; 3230 pd_entry_t *pde; 3231 vm_offset_t va; 3232 struct spglist free; 3233 3234 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3235 ("pmap_remove_all: page %p is not managed", m)); 3236 SLIST_INIT(&free); 3237 rw_wlock(&pvh_global_lock); 3238 sched_pin(); 3239 if ((m->flags & PG_FICTITIOUS) != 0) 3240 goto small_mappings; 3241 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3242 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3243 va = pv->pv_va; 3244 pmap = PV_PMAP(pv); 3245 PMAP_LOCK(pmap); 3246 pde = pmap_pde(pmap, va); 3247 (void)pmap_demote_pde(pmap, pde, va); 3248 PMAP_UNLOCK(pmap); 3249 } 3250 small_mappings: 3251 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3252 pmap = PV_PMAP(pv); 3253 PMAP_LOCK(pmap); 3254 pmap->pm_stats.resident_count--; 3255 pde = pmap_pde(pmap, pv->pv_va); 3256 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3257 " a 4mpage in page %p's pv list", m)); 3258 pte = pmap_pte_quick(pmap, pv->pv_va); 3259 tpte = pte_load_clear(pte); 3260 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", 3261 pmap, pv->pv_va)); 3262 if (tpte & PG_W) 3263 pmap->pm_stats.wired_count--; 3264 if (tpte & PG_A) 3265 vm_page_aflag_set(m, PGA_REFERENCED); 3266 3267 /* 3268 * Update the vm_page_t clean and reference bits. 3269 */ 3270 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3271 vm_page_dirty(m); 3272 pmap_unuse_pt(pmap, pv->pv_va, &free); 3273 pmap_invalidate_page_int(pmap, pv->pv_va); 3274 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3275 free_pv_entry(pmap, pv); 3276 PMAP_UNLOCK(pmap); 3277 } 3278 vm_page_aflag_clear(m, PGA_WRITEABLE); 3279 sched_unpin(); 3280 rw_wunlock(&pvh_global_lock); 3281 vm_page_free_pages_toq(&free, true); 3282 } 3283 3284 /* 3285 * pmap_protect_pde: do the things to protect a 4mpage in a process 3286 */ 3287 static boolean_t 3288 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3289 { 3290 pd_entry_t newpde, oldpde; 3291 vm_page_t m, mt; 3292 boolean_t anychanged; 3293 3294 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3295 KASSERT((sva & PDRMASK) == 0, 3296 ("pmap_protect_pde: sva is not 4mpage aligned")); 3297 anychanged = FALSE; 3298 retry: 3299 oldpde = newpde = *pde; 3300 if ((prot & VM_PROT_WRITE) == 0) { 3301 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 3302 (PG_MANAGED | PG_M | PG_RW)) { 3303 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3304 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 3305 vm_page_dirty(mt); 3306 } 3307 newpde &= ~(PG_RW | PG_M); 3308 } 3309 #ifdef PMAP_PAE_COMP 3310 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 3311 newpde |= pg_nx; 3312 #endif 3313 if (newpde != oldpde) { 3314 /* 3315 * As an optimization to future operations on this PDE, clear 3316 * PG_PROMOTED. The impending invalidation will remove any 3317 * lingering 4KB page mappings from the TLB. 3318 */ 3319 if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) 3320 goto retry; 3321 if ((oldpde & PG_G) != 0) 3322 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 3323 else 3324 anychanged = TRUE; 3325 } 3326 return (anychanged); 3327 } 3328 3329 /* 3330 * Set the physical protection on the 3331 * specified range of this map as requested. 
3332 */ 3333 static void 3334 __CONCAT(PMTYPE, protect)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3335 vm_prot_t prot) 3336 { 3337 vm_offset_t pdnxt; 3338 pd_entry_t ptpaddr; 3339 pt_entry_t *pte; 3340 boolean_t anychanged, pv_lists_locked; 3341 3342 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3343 if (prot == VM_PROT_NONE) { 3344 pmap_remove(pmap, sva, eva); 3345 return; 3346 } 3347 3348 #ifdef PMAP_PAE_COMP 3349 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 3350 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 3351 return; 3352 #else 3353 if (prot & VM_PROT_WRITE) 3354 return; 3355 #endif 3356 3357 if (pmap_is_current(pmap)) 3358 pv_lists_locked = FALSE; 3359 else { 3360 pv_lists_locked = TRUE; 3361 resume: 3362 rw_wlock(&pvh_global_lock); 3363 sched_pin(); 3364 } 3365 anychanged = FALSE; 3366 3367 PMAP_LOCK(pmap); 3368 for (; sva < eva; sva = pdnxt) { 3369 pt_entry_t obits, pbits; 3370 u_int pdirindex; 3371 3372 pdnxt = (sva + NBPDR) & ~PDRMASK; 3373 if (pdnxt < sva) 3374 pdnxt = eva; 3375 3376 pdirindex = sva >> PDRSHIFT; 3377 ptpaddr = pmap->pm_pdir[pdirindex]; 3378 3379 /* 3380 * Weed out invalid mappings. Note: we assume that the page 3381 * directory table is always allocated, and in kernel virtual. 3382 */ 3383 if (ptpaddr == 0) 3384 continue; 3385 3386 /* 3387 * Check for large page. 3388 */ 3389 if ((ptpaddr & PG_PS) != 0) { 3390 /* 3391 * Are we protecting the entire large page? If not, 3392 * demote the mapping and fall through. 3393 */ 3394 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3395 /* 3396 * The TLB entry for a PG_G mapping is 3397 * invalidated by pmap_protect_pde(). 3398 */ 3399 if (pmap_protect_pde(pmap, 3400 &pmap->pm_pdir[pdirindex], sva, prot)) 3401 anychanged = TRUE; 3402 continue; 3403 } else { 3404 if (!pv_lists_locked) { 3405 pv_lists_locked = TRUE; 3406 if (!rw_try_wlock(&pvh_global_lock)) { 3407 if (anychanged) 3408 pmap_invalidate_all_int( 3409 pmap); 3410 PMAP_UNLOCK(pmap); 3411 goto resume; 3412 } 3413 sched_pin(); 3414 } 3415 if (!pmap_demote_pde(pmap, 3416 &pmap->pm_pdir[pdirindex], sva)) { 3417 /* 3418 * The large page mapping was 3419 * destroyed. 3420 */ 3421 continue; 3422 } 3423 } 3424 } 3425 3426 if (pdnxt > eva) 3427 pdnxt = eva; 3428 3429 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3430 sva += PAGE_SIZE) { 3431 vm_page_t m; 3432 3433 retry: 3434 /* 3435 * Regardless of whether a pte is 32 or 64 bits in 3436 * size, PG_RW, PG_A, and PG_M are among the least 3437 * significant 32 bits. 
3438 */ 3439 obits = pbits = *pte; 3440 if ((pbits & PG_V) == 0) 3441 continue; 3442 3443 if ((prot & VM_PROT_WRITE) == 0) { 3444 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3445 (PG_MANAGED | PG_M | PG_RW)) { 3446 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3447 vm_page_dirty(m); 3448 } 3449 pbits &= ~(PG_RW | PG_M); 3450 } 3451 #ifdef PMAP_PAE_COMP 3452 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 3453 pbits |= pg_nx; 3454 #endif 3455 3456 if (pbits != obits) { 3457 #ifdef PMAP_PAE_COMP 3458 if (!atomic_cmpset_64(pte, obits, pbits)) 3459 goto retry; 3460 #else 3461 if (!atomic_cmpset_int((u_int *)pte, obits, 3462 pbits)) 3463 goto retry; 3464 #endif 3465 if (obits & PG_G) 3466 pmap_invalidate_page_int(pmap, sva); 3467 else 3468 anychanged = TRUE; 3469 } 3470 } 3471 } 3472 if (anychanged) 3473 pmap_invalidate_all_int(pmap); 3474 if (pv_lists_locked) { 3475 sched_unpin(); 3476 rw_wunlock(&pvh_global_lock); 3477 } 3478 PMAP_UNLOCK(pmap); 3479 } 3480 3481 #if VM_NRESERVLEVEL > 0 3482 /* 3483 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3484 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 3485 * For promotion to occur, two conditions must be met: (1) the 4KB page 3486 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3487 * mappings must have identical characteristics. 3488 * 3489 * Managed (PG_MANAGED) mappings within the kernel address space are not 3490 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3491 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3492 * pmap. 3493 */ 3494 static void 3495 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3496 { 3497 pd_entry_t newpde; 3498 pt_entry_t *firstpte, oldpte, pa, *pte; 3499 vm_offset_t oldpteva; 3500 vm_page_t mpte; 3501 3502 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3503 3504 /* 3505 * Examine the first PTE in the specified PTP. Abort if this PTE is 3506 * either invalid, unused, or does not map the first 4KB physical page 3507 * within a 2- or 4MB page. 3508 */ 3509 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3510 setpde: 3511 newpde = *firstpte; 3512 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3513 pmap_pde_p_failures++; 3514 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3515 " in pmap %p", va, pmap); 3516 return; 3517 } 3518 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3519 pmap_pde_p_failures++; 3520 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3521 " in pmap %p", va, pmap); 3522 return; 3523 } 3524 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3525 /* 3526 * When PG_M is already clear, PG_RW can be cleared without 3527 * a TLB invalidation. 3528 */ 3529 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3530 ~PG_RW)) 3531 goto setpde; 3532 newpde &= ~PG_RW; 3533 } 3534 3535 /* 3536 * Examine each of the other PTEs in the specified PTP. Abort if this 3537 * PTE maps an unexpected 4KB physical page or does not have identical 3538 * characteristics to the first PTE. 
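	 *
	 * The loop below walks the page table page backwards. "pa" is
	 * seeded with the frame expected in the last PTE, plus the required
	 * PG_A and PG_V bits, and is decremented by PAGE_SIZE on each step,
	 * so a single comparison checks contiguity and the accessed/valid
	 * bits at once.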
3539 */ 3540 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3541 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3542 setpte: 3543 oldpte = *pte; 3544 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3545 pmap_pde_p_failures++; 3546 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3547 " in pmap %p", va, pmap); 3548 return; 3549 } 3550 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3551 /* 3552 * When PG_M is already clear, PG_RW can be cleared 3553 * without a TLB invalidation. 3554 */ 3555 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3556 oldpte & ~PG_RW)) 3557 goto setpte; 3558 oldpte &= ~PG_RW; 3559 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3560 (va & ~PDRMASK); 3561 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3562 " in pmap %p", oldpteva, pmap); 3563 } 3564 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3565 pmap_pde_p_failures++; 3566 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3567 " in pmap %p", va, pmap); 3568 return; 3569 } 3570 pa -= PAGE_SIZE; 3571 } 3572 3573 /* 3574 * Save the page table page in its current state until the PDE 3575 * mapping the superpage is demoted by pmap_demote_pde() or 3576 * destroyed by pmap_remove_pde(). 3577 */ 3578 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3579 KASSERT(mpte >= vm_page_array && 3580 mpte < &vm_page_array[vm_page_array_size], 3581 ("pmap_promote_pde: page table page is out of range")); 3582 KASSERT(mpte->pindex == va >> PDRSHIFT, 3583 ("pmap_promote_pde: page table page's pindex is wrong")); 3584 if (pmap_insert_pt_page(pmap, mpte, true)) { 3585 pmap_pde_p_failures++; 3586 CTR2(KTR_PMAP, 3587 "pmap_promote_pde: failure for va %#x in pmap %p", va, 3588 pmap); 3589 return; 3590 } 3591 3592 /* 3593 * Promote the pv entries. 3594 */ 3595 if ((newpde & PG_MANAGED) != 0) 3596 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3597 3598 /* 3599 * Propagate the PAT index to its proper position. 3600 */ 3601 if ((newpde & PG_PTE_PAT) != 0) 3602 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3603 3604 /* 3605 * Map the superpage. 3606 */ 3607 if (workaround_erratum383) 3608 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3609 else if (pmap == kernel_pmap) 3610 pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); 3611 else 3612 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 3613 3614 pmap_pde_promotions++; 3615 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3616 " in pmap %p", va, pmap); 3617 } 3618 #endif /* VM_NRESERVLEVEL > 0 */ 3619 3620 /* 3621 * Insert the given physical page (p) at 3622 * the specified virtual address (v) in the 3623 * target physical map with the protection requested. 3624 * 3625 * If specified, the page will be wired down, meaning 3626 * that the related pte can not be reclaimed. 3627 * 3628 * NB: This is the only routine which MAY NOT lazy-evaluate 3629 * or lose information. That is, this routine must actually 3630 * insert this page into the given map NOW. 
3631 */ 3632 static int 3633 __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m, 3634 vm_prot_t prot, u_int flags, int8_t psind) 3635 { 3636 pd_entry_t *pde; 3637 pt_entry_t *pte; 3638 pt_entry_t newpte, origpte; 3639 pv_entry_t pv; 3640 vm_paddr_t opa, pa; 3641 vm_page_t mpte, om; 3642 int rv; 3643 3644 va = trunc_page(va); 3645 KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) || 3646 (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS), 3647 ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va)); 3648 KASSERT(va < PMAP_TRM_MIN_ADDRESS, 3649 ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)", 3650 va)); 3651 KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 || 3652 va < kmi.clean_sva || va >= kmi.clean_eva, 3653 ("pmap_enter: managed mapping within the clean submap")); 3654 if ((m->oflags & VPO_UNMANAGED) == 0) 3655 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3656 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 3657 ("pmap_enter: flags %u has reserved bits set", flags)); 3658 pa = VM_PAGE_TO_PHYS(m); 3659 newpte = (pt_entry_t)(pa | PG_A | PG_V); 3660 if ((flags & VM_PROT_WRITE) != 0) 3661 newpte |= PG_M; 3662 if ((prot & VM_PROT_WRITE) != 0) 3663 newpte |= PG_RW; 3664 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 3665 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 3666 #ifdef PMAP_PAE_COMP 3667 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 3668 newpte |= pg_nx; 3669 #endif 3670 if ((flags & PMAP_ENTER_WIRED) != 0) 3671 newpte |= PG_W; 3672 if (pmap != kernel_pmap) 3673 newpte |= PG_U; 3674 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 3675 if ((m->oflags & VPO_UNMANAGED) == 0) 3676 newpte |= PG_MANAGED; 3677 3678 rw_wlock(&pvh_global_lock); 3679 PMAP_LOCK(pmap); 3680 sched_pin(); 3681 if (psind == 1) { 3682 /* Assert the required virtual and physical alignment. */ 3683 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 3684 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 3685 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m); 3686 goto out; 3687 } 3688 3689 pde = pmap_pde(pmap, va); 3690 if (pmap != kernel_pmap) { 3691 /* 3692 * va is for UVA. 3693 * In the case that a page table page is not resident, 3694 * we are creating it here. pmap_allocpte() handles 3695 * demotion. 3696 */ 3697 mpte = pmap_allocpte(pmap, va, flags); 3698 if (mpte == NULL) { 3699 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3700 ("pmap_allocpte failed with sleep allowed")); 3701 rv = KERN_RESOURCE_SHORTAGE; 3702 goto out; 3703 } 3704 } else { 3705 /* 3706 * va is for KVA, so pmap_demote_pde() will never fail 3707 * to install a page table page. PG_V is also 3708 * asserted by pmap_demote_pde(). 3709 */ 3710 mpte = NULL; 3711 KASSERT(pde != NULL && (*pde & PG_V) != 0, 3712 ("KVA %#x invalid pde pdir %#jx", va, 3713 (uintmax_t)pmap->pm_pdir[PTDPTDI])); 3714 if ((*pde & PG_PS) != 0) 3715 pmap_demote_pde(pmap, pde, va); 3716 } 3717 pte = pmap_pte_quick(pmap, va); 3718 3719 /* 3720 * Page Directory table entry is not valid, which should not 3721 * happen. We should have either allocated the page table 3722 * page or demoted the existing mapping above. 3723 */ 3724 if (pte == NULL) { 3725 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3726 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3727 } 3728 3729 origpte = *pte; 3730 pv = NULL; 3731 3732 /* 3733 * Is the specified virtual address already mapped? 3734 */ 3735 if ((origpte & PG_V) != 0) { 3736 /* 3737 * Wiring change, just update stats. 
We don't worry about 3738 * wiring PT pages as they remain resident as long as there 3739 * are valid mappings in them. Hence, if a user page is wired, 3740 * the PT page will be also. 3741 */ 3742 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 3743 pmap->pm_stats.wired_count++; 3744 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 3745 pmap->pm_stats.wired_count--; 3746 3747 /* 3748 * Remove the extra PT page reference. 3749 */ 3750 if (mpte != NULL) { 3751 mpte->ref_count--; 3752 KASSERT(mpte->ref_count > 0, 3753 ("pmap_enter: missing reference to page table page," 3754 " va: 0x%x", va)); 3755 } 3756 3757 /* 3758 * Has the physical page changed? 3759 */ 3760 opa = origpte & PG_FRAME; 3761 if (opa == pa) { 3762 /* 3763 * No, might be a protection or wiring change. 3764 */ 3765 if ((origpte & PG_MANAGED) != 0 && 3766 (newpte & PG_RW) != 0) 3767 vm_page_aflag_set(m, PGA_WRITEABLE); 3768 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 3769 goto unchanged; 3770 goto validate; 3771 } 3772 3773 /* 3774 * The physical page has changed. Temporarily invalidate 3775 * the mapping. This ensures that all threads sharing the 3776 * pmap keep a consistent view of the mapping, which is 3777 * necessary for the correct handling of COW faults. It 3778 * also permits reuse of the old mapping's PV entry, 3779 * avoiding an allocation. 3780 * 3781 * For consistency, handle unmanaged mappings the same way. 3782 */ 3783 origpte = pte_load_clear(pte); 3784 KASSERT((origpte & PG_FRAME) == opa, 3785 ("pmap_enter: unexpected pa update for %#x", va)); 3786 if ((origpte & PG_MANAGED) != 0) { 3787 om = PHYS_TO_VM_PAGE(opa); 3788 3789 /* 3790 * The pmap lock is sufficient to synchronize with 3791 * concurrent calls to pmap_page_test_mappings() and 3792 * pmap_ts_referenced(). 3793 */ 3794 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3795 vm_page_dirty(om); 3796 if ((origpte & PG_A) != 0) { 3797 pmap_invalidate_page_int(pmap, va); 3798 vm_page_aflag_set(om, PGA_REFERENCED); 3799 } 3800 pv = pmap_pvh_remove(&om->md, pmap, va); 3801 KASSERT(pv != NULL, 3802 ("pmap_enter: no PV entry for %#x", va)); 3803 if ((newpte & PG_MANAGED) == 0) 3804 free_pv_entry(pmap, pv); 3805 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3806 TAILQ_EMPTY(&om->md.pv_list) && 3807 ((om->flags & PG_FICTITIOUS) != 0 || 3808 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3809 vm_page_aflag_clear(om, PGA_WRITEABLE); 3810 } else { 3811 /* 3812 * Since this mapping is unmanaged, assume that PG_A 3813 * is set. 3814 */ 3815 pmap_invalidate_page_int(pmap, va); 3816 } 3817 origpte = 0; 3818 } else { 3819 /* 3820 * Increment the counters. 3821 */ 3822 if ((newpte & PG_W) != 0) 3823 pmap->pm_stats.wired_count++; 3824 pmap->pm_stats.resident_count++; 3825 } 3826 3827 /* 3828 * Enter on the PV list if part of our managed memory. 3829 */ 3830 if ((newpte & PG_MANAGED) != 0) { 3831 if (pv == NULL) { 3832 pv = get_pv_entry(pmap, FALSE); 3833 pv->pv_va = va; 3834 } 3835 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3836 if ((newpte & PG_RW) != 0) 3837 vm_page_aflag_set(m, PGA_WRITEABLE); 3838 } 3839 3840 /* 3841 * Update the PTE. 
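	 *
	 * If the old PTE was valid, the new PTE is swapped in with
	 * pte_load_store() and the TLB entry is flushed only when the old
	 * PTE had PG_A set; a PTE that was never accessed cannot have been
	 * cached by the TLB. A store into an invalid PTE needs no
	 * invalidation at all.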
3842 */ 3843 if ((origpte & PG_V) != 0) { 3844 validate: 3845 origpte = pte_load_store(pte, newpte); 3846 KASSERT((origpte & PG_FRAME) == pa, 3847 ("pmap_enter: unexpected pa update for %#x", va)); 3848 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 3849 (PG_M | PG_RW)) { 3850 if ((origpte & PG_MANAGED) != 0) 3851 vm_page_dirty(m); 3852 3853 /* 3854 * Although the PTE may still have PG_RW set, TLB 3855 * invalidation may nonetheless be required because 3856 * the PTE no longer has PG_M set. 3857 */ 3858 } 3859 #ifdef PMAP_PAE_COMP 3860 else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 3861 /* 3862 * This PTE change does not require TLB invalidation. 3863 */ 3864 goto unchanged; 3865 } 3866 #endif 3867 if ((origpte & PG_A) != 0) 3868 pmap_invalidate_page_int(pmap, va); 3869 } else 3870 pte_store_zero(pte, newpte); 3871 3872 unchanged: 3873 3874 #if VM_NRESERVLEVEL > 0 3875 /* 3876 * If both the page table page and the reservation are fully 3877 * populated, then attempt promotion. 3878 */ 3879 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 3880 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && 3881 vm_reserv_level_iffullpop(m) == 0) 3882 pmap_promote_pde(pmap, pde, va); 3883 #endif 3884 3885 rv = KERN_SUCCESS; 3886 out: 3887 sched_unpin(); 3888 rw_wunlock(&pvh_global_lock); 3889 PMAP_UNLOCK(pmap); 3890 return (rv); 3891 } 3892 3893 /* 3894 * Tries to create a read- and/or execute-only 2 or 4 MB page mapping. Returns 3895 * true if successful. Returns false if (1) a mapping already exists at the 3896 * specified virtual address or (2) a PV entry cannot be allocated without 3897 * reclaiming another PV entry. 3898 */ 3899 static bool 3900 pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3901 { 3902 pd_entry_t newpde; 3903 3904 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3905 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 3906 PG_PS | PG_V; 3907 if ((m->oflags & VPO_UNMANAGED) == 0) 3908 newpde |= PG_MANAGED; 3909 #ifdef PMAP_PAE_COMP 3910 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 3911 newpde |= pg_nx; 3912 #endif 3913 if (pmap != kernel_pmap) 3914 newpde |= PG_U; 3915 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 3916 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL) == 3917 KERN_SUCCESS); 3918 } 3919 3920 /* 3921 * Returns true if every page table entry in the page table page that maps 3922 * the specified kernel virtual address is zero. 3923 */ 3924 static bool 3925 pmap_every_pte_zero(vm_offset_t va) 3926 { 3927 pt_entry_t *pt_end, *pte; 3928 3929 KASSERT((va & PDRMASK) == 0, ("va is misaligned")); 3930 pte = vtopte(va); 3931 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { 3932 if (*pte != 0) 3933 return (false); 3934 } 3935 return (true); 3936 } 3937 3938 /* 3939 * Tries to create the specified 2 or 4 MB page mapping. Returns KERN_SUCCESS 3940 * if the mapping was created, and either KERN_FAILURE or 3941 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 3942 * PMAP_ENTER_NOREPLACE was specified and a mapping already exists at the 3943 * specified virtual address. Returns KERN_RESOURCE_SHORTAGE if 3944 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 3945 * 3946 * The parameter "m" is only used when creating a managed, writeable mapping. 
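 *
 * As an illustration (a sketch mirroring pmap_enter_4mpage() above,
 * not an additional code path), a caller creating a read-only,
 * unmanaged 2/4MB kernel mapping that must not replace or reclaim
 * anything could use
 *
 *	rv = pmap_enter_pde(pmap, va, pa | PG_PS | PG_V,
 *	    PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE |
 *	    PMAP_ENTER_NORECLAIM, NULL);
 *
 * with the pmap lock and pvh_global_lock held, and fall back to 4KB
 * mappings unless KERN_SUCCESS is returned.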
3947 */ 3948 static int 3949 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 3950 vm_page_t m) 3951 { 3952 struct spglist free; 3953 pd_entry_t oldpde, *pde; 3954 vm_page_t mt; 3955 3956 rw_assert(&pvh_global_lock, RA_WLOCKED); 3957 KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, 3958 ("pmap_enter_pde: newpde is missing PG_M")); 3959 KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, 3960 ("pmap_enter_pde: cannot create wired user mapping")); 3961 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3962 pde = pmap_pde(pmap, va); 3963 oldpde = *pde; 3964 if ((oldpde & PG_V) != 0) { 3965 if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (pmap != 3966 kernel_pmap || (oldpde & PG_PS) != 0 || 3967 !pmap_every_pte_zero(va))) { 3968 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3969 " in pmap %p", va, pmap); 3970 return (KERN_FAILURE); 3971 } 3972 /* Break the existing mapping(s). */ 3973 SLIST_INIT(&free); 3974 if ((oldpde & PG_PS) != 0) { 3975 /* 3976 * If the PDE resulted from a promotion, then a 3977 * reserved PT page could be freed. 3978 */ 3979 (void)pmap_remove_pde(pmap, pde, va, &free); 3980 if ((oldpde & PG_G) == 0) 3981 pmap_invalidate_pde_page(pmap, va, oldpde); 3982 } else { 3983 if (pmap_remove_ptes(pmap, va, va + NBPDR, &free)) 3984 pmap_invalidate_all_int(pmap); 3985 } 3986 if (pmap != kernel_pmap) { 3987 vm_page_free_pages_toq(&free, true); 3988 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 3989 pde)); 3990 } else { 3991 KASSERT(SLIST_EMPTY(&free), 3992 ("pmap_enter_pde: freed kernel page table page")); 3993 3994 /* 3995 * Both pmap_remove_pde() and pmap_remove_ptes() will 3996 * leave the kernel page table page zero filled. 3997 */ 3998 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3999 if (pmap_insert_pt_page(pmap, mt, false)) 4000 panic("pmap_enter_pde: trie insert failed"); 4001 } 4002 } 4003 if ((newpde & PG_MANAGED) != 0) { 4004 /* 4005 * Abort this mapping if its PV entry could not be created. 4006 */ 4007 if (!pmap_pv_insert_pde(pmap, va, newpde, flags)) { 4008 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4009 " in pmap %p", va, pmap); 4010 return (KERN_RESOURCE_SHORTAGE); 4011 } 4012 if ((newpde & PG_RW) != 0) { 4013 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4014 vm_page_aflag_set(mt, PGA_WRITEABLE); 4015 } 4016 } 4017 4018 /* 4019 * Increment counters. 4020 */ 4021 if ((newpde & PG_W) != 0) 4022 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 4023 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 4024 4025 /* 4026 * Map the superpage. (This is not a promoted mapping; there will not 4027 * be any lingering 4KB page mappings in the TLB.) 4028 */ 4029 pde_store(pde, newpde); 4030 4031 pmap_pde_mappings++; 4032 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", 4033 va, pmap); 4034 return (KERN_SUCCESS); 4035 } 4036 4037 /* 4038 * Maps a sequence of resident pages belonging to the same object. 4039 * The sequence begins with the given page m_start. This page is 4040 * mapped at the given virtual address start. Each subsequent page is 4041 * mapped at a virtual address that is offset from start by the same 4042 * amount as the page is offset from m_start within the object. The 4043 * last page in the sequence is the page with the largest offset from 4044 * m_start that can be mapped at a virtual address less than the given 4045 * virtual address end. Not every virtual page between start and end 4046 * is mapped; only those for which a resident page exists with the 4047 * corresponding offset from m_start are mapped. 
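 *
 * A 2/4MB page mapping is attempted for a page only when the
 * corresponding virtual address is superpage aligned, at least NBPDR
 * bytes remain before "end", the page starts a fully populated,
 * properly aligned superpage-sized run (m->psind == 1), and superpage
 * mappings are enabled (pg_ps_enabled); otherwise the page is entered
 * individually with pmap_enter_quick_locked().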
4048 */ 4049 static void 4050 __CONCAT(PMTYPE, enter_object)(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4051 vm_page_t m_start, vm_prot_t prot) 4052 { 4053 vm_offset_t va; 4054 vm_page_t m, mpte; 4055 vm_pindex_t diff, psize; 4056 4057 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4058 4059 psize = atop(end - start); 4060 mpte = NULL; 4061 m = m_start; 4062 rw_wlock(&pvh_global_lock); 4063 PMAP_LOCK(pmap); 4064 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4065 va = start + ptoa(diff); 4066 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 4067 m->psind == 1 && pg_ps_enabled && 4068 pmap_enter_4mpage(pmap, va, m, prot)) 4069 m = &m[NBPDR / PAGE_SIZE - 1]; 4070 else 4071 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 4072 mpte); 4073 m = TAILQ_NEXT(m, listq); 4074 } 4075 rw_wunlock(&pvh_global_lock); 4076 PMAP_UNLOCK(pmap); 4077 } 4078 4079 /* 4080 * this code makes some *MAJOR* assumptions: 4081 * 1. Current pmap & pmap exists. 4082 * 2. Not wired. 4083 * 3. Read access. 4084 * 4. No page table pages. 4085 * but is *MUCH* faster than pmap_enter... 4086 */ 4087 4088 static void 4089 __CONCAT(PMTYPE, enter_quick)(pmap_t pmap, vm_offset_t va, vm_page_t m, 4090 vm_prot_t prot) 4091 { 4092 4093 rw_wlock(&pvh_global_lock); 4094 PMAP_LOCK(pmap); 4095 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 4096 rw_wunlock(&pvh_global_lock); 4097 PMAP_UNLOCK(pmap); 4098 } 4099 4100 static vm_page_t 4101 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4102 vm_prot_t prot, vm_page_t mpte) 4103 { 4104 pt_entry_t newpte, *pte; 4105 4106 KASSERT(pmap != kernel_pmap || va < kmi.clean_sva || 4107 va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, 4108 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4109 rw_assert(&pvh_global_lock, RA_WLOCKED); 4110 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4111 4112 /* 4113 * In the case that a page table page is not 4114 * resident, we are creating it here. 4115 */ 4116 if (pmap != kernel_pmap) { 4117 u_int ptepindex; 4118 pd_entry_t ptepa; 4119 4120 /* 4121 * Calculate pagetable page index 4122 */ 4123 ptepindex = va >> PDRSHIFT; 4124 if (mpte && (mpte->pindex == ptepindex)) { 4125 mpte->ref_count++; 4126 } else { 4127 /* 4128 * Get the page directory entry 4129 */ 4130 ptepa = pmap->pm_pdir[ptepindex]; 4131 4132 /* 4133 * If the page table page is mapped, we just increment 4134 * the hold count, and activate it. 4135 */ 4136 if (ptepa) { 4137 if (ptepa & PG_PS) 4138 return (NULL); 4139 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 4140 mpte->ref_count++; 4141 } else { 4142 mpte = _pmap_allocpte(pmap, ptepindex, 4143 PMAP_ENTER_NOSLEEP); 4144 if (mpte == NULL) 4145 return (mpte); 4146 } 4147 } 4148 } else { 4149 mpte = NULL; 4150 } 4151 4152 sched_pin(); 4153 pte = pmap_pte_quick(pmap, va); 4154 if (*pte) { 4155 if (mpte != NULL) 4156 mpte->ref_count--; 4157 sched_unpin(); 4158 return (NULL); 4159 } 4160 4161 /* 4162 * Enter on the PV list if part of our managed memory. 
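 *
 * If the PV entry cannot be allocated, the speculative mapping is
 * simply abandoned: the page table page reference taken above, if
 * any, is dropped through pmap_abort_ptp() and NULL is returned.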
4163 */ 4164 if ((m->oflags & VPO_UNMANAGED) == 0 && 4165 !pmap_try_insert_pv_entry(pmap, va, m)) { 4166 if (mpte != NULL) 4167 pmap_abort_ptp(pmap, va, mpte); 4168 sched_unpin(); 4169 return (NULL); 4170 } 4171 4172 /* 4173 * Increment counters 4174 */ 4175 pmap->pm_stats.resident_count++; 4176 4177 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 4178 pmap_cache_bits(pmap, m->md.pat_mode, 0); 4179 if ((m->oflags & VPO_UNMANAGED) == 0) 4180 newpte |= PG_MANAGED; 4181 #ifdef PMAP_PAE_COMP 4182 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 4183 newpte |= pg_nx; 4184 #endif 4185 if (pmap != kernel_pmap) 4186 newpte |= PG_U; 4187 pte_store_zero(pte, newpte); 4188 sched_unpin(); 4189 return (mpte); 4190 } 4191 4192 /* 4193 * Make a temporary mapping for a physical address. This is only intended 4194 * to be used for panic dumps. 4195 */ 4196 static void * 4197 __CONCAT(PMTYPE, kenter_temporary)(vm_paddr_t pa, int i) 4198 { 4199 vm_offset_t va; 4200 4201 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 4202 pmap_kenter(va, pa); 4203 invlpg(va); 4204 return ((void *)crashdumpmap); 4205 } 4206 4207 /* 4208 * This code maps large physical mmap regions into the 4209 * processor address space. Note that some shortcuts 4210 * are taken, but the code works. 4211 */ 4212 static void 4213 __CONCAT(PMTYPE, object_init_pt)(pmap_t pmap, vm_offset_t addr, 4214 vm_object_t object, vm_pindex_t pindex, vm_size_t size) 4215 { 4216 pd_entry_t *pde; 4217 vm_paddr_t pa, ptepa; 4218 vm_page_t p; 4219 int pat_mode; 4220 4221 VM_OBJECT_ASSERT_WLOCKED(object); 4222 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4223 ("pmap_object_init_pt: non-device object")); 4224 if (pg_ps_enabled && 4225 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 4226 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4227 return; 4228 p = vm_page_lookup(object, pindex); 4229 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4230 ("pmap_object_init_pt: invalid page %p", p)); 4231 pat_mode = p->md.pat_mode; 4232 4233 /* 4234 * Abort the mapping if the first page is not physically 4235 * aligned to a 2/4MB page boundary. 4236 */ 4237 ptepa = VM_PAGE_TO_PHYS(p); 4238 if (ptepa & (NBPDR - 1)) 4239 return; 4240 4241 /* 4242 * Skip the first page. Abort the mapping if the rest of 4243 * the pages are not physically contiguous or have differing 4244 * memory attributes. 4245 */ 4246 p = TAILQ_NEXT(p, listq); 4247 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4248 pa += PAGE_SIZE) { 4249 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4250 ("pmap_object_init_pt: invalid page %p", p)); 4251 if (pa != VM_PAGE_TO_PHYS(p) || 4252 pat_mode != p->md.pat_mode) 4253 return; 4254 p = TAILQ_NEXT(p, listq); 4255 } 4256 4257 /* 4258 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 4259 * "size" is a multiple of 2/4M, adding the PAT setting to 4260 * "pa" will not affect the termination of this loop. 4261 */ 4262 PMAP_LOCK(pmap); 4263 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 4264 pa < ptepa + size; pa += NBPDR) { 4265 pde = pmap_pde(pmap, addr); 4266 if (*pde == 0) { 4267 pde_store(pde, pa | PG_PS | PG_M | PG_A | 4268 PG_U | PG_RW | PG_V); 4269 pmap->pm_stats.resident_count += NBPDR / 4270 PAGE_SIZE; 4271 pmap_pde_mappings++; 4272 } 4273 /* Else continue on if the PDE is already valid. */ 4274 addr += NBPDR; 4275 } 4276 PMAP_UNLOCK(pmap); 4277 } 4278 } 4279 4280 /* 4281 * Clear the wired attribute from the mappings for the specified range of 4282 * addresses in the given pmap. 
Every valid mapping within that range 4283 * must have the wired attribute set. In contrast, invalid mappings 4284 * cannot have the wired attribute set, so they are ignored. 4285 * 4286 * The wired attribute of the page table entry is not a hardware feature, 4287 * so there is no need to invalidate any TLB entries. 4288 */ 4289 static void 4290 __CONCAT(PMTYPE, unwire)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4291 { 4292 vm_offset_t pdnxt; 4293 pd_entry_t *pde; 4294 pt_entry_t *pte; 4295 boolean_t pv_lists_locked; 4296 4297 if (pmap_is_current(pmap)) 4298 pv_lists_locked = FALSE; 4299 else { 4300 pv_lists_locked = TRUE; 4301 resume: 4302 rw_wlock(&pvh_global_lock); 4303 sched_pin(); 4304 } 4305 PMAP_LOCK(pmap); 4306 for (; sva < eva; sva = pdnxt) { 4307 pdnxt = (sva + NBPDR) & ~PDRMASK; 4308 if (pdnxt < sva) 4309 pdnxt = eva; 4310 pde = pmap_pde(pmap, sva); 4311 if ((*pde & PG_V) == 0) 4312 continue; 4313 if ((*pde & PG_PS) != 0) { 4314 if ((*pde & PG_W) == 0) 4315 panic("pmap_unwire: pde %#jx is missing PG_W", 4316 (uintmax_t)*pde); 4317 4318 /* 4319 * Are we unwiring the entire large page? If not, 4320 * demote the mapping and fall through. 4321 */ 4322 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 4323 /* 4324 * Regardless of whether a pde (or pte) is 32 4325 * or 64 bits in size, PG_W is among the least 4326 * significant 32 bits. 4327 */ 4328 atomic_clear_int((u_int *)pde, PG_W); 4329 pmap->pm_stats.wired_count -= NBPDR / 4330 PAGE_SIZE; 4331 continue; 4332 } else { 4333 if (!pv_lists_locked) { 4334 pv_lists_locked = TRUE; 4335 if (!rw_try_wlock(&pvh_global_lock)) { 4336 PMAP_UNLOCK(pmap); 4337 /* Repeat sva. */ 4338 goto resume; 4339 } 4340 sched_pin(); 4341 } 4342 if (!pmap_demote_pde(pmap, pde, sva)) 4343 panic("pmap_unwire: demotion failed"); 4344 } 4345 } 4346 if (pdnxt > eva) 4347 pdnxt = eva; 4348 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 4349 sva += PAGE_SIZE) { 4350 if ((*pte & PG_V) == 0) 4351 continue; 4352 if ((*pte & PG_W) == 0) 4353 panic("pmap_unwire: pte %#jx is missing PG_W", 4354 (uintmax_t)*pte); 4355 4356 /* 4357 * PG_W must be cleared atomically. Although the pmap 4358 * lock synchronizes access to PG_W, another processor 4359 * could be setting PG_M and/or PG_A concurrently. 4360 * 4361 * PG_W is among the least significant 32 bits. 4362 */ 4363 atomic_clear_int((u_int *)pte, PG_W); 4364 pmap->pm_stats.wired_count--; 4365 } 4366 } 4367 if (pv_lists_locked) { 4368 sched_unpin(); 4369 rw_wunlock(&pvh_global_lock); 4370 } 4371 PMAP_UNLOCK(pmap); 4372 } 4373 4374 4375 /* 4376 * Copy the range specified by src_addr/len 4377 * from the source map to the range dst_addr/len 4378 * in the destination map. 4379 * 4380 * This routine is only advisory and need not do anything. Since 4381 * current pmap is always the kernel pmap when executing in 4382 * kernel, and we do not copy from the kernel pmap to a user 4383 * pmap, this optimization is not usable in 4/4G full split i386 4384 * world. 
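 *
 * In the configurations where it is usable, this is typically reached
 * from the fork path.  It returns immediately unless dst_addr ==
 * src_addr, copies only managed 4KB mappings (clearing the wired,
 * modified and accessed bits in the copy) and 2/4MB mappings that are
 * either unmanaged or whose PV entry can be created without reclaiming
 * another, and gives up silently on any shortage, which is acceptable
 * for an advisory optimization.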
4385 */ 4386 4387 static void 4388 __CONCAT(PMTYPE, copy)(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 4389 vm_size_t len, vm_offset_t src_addr) 4390 { 4391 pt_entry_t *src_pte, *dst_pte, ptetemp; 4392 pd_entry_t srcptepaddr; 4393 vm_page_t dstmpte, srcmpte; 4394 vm_offset_t addr, end_addr, pdnxt; 4395 u_int ptepindex; 4396 4397 if (dst_addr != src_addr) 4398 return; 4399 4400 end_addr = src_addr + len; 4401 4402 rw_wlock(&pvh_global_lock); 4403 if (dst_pmap < src_pmap) { 4404 PMAP_LOCK(dst_pmap); 4405 PMAP_LOCK(src_pmap); 4406 } else { 4407 PMAP_LOCK(src_pmap); 4408 PMAP_LOCK(dst_pmap); 4409 } 4410 sched_pin(); 4411 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 4412 KASSERT(addr < PMAP_TRM_MIN_ADDRESS, 4413 ("pmap_copy: invalid to pmap_copy the trampoline")); 4414 4415 pdnxt = (addr + NBPDR) & ~PDRMASK; 4416 if (pdnxt < addr) 4417 pdnxt = end_addr; 4418 ptepindex = addr >> PDRSHIFT; 4419 4420 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 4421 if (srcptepaddr == 0) 4422 continue; 4423 4424 if (srcptepaddr & PG_PS) { 4425 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4426 continue; 4427 if (dst_pmap->pm_pdir[ptepindex] == 0 && 4428 ((srcptepaddr & PG_MANAGED) == 0 || 4429 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 4430 PMAP_ENTER_NORECLAIM))) { 4431 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 4432 ~PG_W; 4433 dst_pmap->pm_stats.resident_count += 4434 NBPDR / PAGE_SIZE; 4435 pmap_pde_mappings++; 4436 } 4437 continue; 4438 } 4439 4440 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 4441 KASSERT(srcmpte->ref_count > 0, 4442 ("pmap_copy: source page table page is unused")); 4443 4444 if (pdnxt > end_addr) 4445 pdnxt = end_addr; 4446 4447 src_pte = pmap_pte_quick3(src_pmap, addr); 4448 while (addr < pdnxt) { 4449 ptetemp = *src_pte; 4450 /* 4451 * we only virtual copy managed pages 4452 */ 4453 if ((ptetemp & PG_MANAGED) != 0) { 4454 dstmpte = pmap_allocpte(dst_pmap, addr, 4455 PMAP_ENTER_NOSLEEP); 4456 if (dstmpte == NULL) 4457 goto out; 4458 dst_pte = pmap_pte_quick(dst_pmap, addr); 4459 if (*dst_pte == 0 && 4460 pmap_try_insert_pv_entry(dst_pmap, addr, 4461 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 4462 /* 4463 * Clear the wired, modified, and 4464 * accessed (referenced) bits 4465 * during the copy. 4466 */ 4467 *dst_pte = ptetemp & ~(PG_W | PG_M | 4468 PG_A); 4469 dst_pmap->pm_stats.resident_count++; 4470 } else { 4471 pmap_abort_ptp(dst_pmap, addr, dstmpte); 4472 goto out; 4473 } 4474 if (dstmpte->ref_count >= srcmpte->ref_count) 4475 break; 4476 } 4477 addr += PAGE_SIZE; 4478 src_pte++; 4479 } 4480 } 4481 out: 4482 sched_unpin(); 4483 rw_wunlock(&pvh_global_lock); 4484 PMAP_UNLOCK(src_pmap); 4485 PMAP_UNLOCK(dst_pmap); 4486 } 4487 4488 /* 4489 * Zero 1 page of virtual memory mapped from a hardware page by the caller. 4490 */ 4491 static __inline void 4492 pagezero(void *page) 4493 { 4494 #if defined(I686_CPU) 4495 if (cpu_class == CPUCLASS_686) { 4496 if (cpu_feature & CPUID_SSE2) 4497 sse2_pagezero(page); 4498 else 4499 i686_pagezero(page); 4500 } else 4501 #endif 4502 bzero(page, PAGE_SIZE); 4503 } 4504 4505 /* 4506 * Zero the specified hardware page. 
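 *
 * Arbitrary physical pages are not permanently mapped into the kernel
 * address space on i386, so the page is zeroed through the per-CPU
 * CMAP2 window while pinned to the current CPU and holding that CPU's
 * pc_cmap_lock.  The same pattern (a sketch of the steps, not an
 * additional code path) is reused by pmap_zero_page_area(),
 * pmap_copy_page(), pmap_copy_pages() and pmap_flush_page() below:
 *
 *	sched_pin();
 *	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | ...;
 *	invlcaddr(pc->pc_cmap_addr2);
 *	... access the page through pc_cmap_addr2 ...
 *	*cmap_pte2 = 0;
 *	sched_unpin();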
4507 */ 4508 static void 4509 __CONCAT(PMTYPE, zero_page)(vm_page_t m) 4510 { 4511 pt_entry_t *cmap_pte2; 4512 struct pcpu *pc; 4513 4514 sched_pin(); 4515 pc = get_pcpu(); 4516 cmap_pte2 = pc->pc_cmap_pte2; 4517 mtx_lock(&pc->pc_cmap_lock); 4518 if (*cmap_pte2) 4519 panic("pmap_zero_page: CMAP2 busy"); 4520 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4521 pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 4522 invlcaddr(pc->pc_cmap_addr2); 4523 pagezero(pc->pc_cmap_addr2); 4524 *cmap_pte2 = 0; 4525 4526 /* 4527 * Unpin the thread before releasing the lock. Otherwise the thread 4528 * could be rescheduled while still bound to the current CPU, only 4529 * to unpin itself immediately upon resuming execution. 4530 */ 4531 sched_unpin(); 4532 mtx_unlock(&pc->pc_cmap_lock); 4533 } 4534 4535 /* 4536 * Zero an an area within a single hardware page. off and size must not 4537 * cover an area beyond a single hardware page. 4538 */ 4539 static void 4540 __CONCAT(PMTYPE, zero_page_area)(vm_page_t m, int off, int size) 4541 { 4542 pt_entry_t *cmap_pte2; 4543 struct pcpu *pc; 4544 4545 sched_pin(); 4546 pc = get_pcpu(); 4547 cmap_pte2 = pc->pc_cmap_pte2; 4548 mtx_lock(&pc->pc_cmap_lock); 4549 if (*cmap_pte2) 4550 panic("pmap_zero_page_area: CMAP2 busy"); 4551 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4552 pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 4553 invlcaddr(pc->pc_cmap_addr2); 4554 if (off == 0 && size == PAGE_SIZE) 4555 pagezero(pc->pc_cmap_addr2); 4556 else 4557 bzero(pc->pc_cmap_addr2 + off, size); 4558 *cmap_pte2 = 0; 4559 sched_unpin(); 4560 mtx_unlock(&pc->pc_cmap_lock); 4561 } 4562 4563 /* 4564 * Copy 1 specified hardware page to another. 4565 */ 4566 static void 4567 __CONCAT(PMTYPE, copy_page)(vm_page_t src, vm_page_t dst) 4568 { 4569 pt_entry_t *cmap_pte1, *cmap_pte2; 4570 struct pcpu *pc; 4571 4572 sched_pin(); 4573 pc = get_pcpu(); 4574 cmap_pte1 = pc->pc_cmap_pte1; 4575 cmap_pte2 = pc->pc_cmap_pte2; 4576 mtx_lock(&pc->pc_cmap_lock); 4577 if (*cmap_pte1) 4578 panic("pmap_copy_page: CMAP1 busy"); 4579 if (*cmap_pte2) 4580 panic("pmap_copy_page: CMAP2 busy"); 4581 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4582 pmap_cache_bits(kernel_pmap, src->md.pat_mode, 0); 4583 invlcaddr(pc->pc_cmap_addr1); 4584 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4585 pmap_cache_bits(kernel_pmap, dst->md.pat_mode, 0); 4586 invlcaddr(pc->pc_cmap_addr2); 4587 bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); 4588 *cmap_pte1 = 0; 4589 *cmap_pte2 = 0; 4590 sched_unpin(); 4591 mtx_unlock(&pc->pc_cmap_lock); 4592 } 4593 4594 static void 4595 __CONCAT(PMTYPE, copy_pages)(vm_page_t ma[], vm_offset_t a_offset, 4596 vm_page_t mb[], vm_offset_t b_offset, int xfersize) 4597 { 4598 vm_page_t a_pg, b_pg; 4599 char *a_cp, *b_cp; 4600 vm_offset_t a_pg_offset, b_pg_offset; 4601 pt_entry_t *cmap_pte1, *cmap_pte2; 4602 struct pcpu *pc; 4603 int cnt; 4604 4605 sched_pin(); 4606 pc = get_pcpu(); 4607 cmap_pte1 = pc->pc_cmap_pte1; 4608 cmap_pte2 = pc->pc_cmap_pte2; 4609 mtx_lock(&pc->pc_cmap_lock); 4610 if (*cmap_pte1 != 0) 4611 panic("pmap_copy_pages: CMAP1 busy"); 4612 if (*cmap_pte2 != 0) 4613 panic("pmap_copy_pages: CMAP2 busy"); 4614 while (xfersize > 0) { 4615 a_pg = ma[a_offset >> PAGE_SHIFT]; 4616 a_pg_offset = a_offset & PAGE_MASK; 4617 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4618 b_pg = mb[b_offset >> PAGE_SHIFT]; 4619 b_pg_offset = b_offset & PAGE_MASK; 4620 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4621 *cmap_pte1 = PG_V | 
VM_PAGE_TO_PHYS(a_pg) | PG_A | 4622 pmap_cache_bits(kernel_pmap, a_pg->md.pat_mode, 0); 4623 invlcaddr(pc->pc_cmap_addr1); 4624 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | 4625 PG_M | pmap_cache_bits(kernel_pmap, b_pg->md.pat_mode, 0); 4626 invlcaddr(pc->pc_cmap_addr2); 4627 a_cp = pc->pc_cmap_addr1 + a_pg_offset; 4628 b_cp = pc->pc_cmap_addr2 + b_pg_offset; 4629 bcopy(a_cp, b_cp, cnt); 4630 a_offset += cnt; 4631 b_offset += cnt; 4632 xfersize -= cnt; 4633 } 4634 *cmap_pte1 = 0; 4635 *cmap_pte2 = 0; 4636 sched_unpin(); 4637 mtx_unlock(&pc->pc_cmap_lock); 4638 } 4639 4640 /* 4641 * Returns true if the pmap's pv is one of the first 4642 * 16 pvs linked to from this page. This count may 4643 * be changed upwards or downwards in the future; it 4644 * is only necessary that true be returned for a small 4645 * subset of pmaps for proper page aging. 4646 */ 4647 static boolean_t 4648 __CONCAT(PMTYPE, page_exists_quick)(pmap_t pmap, vm_page_t m) 4649 { 4650 struct md_page *pvh; 4651 pv_entry_t pv; 4652 int loops = 0; 4653 boolean_t rv; 4654 4655 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4656 ("pmap_page_exists_quick: page %p is not managed", m)); 4657 rv = FALSE; 4658 rw_wlock(&pvh_global_lock); 4659 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4660 if (PV_PMAP(pv) == pmap) { 4661 rv = TRUE; 4662 break; 4663 } 4664 loops++; 4665 if (loops >= 16) 4666 break; 4667 } 4668 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4669 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4670 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4671 if (PV_PMAP(pv) == pmap) { 4672 rv = TRUE; 4673 break; 4674 } 4675 loops++; 4676 if (loops >= 16) 4677 break; 4678 } 4679 } 4680 rw_wunlock(&pvh_global_lock); 4681 return (rv); 4682 } 4683 4684 /* 4685 * pmap_page_wired_mappings: 4686 * 4687 * Return the number of managed mappings to the given physical page 4688 * that are wired. 4689 */ 4690 static int 4691 __CONCAT(PMTYPE, page_wired_mappings)(vm_page_t m) 4692 { 4693 int count; 4694 4695 count = 0; 4696 if ((m->oflags & VPO_UNMANAGED) != 0) 4697 return (count); 4698 rw_wlock(&pvh_global_lock); 4699 count = pmap_pvh_wired_mappings(&m->md, count); 4700 if ((m->flags & PG_FICTITIOUS) == 0) { 4701 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 4702 count); 4703 } 4704 rw_wunlock(&pvh_global_lock); 4705 return (count); 4706 } 4707 4708 /* 4709 * pmap_pvh_wired_mappings: 4710 * 4711 * Return the updated number "count" of managed mappings that are wired. 4712 */ 4713 static int 4714 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4715 { 4716 pmap_t pmap; 4717 pt_entry_t *pte; 4718 pv_entry_t pv; 4719 4720 rw_assert(&pvh_global_lock, RA_WLOCKED); 4721 sched_pin(); 4722 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4723 pmap = PV_PMAP(pv); 4724 PMAP_LOCK(pmap); 4725 pte = pmap_pte_quick(pmap, pv->pv_va); 4726 if ((*pte & PG_W) != 0) 4727 count++; 4728 PMAP_UNLOCK(pmap); 4729 } 4730 sched_unpin(); 4731 return (count); 4732 } 4733 4734 /* 4735 * Returns TRUE if the given page is mapped individually or as part of 4736 * a 4mpage. Otherwise, returns FALSE. 
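 *
 * Both the page's own pv_list (4KB mappings) and, for non-fictitious
 * pages, the pv list of the containing 2/4MB frame are checked, so a
 * page mapped only as part of a superpage is still reported as mapped.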
4737 */ 4738 static boolean_t 4739 __CONCAT(PMTYPE, page_is_mapped)(vm_page_t m) 4740 { 4741 boolean_t rv; 4742 4743 if ((m->oflags & VPO_UNMANAGED) != 0) 4744 return (FALSE); 4745 rw_wlock(&pvh_global_lock); 4746 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4747 ((m->flags & PG_FICTITIOUS) == 0 && 4748 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4749 rw_wunlock(&pvh_global_lock); 4750 return (rv); 4751 } 4752 4753 /* 4754 * Remove all pages from specified address space 4755 * this aids process exit speeds. Also, this code 4756 * is special cased for current process only, but 4757 * can have the more generic (and slightly slower) 4758 * mode enabled. This is much faster than pmap_remove 4759 * in the case of running down an entire address space. 4760 */ 4761 static void 4762 __CONCAT(PMTYPE, remove_pages)(pmap_t pmap) 4763 { 4764 pt_entry_t *pte, tpte; 4765 vm_page_t m, mpte, mt; 4766 pv_entry_t pv; 4767 struct md_page *pvh; 4768 struct pv_chunk *pc, *npc; 4769 struct spglist free; 4770 int field, idx; 4771 int32_t bit; 4772 uint32_t inuse, bitmask; 4773 int allfree; 4774 4775 if (pmap != PCPU_GET(curpmap)) { 4776 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4777 return; 4778 } 4779 SLIST_INIT(&free); 4780 rw_wlock(&pvh_global_lock); 4781 PMAP_LOCK(pmap); 4782 sched_pin(); 4783 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4784 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, 4785 pc->pc_pmap)); 4786 allfree = 1; 4787 for (field = 0; field < _NPCM; field++) { 4788 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4789 while (inuse != 0) { 4790 bit = bsfl(inuse); 4791 bitmask = 1UL << bit; 4792 idx = field * 32 + bit; 4793 pv = &pc->pc_pventry[idx]; 4794 inuse &= ~bitmask; 4795 4796 pte = pmap_pde(pmap, pv->pv_va); 4797 tpte = *pte; 4798 if ((tpte & PG_PS) == 0) { 4799 pte = pmap_pte_quick(pmap, pv->pv_va); 4800 tpte = *pte & ~PG_PTE_PAT; 4801 } 4802 4803 if (tpte == 0) { 4804 printf( 4805 "TPTE at %p IS ZERO @ VA %08x\n", 4806 pte, pv->pv_va); 4807 panic("bad pte"); 4808 } 4809 4810 /* 4811 * We cannot remove wired pages from a process' mapping at this time 4812 */ 4813 if (tpte & PG_W) { 4814 allfree = 0; 4815 continue; 4816 } 4817 4818 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4819 KASSERT(m->phys_addr == (tpte & PG_FRAME), 4820 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4821 m, (uintmax_t)m->phys_addr, 4822 (uintmax_t)tpte)); 4823 4824 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4825 m < &vm_page_array[vm_page_array_size], 4826 ("pmap_remove_pages: bad tpte %#jx", 4827 (uintmax_t)tpte)); 4828 4829 pte_clear(pte); 4830 4831 /* 4832 * Update the vm_page_t clean/reference bits. 
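 *
 * Only the dirty state is carried over to the vm_page here; per-page
 * TLB invalidations are also unnecessary because a single
 * pmap_invalidate_all_int() is issued once the scan completes.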
4833 */ 4834 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4835 if ((tpte & PG_PS) != 0) { 4836 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4837 vm_page_dirty(mt); 4838 } else 4839 vm_page_dirty(m); 4840 } 4841 4842 /* Mark free */ 4843 PV_STAT(pv_entry_frees++); 4844 PV_STAT(pv_entry_spare++); 4845 pv_entry_count--; 4846 pc->pc_map[field] |= bitmask; 4847 if ((tpte & PG_PS) != 0) { 4848 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 4849 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4850 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4851 if (TAILQ_EMPTY(&pvh->pv_list)) { 4852 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4853 if (TAILQ_EMPTY(&mt->md.pv_list)) 4854 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4855 } 4856 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 4857 if (mpte != NULL) { 4858 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 4859 ("pmap_remove_pages: pte page not promoted")); 4860 pmap->pm_stats.resident_count--; 4861 KASSERT(mpte->ref_count == NPTEPG, 4862 ("pmap_remove_pages: pte page ref count error")); 4863 mpte->ref_count = 0; 4864 pmap_add_delayed_free_list(mpte, &free, FALSE); 4865 } 4866 } else { 4867 pmap->pm_stats.resident_count--; 4868 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4869 if (TAILQ_EMPTY(&m->md.pv_list) && 4870 (m->flags & PG_FICTITIOUS) == 0) { 4871 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4872 if (TAILQ_EMPTY(&pvh->pv_list)) 4873 vm_page_aflag_clear(m, PGA_WRITEABLE); 4874 } 4875 pmap_unuse_pt(pmap, pv->pv_va, &free); 4876 } 4877 } 4878 } 4879 if (allfree) { 4880 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4881 free_pv_chunk(pc); 4882 } 4883 } 4884 sched_unpin(); 4885 pmap_invalidate_all_int(pmap); 4886 rw_wunlock(&pvh_global_lock); 4887 PMAP_UNLOCK(pmap); 4888 vm_page_free_pages_toq(&free, true); 4889 } 4890 4891 /* 4892 * pmap_is_modified: 4893 * 4894 * Return whether or not the specified physical page was modified 4895 * in any physical maps. 4896 */ 4897 static boolean_t 4898 __CONCAT(PMTYPE, is_modified)(vm_page_t m) 4899 { 4900 boolean_t rv; 4901 4902 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4903 ("pmap_is_modified: page %p is not managed", m)); 4904 4905 /* 4906 * If the page is not busied then this check is racy. 4907 */ 4908 if (!pmap_page_is_write_mapped(m)) 4909 return (FALSE); 4910 rw_wlock(&pvh_global_lock); 4911 rv = pmap_is_modified_pvh(&m->md) || 4912 ((m->flags & PG_FICTITIOUS) == 0 && 4913 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4914 rw_wunlock(&pvh_global_lock); 4915 return (rv); 4916 } 4917 4918 /* 4919 * Returns TRUE if any of the given mappings were used to modify 4920 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4921 * mappings are supported. 4922 */ 4923 static boolean_t 4924 pmap_is_modified_pvh(struct md_page *pvh) 4925 { 4926 pv_entry_t pv; 4927 pt_entry_t *pte; 4928 pmap_t pmap; 4929 boolean_t rv; 4930 4931 rw_assert(&pvh_global_lock, RA_WLOCKED); 4932 rv = FALSE; 4933 sched_pin(); 4934 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4935 pmap = PV_PMAP(pv); 4936 PMAP_LOCK(pmap); 4937 pte = pmap_pte_quick(pmap, pv->pv_va); 4938 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); 4939 PMAP_UNLOCK(pmap); 4940 if (rv) 4941 break; 4942 } 4943 sched_unpin(); 4944 return (rv); 4945 } 4946 4947 /* 4948 * pmap_is_prefaultable: 4949 * 4950 * Return whether or not the specified virtual address is elgible 4951 * for prefault. 
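 *
 * An address is prefaultable only when its page table page already
 * exists (the PDE is valid and not a 2/4MB mapping) and the 4KB PTE
 * is still zero, so a speculative mapping could be installed without
 * further allocation.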
4952 */ 4953 static boolean_t 4954 __CONCAT(PMTYPE, is_prefaultable)(pmap_t pmap, vm_offset_t addr) 4955 { 4956 pd_entry_t pde; 4957 boolean_t rv; 4958 4959 rv = FALSE; 4960 PMAP_LOCK(pmap); 4961 pde = *pmap_pde(pmap, addr); 4962 if (pde != 0 && (pde & PG_PS) == 0) 4963 rv = pmap_pte_ufast(pmap, addr, pde) == 0; 4964 PMAP_UNLOCK(pmap); 4965 return (rv); 4966 } 4967 4968 /* 4969 * pmap_is_referenced: 4970 * 4971 * Return whether or not the specified physical page was referenced 4972 * in any physical maps. 4973 */ 4974 static boolean_t 4975 __CONCAT(PMTYPE, is_referenced)(vm_page_t m) 4976 { 4977 boolean_t rv; 4978 4979 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4980 ("pmap_is_referenced: page %p is not managed", m)); 4981 rw_wlock(&pvh_global_lock); 4982 rv = pmap_is_referenced_pvh(&m->md) || 4983 ((m->flags & PG_FICTITIOUS) == 0 && 4984 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4985 rw_wunlock(&pvh_global_lock); 4986 return (rv); 4987 } 4988 4989 /* 4990 * Returns TRUE if any of the given mappings were referenced and FALSE 4991 * otherwise. Both page and 4mpage mappings are supported. 4992 */ 4993 static boolean_t 4994 pmap_is_referenced_pvh(struct md_page *pvh) 4995 { 4996 pv_entry_t pv; 4997 pt_entry_t *pte; 4998 pmap_t pmap; 4999 boolean_t rv; 5000 5001 rw_assert(&pvh_global_lock, RA_WLOCKED); 5002 rv = FALSE; 5003 sched_pin(); 5004 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5005 pmap = PV_PMAP(pv); 5006 PMAP_LOCK(pmap); 5007 pte = pmap_pte_quick(pmap, pv->pv_va); 5008 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); 5009 PMAP_UNLOCK(pmap); 5010 if (rv) 5011 break; 5012 } 5013 sched_unpin(); 5014 return (rv); 5015 } 5016 5017 /* 5018 * Clear the write and modified bits in each of the given page's mappings. 5019 */ 5020 static void 5021 __CONCAT(PMTYPE, remove_write)(vm_page_t m) 5022 { 5023 struct md_page *pvh; 5024 pv_entry_t next_pv, pv; 5025 pmap_t pmap; 5026 pd_entry_t *pde; 5027 pt_entry_t oldpte, *pte; 5028 vm_offset_t va; 5029 5030 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5031 ("pmap_remove_write: page %p is not managed", m)); 5032 vm_page_assert_busied(m); 5033 5034 if (!pmap_page_is_write_mapped(m)) 5035 return; 5036 rw_wlock(&pvh_global_lock); 5037 sched_pin(); 5038 if ((m->flags & PG_FICTITIOUS) != 0) 5039 goto small_mappings; 5040 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5041 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5042 va = pv->pv_va; 5043 pmap = PV_PMAP(pv); 5044 PMAP_LOCK(pmap); 5045 pde = pmap_pde(pmap, va); 5046 if ((*pde & PG_RW) != 0) 5047 (void)pmap_demote_pde(pmap, pde, va); 5048 PMAP_UNLOCK(pmap); 5049 } 5050 small_mappings: 5051 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5052 pmap = PV_PMAP(pv); 5053 PMAP_LOCK(pmap); 5054 pde = pmap_pde(pmap, pv->pv_va); 5055 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" 5056 " a 4mpage in page %p's pv list", m)); 5057 pte = pmap_pte_quick(pmap, pv->pv_va); 5058 retry: 5059 oldpte = *pte; 5060 if ((oldpte & PG_RW) != 0) { 5061 /* 5062 * Regardless of whether a pte is 32 or 64 bits 5063 * in size, PG_RW and PG_M are among the least 5064 * significant 32 bits. 
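 *
 * The compare-and-set below can fail if another processor concurrently
 * sets PG_A or PG_M in this PTE; in that case the PTE is reread at
 * "retry" and the clearing of PG_RW and PG_M is attempted again.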
5065 */ 5066 if (!atomic_cmpset_int((u_int *)pte, oldpte, 5067 oldpte & ~(PG_RW | PG_M))) 5068 goto retry; 5069 if ((oldpte & PG_M) != 0) 5070 vm_page_dirty(m); 5071 pmap_invalidate_page_int(pmap, pv->pv_va); 5072 } 5073 PMAP_UNLOCK(pmap); 5074 } 5075 vm_page_aflag_clear(m, PGA_WRITEABLE); 5076 sched_unpin(); 5077 rw_wunlock(&pvh_global_lock); 5078 } 5079 5080 /* 5081 * pmap_ts_referenced: 5082 * 5083 * Return a count of reference bits for a page, clearing those bits. 5084 * It is not necessary for every reference bit to be cleared, but it 5085 * is necessary that 0 only be returned when there are truly no 5086 * reference bits set. 5087 * 5088 * As an optimization, update the page's dirty field if a modified bit is 5089 * found while counting reference bits. This opportunistic update can be 5090 * performed at low cost and can eliminate the need for some future calls 5091 * to pmap_is_modified(). However, since this function stops after 5092 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5093 * dirty pages. Those dirty pages will only be detected by a future call 5094 * to pmap_is_modified(). 5095 */ 5096 static int 5097 __CONCAT(PMTYPE, ts_referenced)(vm_page_t m) 5098 { 5099 struct md_page *pvh; 5100 pv_entry_t pv, pvf; 5101 pmap_t pmap; 5102 pd_entry_t *pde; 5103 pt_entry_t *pte; 5104 vm_paddr_t pa; 5105 int rtval = 0; 5106 5107 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5108 ("pmap_ts_referenced: page %p is not managed", m)); 5109 pa = VM_PAGE_TO_PHYS(m); 5110 pvh = pa_to_pvh(pa); 5111 rw_wlock(&pvh_global_lock); 5112 sched_pin(); 5113 if ((m->flags & PG_FICTITIOUS) != 0 || 5114 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5115 goto small_mappings; 5116 pv = pvf; 5117 do { 5118 pmap = PV_PMAP(pv); 5119 PMAP_LOCK(pmap); 5120 pde = pmap_pde(pmap, pv->pv_va); 5121 if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5122 /* 5123 * Although "*pde" is mapping a 2/4MB page, because 5124 * this function is called at a 4KB page granularity, 5125 * we only update the 4KB page under test. 5126 */ 5127 vm_page_dirty(m); 5128 } 5129 if ((*pde & PG_A) != 0) { 5130 /* 5131 * Since this reference bit is shared by either 1024 5132 * or 512 4KB pages, it should not be cleared every 5133 * time it is tested. Apply a simple "hash" function 5134 * on the physical page number, the virtual superpage 5135 * number, and the pmap address to select one 4KB page 5136 * out of the 1024 or 512 on which testing the 5137 * reference bit will result in clearing that bit. 5138 * This function is designed to avoid the selection of 5139 * the same 4KB page for every 2- or 4MB page mapping. 5140 * 5141 * On demotion, a mapping that hasn't been referenced 5142 * is simply destroyed. To avoid the possibility of a 5143 * subsequent page fault on a demoted wired mapping, 5144 * always leave its reference bit set. Moreover, 5145 * since the superpage is wired, the current state of 5146 * its reference bit won't affect page replacement. 5147 */ 5148 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 5149 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 5150 (*pde & PG_W) == 0) { 5151 atomic_clear_int((u_int *)pde, PG_A); 5152 pmap_invalidate_page_int(pmap, pv->pv_va); 5153 } 5154 rtval++; 5155 } 5156 PMAP_UNLOCK(pmap); 5157 /* Rotate the PV list if it has more than one entry. 
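 * Moving the entry just examined to the tail spreads the clearing of
 * reference bits over all of the page's pmaps and lets a later call
 * pick up roughly where this one stopped once PMAP_TS_REFERENCED_MAX
 * bits have been counted.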
*/ 5158 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5159 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5160 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5161 } 5162 if (rtval >= PMAP_TS_REFERENCED_MAX) 5163 goto out; 5164 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5165 small_mappings: 5166 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5167 goto out; 5168 pv = pvf; 5169 do { 5170 pmap = PV_PMAP(pv); 5171 PMAP_LOCK(pmap); 5172 pde = pmap_pde(pmap, pv->pv_va); 5173 KASSERT((*pde & PG_PS) == 0, 5174 ("pmap_ts_referenced: found a 4mpage in page %p's pv list", 5175 m)); 5176 pte = pmap_pte_quick(pmap, pv->pv_va); 5177 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5178 vm_page_dirty(m); 5179 if ((*pte & PG_A) != 0) { 5180 atomic_clear_int((u_int *)pte, PG_A); 5181 pmap_invalidate_page_int(pmap, pv->pv_va); 5182 rtval++; 5183 } 5184 PMAP_UNLOCK(pmap); 5185 /* Rotate the PV list if it has more than one entry. */ 5186 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5187 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5188 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5189 } 5190 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5191 PMAP_TS_REFERENCED_MAX); 5192 out: 5193 sched_unpin(); 5194 rw_wunlock(&pvh_global_lock); 5195 return (rtval); 5196 } 5197 5198 /* 5199 * Apply the given advice to the specified range of addresses within the 5200 * given pmap. Depending on the advice, clear the referenced and/or 5201 * modified flags in each mapping and set the mapped page's dirty field. 5202 */ 5203 static void 5204 __CONCAT(PMTYPE, advise)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 5205 int advice) 5206 { 5207 pd_entry_t oldpde, *pde; 5208 pt_entry_t *pte; 5209 vm_offset_t va, pdnxt; 5210 vm_page_t m; 5211 bool anychanged, pv_lists_locked; 5212 5213 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5214 return; 5215 if (pmap_is_current(pmap)) 5216 pv_lists_locked = false; 5217 else { 5218 pv_lists_locked = true; 5219 resume: 5220 rw_wlock(&pvh_global_lock); 5221 sched_pin(); 5222 } 5223 anychanged = false; 5224 PMAP_LOCK(pmap); 5225 for (; sva < eva; sva = pdnxt) { 5226 pdnxt = (sva + NBPDR) & ~PDRMASK; 5227 if (pdnxt < sva) 5228 pdnxt = eva; 5229 pde = pmap_pde(pmap, sva); 5230 oldpde = *pde; 5231 if ((oldpde & PG_V) == 0) 5232 continue; 5233 else if ((oldpde & PG_PS) != 0) { 5234 if ((oldpde & PG_MANAGED) == 0) 5235 continue; 5236 if (!pv_lists_locked) { 5237 pv_lists_locked = true; 5238 if (!rw_try_wlock(&pvh_global_lock)) { 5239 if (anychanged) 5240 pmap_invalidate_all_int(pmap); 5241 PMAP_UNLOCK(pmap); 5242 goto resume; 5243 } 5244 sched_pin(); 5245 } 5246 if (!pmap_demote_pde(pmap, pde, sva)) { 5247 /* 5248 * The large page mapping was destroyed. 5249 */ 5250 continue; 5251 } 5252 5253 /* 5254 * Unless the page mappings are wired, remove the 5255 * mapping to a single page so that a subsequent 5256 * access may repromote. Choosing the last page 5257 * within the address range [sva, min(pdnxt, eva)) 5258 * generally results in more repromotions. Since the 5259 * underlying page table page is fully populated, this 5260 * removal never frees a page table page. 
5261 */ 5262 if ((oldpde & PG_W) == 0) { 5263 va = eva; 5264 if (va > pdnxt) 5265 va = pdnxt; 5266 va -= PAGE_SIZE; 5267 KASSERT(va >= sva, 5268 ("pmap_advise: no address gap")); 5269 pte = pmap_pte_quick(pmap, va); 5270 KASSERT((*pte & PG_V) != 0, 5271 ("pmap_advise: invalid PTE")); 5272 pmap_remove_pte(pmap, pte, va, NULL); 5273 anychanged = true; 5274 } 5275 } 5276 if (pdnxt > eva) 5277 pdnxt = eva; 5278 va = pdnxt; 5279 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 5280 sva += PAGE_SIZE) { 5281 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 5282 goto maybe_invlrng; 5283 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5284 if (advice == MADV_DONTNEED) { 5285 /* 5286 * Future calls to pmap_is_modified() 5287 * can be avoided by making the page 5288 * dirty now. 5289 */ 5290 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 5291 vm_page_dirty(m); 5292 } 5293 atomic_clear_int((u_int *)pte, PG_M | PG_A); 5294 } else if ((*pte & PG_A) != 0) 5295 atomic_clear_int((u_int *)pte, PG_A); 5296 else 5297 goto maybe_invlrng; 5298 if ((*pte & PG_G) != 0) { 5299 if (va == pdnxt) 5300 va = sva; 5301 } else 5302 anychanged = true; 5303 continue; 5304 maybe_invlrng: 5305 if (va != pdnxt) { 5306 pmap_invalidate_range_int(pmap, va, sva); 5307 va = pdnxt; 5308 } 5309 } 5310 if (va != pdnxt) 5311 pmap_invalidate_range_int(pmap, va, sva); 5312 } 5313 if (anychanged) 5314 pmap_invalidate_all_int(pmap); 5315 if (pv_lists_locked) { 5316 sched_unpin(); 5317 rw_wunlock(&pvh_global_lock); 5318 } 5319 PMAP_UNLOCK(pmap); 5320 } 5321 5322 /* 5323 * Clear the modify bits on the specified physical page. 5324 */ 5325 static void 5326 __CONCAT(PMTYPE, clear_modify)(vm_page_t m) 5327 { 5328 struct md_page *pvh; 5329 pv_entry_t next_pv, pv; 5330 pmap_t pmap; 5331 pd_entry_t oldpde, *pde; 5332 pt_entry_t *pte; 5333 vm_offset_t va; 5334 5335 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5336 ("pmap_clear_modify: page %p is not managed", m)); 5337 vm_page_assert_busied(m); 5338 5339 if (!pmap_page_is_write_mapped(m)) 5340 return; 5341 rw_wlock(&pvh_global_lock); 5342 sched_pin(); 5343 if ((m->flags & PG_FICTITIOUS) != 0) 5344 goto small_mappings; 5345 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5346 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5347 va = pv->pv_va; 5348 pmap = PV_PMAP(pv); 5349 PMAP_LOCK(pmap); 5350 pde = pmap_pde(pmap, va); 5351 oldpde = *pde; 5352 /* If oldpde has PG_RW set, then it also has PG_M set. */ 5353 if ((oldpde & PG_RW) != 0 && 5354 pmap_demote_pde(pmap, pde, va) && 5355 (oldpde & PG_W) == 0) { 5356 /* 5357 * Write protect the mapping to a single page so that 5358 * a subsequent write access may repromote. 5359 */ 5360 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 5361 pte = pmap_pte_quick(pmap, va); 5362 /* 5363 * Regardless of whether a pte is 32 or 64 bits 5364 * in size, PG_RW and PG_M are among the least 5365 * significant 32 bits. 5366 */ 5367 atomic_clear_int((u_int *)pte, PG_M | PG_RW); 5368 vm_page_dirty(m); 5369 pmap_invalidate_page_int(pmap, va); 5370 } 5371 PMAP_UNLOCK(pmap); 5372 } 5373 small_mappings: 5374 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5375 pmap = PV_PMAP(pv); 5376 PMAP_LOCK(pmap); 5377 pde = pmap_pde(pmap, pv->pv_va); 5378 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 5379 " a 4mpage in page %p's pv list", m)); 5380 pte = pmap_pte_quick(pmap, pv->pv_va); 5381 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5382 /* 5383 * Regardless of whether a pte is 32 or 64 bits 5384 * in size, PG_M is among the least significant 5385 * 32 bits. 
5386 */ 5387 atomic_clear_int((u_int *)pte, PG_M); 5388 pmap_invalidate_page_int(pmap, pv->pv_va); 5389 } 5390 PMAP_UNLOCK(pmap); 5391 } 5392 sched_unpin(); 5393 rw_wunlock(&pvh_global_lock); 5394 } 5395 5396 /* 5397 * Miscellaneous support routines follow 5398 */ 5399 5400 /* Adjust the cache mode for a 4KB page mapped via a PTE. */ 5401 static __inline void 5402 pmap_pte_attr(pt_entry_t *pte, int cache_bits) 5403 { 5404 u_int opte, npte; 5405 5406 /* 5407 * The cache mode bits are all in the low 32-bits of the 5408 * PTE, so we can just spin on updating the low 32-bits. 5409 */ 5410 do { 5411 opte = *(u_int *)pte; 5412 npte = opte & ~PG_PTE_CACHE; 5413 npte |= cache_bits; 5414 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 5415 } 5416 5417 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 5418 static __inline void 5419 pmap_pde_attr(pd_entry_t *pde, int cache_bits) 5420 { 5421 u_int opde, npde; 5422 5423 /* 5424 * The cache mode bits are all in the low 32-bits of the 5425 * PDE, so we can just spin on updating the low 32-bits. 5426 */ 5427 do { 5428 opde = *(u_int *)pde; 5429 npde = opde & ~PG_PDE_CACHE; 5430 npde |= cache_bits; 5431 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 5432 } 5433 5434 /* 5435 * Map a set of physical memory pages into the kernel virtual 5436 * address space. Return a pointer to where it is mapped. This 5437 * routine is intended to be used for mapping device memory, 5438 * NOT real memory. 5439 */ 5440 static void * 5441 __CONCAT(PMTYPE, mapdev_attr)(vm_paddr_t pa, vm_size_t size, int mode, 5442 int flags) 5443 { 5444 struct pmap_preinit_mapping *ppim; 5445 vm_offset_t va, offset; 5446 vm_page_t m; 5447 vm_size_t tmpsize; 5448 int i; 5449 5450 offset = pa & PAGE_MASK; 5451 size = round_page(offset + size); 5452 pa = pa & PG_FRAME; 5453 5454 if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) { 5455 va = pa + PMAP_MAP_LOW; 5456 if ((flags & MAPDEV_SETATTR) == 0) 5457 return ((void *)(va + offset)); 5458 } else if (!pmap_initialized) { 5459 va = 0; 5460 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5461 ppim = pmap_preinit_mapping + i; 5462 if (ppim->va == 0) { 5463 ppim->pa = pa; 5464 ppim->sz = size; 5465 ppim->mode = mode; 5466 ppim->va = virtual_avail; 5467 virtual_avail += size; 5468 va = ppim->va; 5469 break; 5470 } 5471 } 5472 if (va == 0) 5473 panic("%s: too many preinit mappings", __func__); 5474 } else { 5475 /* 5476 * If we have a preinit mapping, re-use it. 
5477 */ 5478 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5479 ppim = pmap_preinit_mapping + i; 5480 if (ppim->pa == pa && ppim->sz == size && 5481 (ppim->mode == mode || 5482 (flags & MAPDEV_SETATTR) == 0)) 5483 return ((void *)(ppim->va + offset)); 5484 } 5485 va = kva_alloc(size); 5486 if (va == 0) 5487 panic("%s: Couldn't allocate KVA", __func__); 5488 } 5489 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) { 5490 if ((flags & MAPDEV_SETATTR) == 0 && pmap_initialized) { 5491 m = PHYS_TO_VM_PAGE(pa); 5492 if (m != NULL && VM_PAGE_TO_PHYS(m) == pa) { 5493 pmap_kenter_attr(va + tmpsize, pa + tmpsize, 5494 m->md.pat_mode); 5495 continue; 5496 } 5497 } 5498 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 5499 } 5500 pmap_invalidate_range_int(kernel_pmap, va, va + tmpsize); 5501 pmap_invalidate_cache_range(va, va + size); 5502 return ((void *)(va + offset)); 5503 } 5504 5505 static void 5506 __CONCAT(PMTYPE, unmapdev)(vm_offset_t va, vm_size_t size) 5507 { 5508 struct pmap_preinit_mapping *ppim; 5509 vm_offset_t offset; 5510 int i; 5511 5512 if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE) 5513 return; 5514 offset = va & PAGE_MASK; 5515 size = round_page(offset + size); 5516 va = trunc_page(va); 5517 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5518 ppim = pmap_preinit_mapping + i; 5519 if (ppim->va == va && ppim->sz == size) { 5520 if (pmap_initialized) 5521 return; 5522 ppim->pa = 0; 5523 ppim->va = 0; 5524 ppim->sz = 0; 5525 ppim->mode = 0; 5526 if (va + size == virtual_avail) 5527 virtual_avail = va; 5528 return; 5529 } 5530 } 5531 if (pmap_initialized) 5532 kva_free(va, size); 5533 } 5534 5535 /* 5536 * Sets the memory attribute for the specified page. 5537 */ 5538 static void 5539 __CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma) 5540 { 5541 5542 m->md.pat_mode = ma; 5543 if ((m->flags & PG_FICTITIOUS) != 0) 5544 return; 5545 5546 /* 5547 * If "m" is a normal page, flush it from the cache. 5548 * See pmap_invalidate_cache_range(). 5549 * 5550 * First, try to find an existing mapping of the page by sf 5551 * buffer. sf_buf_invalidate_cache() modifies mapping and 5552 * flushes the cache. 5553 */ 5554 if (sf_buf_invalidate_cache(m)) 5555 return; 5556 5557 /* 5558 * If page is not mapped by sf buffer, but CPU does not 5559 * support self snoop, map the page transient and do 5560 * invalidation. In the worst case, whole cache is flushed by 5561 * pmap_invalidate_cache_range(). 5562 */ 5563 if ((cpu_feature & CPUID_SS) == 0) 5564 pmap_flush_page(m); 5565 } 5566 5567 static void 5568 __CONCAT(PMTYPE, flush_page)(vm_page_t m) 5569 { 5570 pt_entry_t *cmap_pte2; 5571 struct pcpu *pc; 5572 vm_offset_t sva, eva; 5573 bool useclflushopt; 5574 5575 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 5576 if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { 5577 sched_pin(); 5578 pc = get_pcpu(); 5579 cmap_pte2 = pc->pc_cmap_pte2; 5580 mtx_lock(&pc->pc_cmap_lock); 5581 if (*cmap_pte2) 5582 panic("pmap_flush_page: CMAP2 busy"); 5583 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | 5584 PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 5585 0); 5586 invlcaddr(pc->pc_cmap_addr2); 5587 sva = (vm_offset_t)pc->pc_cmap_addr2; 5588 eva = sva + PAGE_SIZE; 5589 5590 /* 5591 * Use mfence or sfence despite the ordering implied by 5592 * mtx_{un,}lock() because clflush on non-Intel CPUs 5593 * and clflushopt are not guaranteed to be ordered by 5594 * any other instruction. 
5595 */ 5596 if (useclflushopt) 5597 sfence(); 5598 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5599 mfence(); 5600 for (; sva < eva; sva += cpu_clflush_line_size) { 5601 if (useclflushopt) 5602 clflushopt(sva); 5603 else 5604 clflush(sva); 5605 } 5606 if (useclflushopt) 5607 sfence(); 5608 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5609 mfence(); 5610 *cmap_pte2 = 0; 5611 sched_unpin(); 5612 mtx_unlock(&pc->pc_cmap_lock); 5613 } else 5614 pmap_invalidate_cache(); 5615 } 5616 5617 /* 5618 * Changes the specified virtual address range's memory type to that given by 5619 * the parameter "mode". The specified virtual address range must be 5620 * completely contained within either the kernel map. 5621 * 5622 * Returns zero if the change completed successfully, and either EINVAL or 5623 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5624 * of the virtual address range was not mapped, and ENOMEM is returned if 5625 * there was insufficient memory available to complete the change. 5626 */ 5627 static int 5628 __CONCAT(PMTYPE, change_attr)(vm_offset_t va, vm_size_t size, int mode) 5629 { 5630 vm_offset_t base, offset, tmpva; 5631 pd_entry_t *pde; 5632 pt_entry_t *pte; 5633 int cache_bits_pte, cache_bits_pde; 5634 boolean_t changed; 5635 5636 base = trunc_page(va); 5637 offset = va & PAGE_MASK; 5638 size = round_page(offset + size); 5639 5640 /* 5641 * Only supported on kernel virtual addresses above the recursive map. 5642 */ 5643 if (base < VM_MIN_KERNEL_ADDRESS) 5644 return (EINVAL); 5645 5646 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 5647 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 5648 changed = FALSE; 5649 5650 /* 5651 * Pages that aren't mapped aren't supported. Also break down 5652 * 2/4MB pages into 4KB pages if required. 5653 */ 5654 PMAP_LOCK(kernel_pmap); 5655 for (tmpva = base; tmpva < base + size; ) { 5656 pde = pmap_pde(kernel_pmap, tmpva); 5657 if (*pde == 0) { 5658 PMAP_UNLOCK(kernel_pmap); 5659 return (EINVAL); 5660 } 5661 if (*pde & PG_PS) { 5662 /* 5663 * If the current 2/4MB page already has 5664 * the required memory type, then we need not 5665 * demote this page. Just increment tmpva to 5666 * the next 2/4MB page frame. 5667 */ 5668 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 5669 tmpva = trunc_4mpage(tmpva) + NBPDR; 5670 continue; 5671 } 5672 5673 /* 5674 * If the current offset aligns with a 2/4MB 5675 * page frame and there is at least 2/4MB left 5676 * within the range, then we need not break 5677 * down this page into 4KB pages. 5678 */ 5679 if ((tmpva & PDRMASK) == 0 && 5680 tmpva + PDRMASK < base + size) { 5681 tmpva += NBPDR; 5682 continue; 5683 } 5684 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 5685 PMAP_UNLOCK(kernel_pmap); 5686 return (ENOMEM); 5687 } 5688 } 5689 pte = vtopte(tmpva); 5690 if (*pte == 0) { 5691 PMAP_UNLOCK(kernel_pmap); 5692 return (EINVAL); 5693 } 5694 tmpva += PAGE_SIZE; 5695 } 5696 PMAP_UNLOCK(kernel_pmap); 5697 5698 /* 5699 * Ok, all the pages exist, so run through them updating their 5700 * cache mode if required. 
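 *
 * This pass only rewrites entries whose cache bits differ from the
 * requested mode.  The first pass above already demoted any 2/4MB
 * mappings that have to be split and returned an error if part of the
 * range was unmapped, so nothing can fail from here on.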
5701 */ 5702 for (tmpva = base; tmpva < base + size; ) { 5703 pde = pmap_pde(kernel_pmap, tmpva); 5704 if (*pde & PG_PS) { 5705 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5706 pmap_pde_attr(pde, cache_bits_pde); 5707 changed = TRUE; 5708 } 5709 tmpva = trunc_4mpage(tmpva) + NBPDR; 5710 } else { 5711 pte = vtopte(tmpva); 5712 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5713 pmap_pte_attr(pte, cache_bits_pte); 5714 changed = TRUE; 5715 } 5716 tmpva += PAGE_SIZE; 5717 } 5718 } 5719 5720 /* 5721 * Flush CPU caches to make sure any data isn't cached that 5722 * shouldn't be, etc. 5723 */ 5724 if (changed) { 5725 pmap_invalidate_range_int(kernel_pmap, base, tmpva); 5726 pmap_invalidate_cache_range(base, tmpva); 5727 } 5728 return (0); 5729 } 5730 5731 /* 5732 * Perform the pmap work for mincore(2). If the page is not both referenced and 5733 * modified by this pmap, returns its physical address so that the caller can 5734 * find other mappings. 5735 */ 5736 static int 5737 __CONCAT(PMTYPE, mincore)(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 5738 { 5739 pd_entry_t pde; 5740 pt_entry_t pte; 5741 vm_paddr_t pa; 5742 int val; 5743 5744 PMAP_LOCK(pmap); 5745 pde = *pmap_pde(pmap, addr); 5746 if (pde != 0) { 5747 if ((pde & PG_PS) != 0) { 5748 pte = pde; 5749 /* Compute the physical address of the 4KB page. */ 5750 pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) & 5751 PG_FRAME; 5752 val = MINCORE_SUPER; 5753 } else { 5754 pte = pmap_pte_ufast(pmap, addr, pde); 5755 pa = pte & PG_FRAME; 5756 val = 0; 5757 } 5758 } else { 5759 pte = 0; 5760 pa = 0; 5761 val = 0; 5762 } 5763 if ((pte & PG_V) != 0) { 5764 val |= MINCORE_INCORE; 5765 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5766 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5767 if ((pte & PG_A) != 0) 5768 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5769 } 5770 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5771 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5772 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5773 *pap = pa; 5774 } 5775 PMAP_UNLOCK(pmap); 5776 return (val); 5777 } 5778 5779 static void 5780 __CONCAT(PMTYPE, activate)(struct thread *td) 5781 { 5782 pmap_t pmap, oldpmap; 5783 u_int cpuid; 5784 u_int32_t cr3; 5785 5786 critical_enter(); 5787 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5788 oldpmap = PCPU_GET(curpmap); 5789 cpuid = PCPU_GET(cpuid); 5790 #if defined(SMP) 5791 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5792 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5793 #else 5794 CPU_CLR(cpuid, &oldpmap->pm_active); 5795 CPU_SET(cpuid, &pmap->pm_active); 5796 #endif 5797 #ifdef PMAP_PAE_COMP 5798 cr3 = vtophys(pmap->pm_pdpt); 5799 #else 5800 cr3 = vtophys(pmap->pm_pdir); 5801 #endif 5802 /* 5803 * pmap_activate is for the current thread on the current cpu 5804 */ 5805 td->td_pcb->pcb_cr3 = cr3; 5806 PCPU_SET(curpmap, pmap); 5807 critical_exit(); 5808 } 5809 5810 static void 5811 __CONCAT(PMTYPE, activate_boot)(pmap_t pmap) 5812 { 5813 u_int cpuid; 5814 5815 cpuid = PCPU_GET(cpuid); 5816 #if defined(SMP) 5817 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5818 #else 5819 CPU_SET(cpuid, &pmap->pm_active); 5820 #endif 5821 PCPU_SET(curpmap, pmap); 5822 } 5823 5824 /* 5825 * Increase the starting virtual address of the given mapping if a 5826 * different alignment might result in more superpage mappings. 
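 *
 * For example, with 4 MB superpages (the constants are smaller under
 * PAE): if the object offset falls 0x100000 bytes into a superpage
 * (superpage_offset == 0x100000), the mapping is 8 MB long, and the
 * caller proposed *addr == 0x30200000, then *addr is advanced to
 * 0x30500000, making (*addr & PDRMASK) equal to superpage_offset so
 * that each superpage-aligned chunk of the object can be mapped by a
 * single PDE.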

/*
 * Perform the pmap work for mincore(2).  If the page is not both referenced
 * and modified by this pmap, returns its physical address so that the caller
 * can find other mappings.
 */
static int
__CONCAT(PMTYPE, mincore)(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
{
	pd_entry_t pde;
	pt_entry_t pte;
	vm_paddr_t pa;
	int val;

	PMAP_LOCK(pmap);
	pde = *pmap_pde(pmap, addr);
	if (pde != 0) {
		if ((pde & PG_PS) != 0) {
			pte = pde;
			/* Compute the physical address of the 4KB page. */
			pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) &
			    PG_FRAME;
			val = MINCORE_SUPER;
		} else {
			pte = pmap_pte_ufast(pmap, addr, pde);
			pa = pte & PG_FRAME;
			val = 0;
		}
	} else {
		pte = 0;
		pa = 0;
		val = 0;
	}
	if ((pte & PG_V) != 0) {
		val |= MINCORE_INCORE;
		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((pte & PG_A) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	}
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
		*pap = pa;
	}
	PMAP_UNLOCK(pmap);
	return (val);
}

static void
__CONCAT(PMTYPE, activate)(struct thread *td)
{
	pmap_t pmap, oldpmap;
	u_int cpuid;
	u_int32_t cr3;

	critical_enter();
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	oldpmap = PCPU_GET(curpmap);
	cpuid = PCPU_GET(cpuid);
#if defined(SMP)
	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
#else
	CPU_CLR(cpuid, &oldpmap->pm_active);
	CPU_SET(cpuid, &pmap->pm_active);
#endif
#ifdef PMAP_PAE_COMP
	cr3 = vtophys(pmap->pm_pdpt);
#else
	cr3 = vtophys(pmap->pm_pdir);
#endif
	/*
	 * pmap_activate is for the current thread on the current cpu.
	 */
	td->td_pcb->pcb_cr3 = cr3;
	PCPU_SET(curpmap, pmap);
	critical_exit();
}

static void
__CONCAT(PMTYPE, activate_boot)(pmap_t pmap)
{
	u_int cpuid;

	cpuid = PCPU_GET(cpuid);
#if defined(SMP)
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
#else
	CPU_SET(cpuid, &pmap->pm_active);
#endif
	PCPU_SET(curpmap, pmap);
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
static void
__CONCAT(PMTYPE, align_superpage)(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < NBPDR)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & PDRMASK;
	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
	    (*addr & PDRMASK) == superpage_offset)
		return;
	if ((*addr & PDRMASK) < superpage_offset)
		*addr = (*addr & ~PDRMASK) + superpage_offset;
	else
		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}
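
/*
 * Worked example for the alignment rule above (illustrative numbers only):
 * with 4MB superpages (non-PAE, NBPDR == 4MB, PDRMASK == 0x3fffff) and a
 * sufficiently large request, an object whose relevant offset is 0x00523000
 * has superpage_offset == 0x123000.  A proposed *addr of 0xbfc80000 has
 * (*addr & PDRMASK) == 0x080000 < 0x123000, so the address is moved forward
 * to 0xbfd23000.  Afterwards *addr and offset are congruent modulo NBPDR,
 * which is what allows the mapping to be promoted to superpages later.
 */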

static vm_offset_t
__CONCAT(PMTYPE, quick_enter_page)(vm_page_t m)
{
	vm_offset_t qaddr;
	pt_entry_t *pte;

	critical_enter();
	qaddr = PCPU_GET(qmap_addr);
	pte = vtopte(qaddr);

	KASSERT(*pte == 0,
	    ("pmap_quick_enter_page: PTE busy %#jx", (uintmax_t)*pte));
	*pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
	    pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(m), 0);
	invlpg(qaddr);

	return (qaddr);
}

static void
__CONCAT(PMTYPE, quick_remove_page)(vm_offset_t addr)
{
	vm_offset_t qaddr;
	pt_entry_t *pte;

	qaddr = PCPU_GET(qmap_addr);
	pte = vtopte(qaddr);

	KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
	KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));

	*pte = 0;
	critical_exit();
}
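
/*
 * A sketch of how the two functions above are meant to be paired by a
 * caller, via the machine-independent pmap_quick_enter_page() and
 * pmap_quick_remove_page() entry points: the mapping uses a per-CPU
 * slot and is bracketed by critical_enter()/critical_exit(), so the
 * code in between must be short and must not sleep.
 *
 *	vm_offset_t va;
 *
 *	va = pmap_quick_enter_page(m);
 *	memcpy((void *)va, buf, PAGE_SIZE);
 *	pmap_quick_remove_page(va);
 */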

static vmem_t *pmap_trm_arena;
static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS;
static int trm_guard = PAGE_SIZE;

static int
pmap_trm_import(void *unused __unused, vmem_size_t size, int flags,
    vmem_addr_t *addrp)
{
	vm_page_t m;
	vmem_addr_t af, addr, prev_addr;
	pt_entry_t *trm_pte;

	prev_addr = atomic_load_long(&pmap_trm_arena_last);
	size = round_page(size) + trm_guard;
	for (;;) {
		if (prev_addr + size < prev_addr || prev_addr + size < size ||
		    prev_addr + size > PMAP_TRM_MAX_ADDRESS)
			return (ENOMEM);
		addr = prev_addr + size;
		if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr))
			break;
	}
	prev_addr += trm_guard;
	trm_pte = PTmap + atop(prev_addr);
	for (af = prev_addr; af < addr; af += PAGE_SIZE) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
		pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) |
		    PG_M | PG_A | PG_RW | PG_V | pgeflag |
		    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE));
	}
	*addrp = prev_addr;
	return (0);
}

void
pmap_init_trm(void)
{
	vm_page_t pd_m;

	TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard);
	if ((trm_guard & PAGE_MASK) != 0)
		trm_guard = 0;
	pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK);
	vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE);
	pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO);
	if ((pd_m->flags & PG_ZERO) == 0)
		pmap_zero_page(pd_m);
	PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V |
	    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, TRUE);
}

static void *
__CONCAT(PMTYPE, trm_alloc)(size_t size, int flags)
{
	vmem_addr_t res;
	int error;

	MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0);
	error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int),
	    0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res);
	if (error != 0)
		return (NULL);
	if ((flags & M_ZERO) != 0)
		bzero((void *)res, size);
	return ((void *)res);
}

static void
__CONCAT(PMTYPE, trm_free)(void *addr, size_t size)
{

	vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4));
}
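
/*
 * A sketch of the intended use of the trampoline allocator above,
 * assuming the machine-independent pmap_trm_alloc()/pmap_trm_free()
 * wrappers that dispatch to these methods: memory that must be
 * addressable from the trampoline range (PMAP_TRM_MIN_ADDRESS ..
 * PMAP_TRM_MAX_ADDRESS) is carved out of the "i386trampoline" vmem
 * arena.  The arena grows by bump-allocating page-aligned chunks in
 * pmap_trm_import(), each preceded by a trm_guard-sized gap, and the
 * backing pages stay wired; freeing only returns virtual addresses to
 * the arena.
 *
 *	void *p;
 *
 *	p = pmap_trm_alloc(len, M_WAITOK | M_ZERO);
 *	...
 *	pmap_trm_free(p, len);
 */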

static void
__CONCAT(PMTYPE, ksetrw)(vm_offset_t va)
{

	*vtopte(va) |= PG_RW;
}

static void
__CONCAT(PMTYPE, remap_lowptdi)(bool enable)
{

	PTD[KPTDI] = enable ? PTD[LOWPTDI] : 0;
	invltlb_glob();
}

static vm_offset_t
__CONCAT(PMTYPE, get_map_low)(void)
{

	return (PMAP_MAP_LOW);
}

static vm_offset_t
__CONCAT(PMTYPE, get_vm_maxuser_address)(void)
{

	return (VM_MAXUSER_ADDRESS);
}

static vm_paddr_t
__CONCAT(PMTYPE, pg_frame)(vm_paddr_t pa)
{

	return (pa & PG_FRAME);
}

static void
__CONCAT(PMTYPE, sf_buf_map)(struct sf_buf *sf)
{
	pt_entry_t opte, *ptep;

	/*
	 * Update the sf_buf's virtual-to-physical mapping, flushing the
	 * virtual address from the TLB.  Since the reference count for
	 * the sf_buf's old mapping was zero, that mapping is not
	 * currently in use.  Consequently, there is no need to exchange
	 * the old and new PTEs atomically, even under PAE.
	 */
	ptep = vtopte(sf->kva);
	opte = *ptep;
	*ptep = VM_PAGE_TO_PHYS(sf->m) | PG_RW | PG_V |
	    pmap_cache_bits(kernel_pmap, sf->m->md.pat_mode, 0);

	/*
	 * Avoid unnecessary TLB invalidations: If the sf_buf's old
	 * virtual-to-physical mapping was not used, then any processor
	 * that has invalidated the sf_buf's virtual address from its TLB
	 * since the last used mapping need not invalidate again.
	 */
#ifdef SMP
	if ((opte & (PG_V | PG_A)) == (PG_V | PG_A))
		CPU_ZERO(&sf->cpumask);
#else
	if ((opte & (PG_V | PG_A)) == (PG_V | PG_A))
		pmap_invalidate_page_int(kernel_pmap, sf->kva);
#endif
}

static void
__CONCAT(PMTYPE, cp_slow0_map)(vm_offset_t kaddr, int plen, vm_page_t *ma)
{
	pt_entry_t *pte;
	int i;

	for (i = 0, pte = vtopte(kaddr); i < plen; i++, pte++) {
		*pte = PG_V | PG_RW | PG_A | PG_M | VM_PAGE_TO_PHYS(ma[i]) |
		    pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(ma[i]),
		    FALSE);
		invlpg(kaddr + ptoa(i));
	}
}

static u_int
__CONCAT(PMTYPE, get_kcr3)(void)
{

#ifdef PMAP_PAE_COMP
	return ((u_int)IdlePDPT);
#else
	return ((u_int)IdlePTD);
#endif
}

static u_int
__CONCAT(PMTYPE, get_cr3)(pmap_t pmap)
{

#ifdef PMAP_PAE_COMP
	return ((u_int)vtophys(pmap->pm_pdpt));
#else
	return ((u_int)vtophys(pmap->pm_pdir));
#endif
}

static caddr_t
__CONCAT(PMTYPE, cmap3)(vm_paddr_t pa, u_int pte_bits)
{
	pt_entry_t *pte;

	pte = CMAP3;
	*pte = pa | pte_bits;
	invltlb();
	return (CADDR3);
}

static void
__CONCAT(PMTYPE, basemem_setup)(u_int basemem)
{
	pt_entry_t *pte;
	int i;

	/*
	 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
	 * the vm86 page table so that vm86 can scribble on them using
	 * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
	 * page 0, at least as initialized here?
	 */
	pte = (pt_entry_t *)vm86paddr;
	for (i = basemem / 4; i < 160; i++)
		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}

struct bios16_pmap_handle {
	pt_entry_t	*pte;
	pd_entry_t	*ptd;
	pt_entry_t	orig_ptd;
};

static void *
__CONCAT(PMTYPE, bios16_enter)(void)
{
	struct bios16_pmap_handle *h;

	/*
	 * no page table, so create one and install it.
	 */
	h = malloc(sizeof(struct bios16_pmap_handle), M_TEMP, M_WAITOK);
	h->pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
	h->ptd = IdlePTD;
	*h->pte = vm86phystk | PG_RW | PG_V;
	h->orig_ptd = *h->ptd;
	*h->ptd = vtophys(h->pte) | PG_RW | PG_V;
	pmap_invalidate_all_int(kernel_pmap);	/* XXX insurance for now */
	return (h);
}

static void
__CONCAT(PMTYPE, bios16_leave)(void *arg)
{
	struct bios16_pmap_handle *h;

	h = arg;
	*h->ptd = h->orig_ptd;		/* remove page table */
	/*
	 * XXX only needs to be invlpg(0) but that doesn't work on the 386
	 */
	pmap_invalidate_all_int(kernel_pmap);
	free(h->pte, M_TEMP);		/* ... and free it */
}
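
/*
 * A sketch of how the two helpers above are paired by the BIOS call
 * code, assuming the corresponding pmap_bios16_enter() and
 * pmap_bios16_leave() wrappers generated from the method table below:
 * the opaque handle returned on entry carries enough state to restore
 * the original page directory entry afterwards.
 *
 *	void *handle;
 *
 *	handle = pmap_bios16_enter();
 *	... perform the 16-bit BIOS call using the low mapping ...
 *	pmap_bios16_leave(handle);
 */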

struct pmap_kernel_map_range {
	vm_offset_t sva;
	pt_entry_t attrs;
	int ptes;
	int pdes;
	int pdpes;
};

static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{
	const char *mode;
	int i, pat_idx;

	if (eva <= range->sva)
		return;

	pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		if (pat_index[i] == pat_idx)
			break;

	switch (i) {
	case PAT_WRITE_BACK:
		mode = "WB";
		break;
	case PAT_WRITE_THROUGH:
		mode = "WT";
		break;
	case PAT_UNCACHEABLE:
		mode = "UC";
		break;
	case PAT_UNCACHED:
		mode = "U-";
		break;
	case PAT_WRITE_PROTECTED:
		mode = "WP";
		break;
	case PAT_WRITE_COMBINING:
		mode = "WC";
		break;
	default:
		printf("%s: unknown PAT mode %#x for range 0x%08x-0x%08x\n",
		    __func__, pat_idx, range->sva, eva);
		mode = "??";
		break;
	}

	sbuf_printf(sb, "0x%08x-0x%08x r%c%c%c%c %s %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & PG_RW) != 0 ? 'w' : '-',
	    (range->attrs & pg_nx) != 0 ? '-' : 'x',
	    (range->attrs & PG_U) != 0 ? 'u' : 's',
	    (range->attrs & PG_G) != 0 ? 'g' : '-',
	    mode, range->pdpes, range->pdes, range->ptes);

	/* Reset to sentinel value. */
	range->sva = 0xffffffff;
}
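
/*
 * For reference, each line emitted above has the form (example values
 * only, not taken from a live system):
 *
 *	0xc0400000-0xc1000000 rwxsg WB 0 3 0
 *
 * i.e. start and end address, a permission/scope summary (writable,
 * executable, user or supervisor, global), the PAT-derived cache mode,
 * and the counts of PDPE, PDE and PTE mappings making up the range.
 */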

/*
 * Determine whether the attributes specified by a page table entry match
 * those being tracked by the current range.  This is not quite as simple
 * as a direct flag comparison since some PAT modes have multiple
 * representations.
 */
static bool
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
{
	pt_entry_t diff, mask;

	mask = pg_nx | PG_G | PG_RW | PG_U | PG_PDE_CACHE;
	diff = (range->attrs ^ attrs) & mask;
	if (diff == 0)
		return (true);
	if ((diff & ~PG_PDE_PAT) == 0 &&
	    pmap_pat_index(kernel_pmap, range->attrs, true) ==
	    pmap_pat_index(kernel_pmap, attrs, true))
		return (true);
	return (false);
}

static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
    pt_entry_t attrs)
{

	memset(range, 0, sizeof(*range));
	range->sva = va;
	range->attrs = attrs;
}

/*
 * Given a leaf PTE, derive the mapping's attributes.  If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pd_entry_t pde, pt_entry_t pte)
{
	pt_entry_t attrs;

	attrs = pde & (PG_RW | PG_U | pg_nx);

	if ((pde & PG_PS) != 0) {
		attrs |= pde & (PG_G | PG_PDE_CACHE);
	} else if (pte != 0) {
		attrs |= pte & pg_nx;
		attrs &= pg_nx | (pte & (PG_RW | PG_U));
		attrs |= pte & (PG_G | PG_PTE_CACHE);

		/* Canonicalize by always using the PDE PAT bit. */
		if ((attrs & PG_PTE_PAT) != 0)
			attrs ^= PG_PDE_PAT | PG_PTE_PAT;
	}

	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

static int
__CONCAT(PMTYPE, sysctl_kmaps)(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pd_entry_t pde;
	pt_entry_t *pt, pte;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error;
	u_int i, k;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = 0xffffffff;

	/*
	 * Iterate over the kernel page tables without holding the
	 * kernel pmap lock.  Kernel page table pages are never freed,
	 * so at worst we will observe inconsistencies in the output.
	 */
	for (sva = 0, i = 0; i < NPTEPG * NPGPTD * NPDEPG;) {
		if (i == 0)
			sbuf_printf(sb, "\nLow PDE:\n");
		else if (i == LOWPTDI * NPTEPG)
			sbuf_printf(sb, "Low PDE dup:\n");
		else if (i == PTDPTDI * NPTEPG)
			sbuf_printf(sb, "Recursive map:\n");
		else if (i == KERNPTDI * NPTEPG)
			sbuf_printf(sb, "Kernel base:\n");
		else if (i == TRPTDI * NPTEPG)
			sbuf_printf(sb, "Trampoline:\n");
		pde = IdlePTD[sva >> PDRSHIFT];
		if ((pde & PG_V) == 0) {
			sva = rounddown2(sva, NBPDR);
			sysctl_kmaps_dump(sb, &range, sva);
			sva += NBPDR;
			i += NPTEPG;
			continue;
		}
		pa = pde & PG_FRAME;
		if ((pde & PG_PS) != 0) {
			sysctl_kmaps_check(sb, &range, sva, pde, 0);
			range.pdes++;
			sva += NBPDR;
			i += NPTEPG;
			continue;
		}
		for (pt = vtopte(sva), k = 0; k < NPTEPG; i++, k++, pt++,
		    sva += PAGE_SIZE) {
			pte = *pt;
			if ((pte & PG_V) == 0) {
				sysctl_kmaps_dump(sb, &range, sva);
				continue;
			}
			sysctl_kmaps_check(sb, &range, sva, pde, pte);
			range.ptes++;
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}

#define	PMM(a)	\
	.pm_##a = __CONCAT(PMTYPE, a),

struct pmap_methods __CONCAT(PMTYPE, methods) = {
	PMM(ksetrw)
	PMM(remap_lower)
	PMM(remap_lowptdi)
	PMM(align_superpage)
	PMM(quick_enter_page)
	PMM(quick_remove_page)
	PMM(trm_alloc)
	PMM(trm_free)
	PMM(get_map_low)
	PMM(get_vm_maxuser_address)
	PMM(kextract)
	PMM(pg_frame)
	PMM(sf_buf_map)
	PMM(cp_slow0_map)
	PMM(get_kcr3)
	PMM(get_cr3)
	PMM(cmap3)
	PMM(basemem_setup)
	PMM(set_nx)
	PMM(bios16_enter)
	PMM(bios16_leave)
	PMM(bootstrap)
	PMM(is_valid_memattr)
	PMM(cache_bits)
	PMM(ps_enabled)
	PMM(pinit0)
	PMM(pinit)
	PMM(activate)
	PMM(activate_boot)
	PMM(advise)
	PMM(clear_modify)
	PMM(change_attr)
	PMM(mincore)
	PMM(copy)
	PMM(copy_page)
	PMM(copy_pages)
	PMM(zero_page)
	PMM(zero_page_area)
	PMM(enter)
	PMM(enter_object)
	PMM(enter_quick)
	PMM(kenter_temporary)
	PMM(object_init_pt)
	PMM(unwire)
	PMM(page_exists_quick)
	PMM(page_wired_mappings)
	PMM(page_is_mapped)
	PMM(remove_pages)
	PMM(is_modified)
	PMM(is_prefaultable)
	PMM(is_referenced)
	PMM(remove_write)
	PMM(ts_referenced)
	PMM(mapdev_attr)
	PMM(unmapdev)
	PMM(page_set_memattr)
	PMM(extract)
	PMM(extract_and_hold)
	PMM(map)
	PMM(qenter)
	PMM(qremove)
	PMM(release)
	PMM(remove)
	PMM(protect)
	PMM(remove_all)
	PMM(init)
	PMM(init_pat)
	PMM(growkernel)
	PMM(invalidate_page)
	PMM(invalidate_range)
	PMM(invalidate_all)
	PMM(invalidate_cache)
	PMM(flush_page)
	PMM(kenter)
	PMM(kremove)
	PMM(sysctl_kmaps)
};
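
/*
 * For reference, each PMM() entry above expands, per the definition of
 * PMM() just before the table, to a designated initializer naming this
 * compilation's instance of the corresponding method, e.g. assuming a
 * PMTYPE of pmap_pae_:
 *
 *	PMM(kenter)	->	.pm_kenter = pmap_pae_kenter,
 *
 * The machine-independent pmap_*() entry points are expected to
 * dispatch through this method table; the dispatch layer itself lives
 * outside this file.
 */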