/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 * Copyright (c) 2018 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidations expensive,
 * this module may delay invalidation or protection-
 * reduction operations until such time as they are
 * actually necessary.  This module is given full
 * information as to which processors are currently
 * using which maps, and as to when physical maps must
 * be made correct.
 */

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_pmap.h"
#include "opt_smp.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sf_buf.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#include <sys/vmem.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#ifdef DEV_APIC
#include <sys/bus.h>
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#endif
#include <x86/ifunc.h>
#include <machine/bootinfo.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/pmap_base.h>

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

/*
 * PTmap is the recursive pagemap at the top of the virtual address space.
 * Within PTmap, the page directory can be found (third indirection).
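 *
 * Illustrative sketch (an addition, not part of the original comment): with
 * the recursive slot installed at page-directory index PTDPTDI, an access
 * such as
 *
 *	pte = PTmap[i386_btop(va)];
 *
 * is translated by the MMU as PTDPTDI -> PD entry for "va" -> PT page for
 * "va", so the 4KB page reached is the page table page itself and the value
 * read is the real PTE for "va".  Indexing by PTDPTDI a second time lands in
 * the page directory, which is why PTD and PTDpde below are expressed as
 * offsets of PTDPTDI * PAGE_SIZE and PTDPTDI * PDESIZE from PTmap.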
172 */ 173 #define PTmap ((pt_entry_t *)(PTDPTDI << PDRSHIFT)) 174 #define PTD ((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE))) 175 #define PTDpde ((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE) + \ 176 (PTDPTDI * PDESIZE))) 177 178 /* 179 * Translate a virtual address to the kernel virtual address of its page table 180 * entry (PTE). This can be used recursively. If the address of a PTE as 181 * previously returned by this macro is itself given as the argument, then the 182 * address of the page directory entry (PDE) that maps the PTE will be 183 * returned. 184 * 185 * This macro may be used before pmap_bootstrap() is called. 186 */ 187 #define vtopte(va) (PTmap + i386_btop(va)) 188 189 /* 190 * Get PDEs and PTEs for user/kernel address space 191 */ 192 #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 193 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 194 195 #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 196 #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) 197 #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 198 #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) 199 #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 200 201 #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ 202 atomic_clear_int((u_int *)(pte), PG_W)) 203 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 204 205 static int pgeflag = 0; /* PG_G or-in */ 206 static int pseflag = 0; /* PG_PS or-in */ 207 208 static int nkpt = NKPT; 209 210 #ifdef PMAP_PAE_COMP 211 pt_entry_t pg_nx; 212 static uma_zone_t pdptzone; 213 #else 214 #define pg_nx 0 215 #endif 216 217 _Static_assert(VM_MAXUSER_ADDRESS == VADDR(TRPTDI, 0), "VM_MAXUSER_ADDRESS"); 218 _Static_assert(VM_MAX_KERNEL_ADDRESS <= VADDR(PTDPTDI, 0), 219 "VM_MAX_KERNEL_ADDRESS"); 220 _Static_assert(PMAP_MAP_LOW == VADDR(LOWPTDI, 0), "PMAP_MAP_LOW"); 221 _Static_assert(KERNLOAD == (KERNPTDI << PDRSHIFT), "KERNLOAD"); 222 223 extern int pat_works; 224 extern int pg_ps_enabled; 225 226 extern int elf32_nxstack; 227 228 #define PAT_INDEX_SIZE 8 229 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 230 231 /* 232 * pmap_mapdev support pre initialization (i.e. console) 233 */ 234 #define PMAP_PREINIT_MAPPING_COUNT 8 235 static struct pmap_preinit_mapping { 236 vm_paddr_t pa; 237 vm_offset_t va; 238 vm_size_t sz; 239 int mode; 240 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 241 static int pmap_initialized; 242 243 static struct rwlock_padalign pvh_global_lock; 244 245 /* 246 * Data for the pv entry allocation mechanism 247 */ 248 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 249 extern int pv_entry_max, pv_entry_count; 250 static int pv_entry_high_water = 0; 251 static struct md_page *pv_table; 252 extern int shpgperproc; 253 254 static struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 255 static int pv_maxchunks; /* How many chunks we have KVA for */ 256 static vm_offset_t pv_vafree; /* freelist stored in the PTE */ 257 258 /* 259 * All those kernel PT submaps that BSD is so fond of 260 */ 261 static pt_entry_t *CMAP3; 262 static pd_entry_t *KPTD; 263 static caddr_t CADDR3; 264 265 /* 266 * Crashdump maps. 
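 *
 * (Descriptive note added here, not part of the original comment:
 * crashdumpmap is MAXDUMPPGS pages of kernel VA reserved via SYSMAP() in
 * pmap_bootstrap() below; the dump code borrows it to temporarily map
 * physical pages while a crash dump is being written.)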
267 */ 268 static caddr_t crashdumpmap; 269 270 static pt_entry_t *PMAP1 = NULL, *PMAP2, *PMAP3; 271 static pt_entry_t *PADDR1 = NULL, *PADDR2, *PADDR3; 272 #ifdef SMP 273 static int PMAP1cpu, PMAP3cpu; 274 extern int PMAP1changedcpu; 275 #endif 276 extern int PMAP1changed; 277 extern int PMAP1unchanged; 278 static struct mtx PMAP2mutex; 279 280 /* 281 * Internal flags for pmap_enter()'s helper functions. 282 */ 283 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 284 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 285 286 static void free_pv_chunk(struct pv_chunk *pc); 287 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 288 static pv_entry_t get_pv_entry(pmap_t pmap, bool try); 289 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 290 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, 291 u_int flags); 292 #if VM_NRESERVLEVEL > 0 293 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 294 #endif 295 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 296 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 297 vm_offset_t va); 298 static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); 299 300 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 301 static bool pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 302 static int pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 303 vm_prot_t prot); 304 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, 305 u_int flags, vm_page_t m); 306 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 307 vm_page_t m, vm_prot_t prot, vm_page_t mpte); 308 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 309 bool allpte_PG_A_set); 310 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 311 pd_entry_t pde); 312 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 313 static bool pmap_is_modified_pvh(struct md_page *pvh); 314 static bool pmap_is_referenced_pvh(struct md_page *pvh); 315 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 316 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); 317 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); 318 #if VM_NRESERVLEVEL > 0 319 static bool pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 320 vm_page_t mpte); 321 #endif 322 static bool pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 323 vm_prot_t prot); 324 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); 325 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 326 struct spglist *free); 327 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 328 struct spglist *free); 329 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 330 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free); 331 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 332 struct spglist *free); 333 static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va); 334 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); 335 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 336 vm_page_t m); 337 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 338 pd_entry_t newpde); 339 static void pmap_update_pde_invalidate(vm_offset_t va, 
pd_entry_t newpde); 340 341 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); 342 343 static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); 344 static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); 345 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 346 static void pmap_pte_release(pt_entry_t *pte); 347 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); 348 #ifdef PMAP_PAE_COMP 349 static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, 350 uint8_t *flags, int wait); 351 #endif 352 static void pmap_init_trm(void); 353 static void pmap_invalidate_all_int(pmap_t pmap); 354 355 static __inline void pagezero(void *page); 356 357 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 358 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 359 360 extern char _end[]; 361 extern u_long physfree; /* phys addr of next free page */ 362 extern u_long vm86phystk;/* PA of vm86/bios stack */ 363 extern u_long vm86paddr;/* address of vm86 region */ 364 extern int vm86pa; /* phys addr of vm86 region */ 365 extern u_long KERNend; /* phys addr end of kernel (just after bss) */ 366 #ifdef PMAP_PAE_COMP 367 pd_entry_t *IdlePTD_pae; /* phys addr of kernel PTD */ 368 pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */ 369 pt_entry_t *KPTmap_pae; /* address of kernel page tables */ 370 #define IdlePTD IdlePTD_pae 371 #define KPTmap KPTmap_pae 372 #else 373 pd_entry_t *IdlePTD_nopae; 374 pt_entry_t *KPTmap_nopae; 375 #define IdlePTD IdlePTD_nopae 376 #define KPTmap KPTmap_nopae 377 #endif 378 extern u_long KPTphys; /* phys addr of kernel page tables */ 379 extern u_long tramp_idleptd; 380 381 static u_long 382 allocpages(u_int cnt, u_long *physfree) 383 { 384 u_long res; 385 386 res = *physfree; 387 *physfree += PAGE_SIZE * cnt; 388 bzero((void *)res, PAGE_SIZE * cnt); 389 return (res); 390 } 391 392 static void 393 pmap_cold_map(u_long pa, u_long va, u_long cnt) 394 { 395 pt_entry_t *pt; 396 397 for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0; 398 cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE) 399 *pt = pa | PG_V | PG_RW | PG_A | PG_M; 400 } 401 402 static void 403 pmap_cold_mapident(u_long pa, u_long cnt) 404 { 405 406 pmap_cold_map(pa, pa, cnt); 407 } 408 409 _Static_assert(LOWPTDI * 2 * NBPDR == KERNBASE, 410 "Broken double-map of zero PTD"); 411 412 static void 413 __CONCAT(PMTYPE, remap_lower)(bool enable) 414 { 415 int i; 416 417 for (i = 0; i < LOWPTDI; i++) 418 IdlePTD[i] = enable ? IdlePTD[LOWPTDI + i] : 0; 419 load_cr3(rcr3()); /* invalidate TLB */ 420 } 421 422 /* 423 * Called from locore.s before paging is enabled. Sets up the first 424 * kernel page table. Since kernel is mapped with PA == VA, this code 425 * does not require relocations. 426 */ 427 void 428 __CONCAT(PMTYPE, cold)(void) 429 { 430 pt_entry_t *pt; 431 u_long a; 432 u_int cr3, ncr4; 433 434 physfree = (u_long)&_end; 435 if (bootinfo.bi_esymtab != 0) 436 physfree = bootinfo.bi_esymtab; 437 if (bootinfo.bi_kernend != 0) 438 physfree = bootinfo.bi_kernend; 439 physfree = roundup2(physfree, NBPDR); 440 KERNend = physfree; 441 442 /* Allocate Kernel Page Tables */ 443 KPTphys = allocpages(NKPT, &physfree); 444 KPTmap = (pt_entry_t *)KPTphys; 445 446 /* Allocate Page Table Directory */ 447 #ifdef PMAP_PAE_COMP 448 /* XXX only need 32 bytes (easier for now) */ 449 IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree); 450 #endif 451 IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree); 452 453 /* 454 * Allocate KSTACK. 
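	 * (Illustrative aside, not part of the original comment: allocpages()
	 * above is a simple bump allocator over "physfree"; each call returns
	 * "cnt" zeroed, physically contiguous pages and advances physfree, so
	 * the early allocations made here end up laid out back to back in low
	 * physical memory.)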
Leave a guard page between IdlePTD and 455 * proc0kstack, to control stack overflow for thread0 and 456 * prevent corruption of the page table. We leak the guard 457 * physical memory due to 1:1 mappings. 458 */ 459 allocpages(1, &physfree); 460 proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree); 461 462 /* vm86/bios stack */ 463 vm86phystk = allocpages(1, &physfree); 464 465 /* pgtable + ext + IOPAGES */ 466 vm86paddr = vm86pa = allocpages(3, &physfree); 467 468 /* Install page tables into PTD. Page table page 1 is wasted. */ 469 for (a = 0; a < NKPT; a++) 470 IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M; 471 472 #ifdef PMAP_PAE_COMP 473 /* PAE install PTD pointers into PDPT */ 474 for (a = 0; a < NPGPTD; a++) 475 IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V; 476 #endif 477 478 /* 479 * Install recursive mapping for kernel page tables into 480 * itself. 481 */ 482 for (a = 0; a < NPGPTD; a++) 483 IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V | 484 PG_RW; 485 486 /* 487 * Initialize page table pages mapping physical address zero 488 * through the (physical) end of the kernel. Many of these 489 * pages must be reserved, and we reserve them all and map 490 * them linearly for convenience. We do this even if we've 491 * enabled PSE above; we'll just switch the corresponding 492 * kernel PDEs before we turn on paging. 493 * 494 * This and all other page table entries allow read and write 495 * access for various reasons. Kernel mappings never have any 496 * access restrictions. 497 */ 498 pmap_cold_mapident(0, atop(NBPDR) * LOWPTDI); 499 pmap_cold_map(0, NBPDR * LOWPTDI, atop(NBPDR) * LOWPTDI); 500 pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE)); 501 502 /* Map page table directory */ 503 #ifdef PMAP_PAE_COMP 504 pmap_cold_mapident((u_long)IdlePDPT, 1); 505 #endif 506 pmap_cold_mapident((u_long)IdlePTD, NPGPTD); 507 508 /* Map early KPTmap. It is really pmap_cold_mapident. */ 509 pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT); 510 511 /* Map proc0kstack */ 512 pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES); 513 /* ISA hole already mapped */ 514 515 pmap_cold_mapident(vm86phystk, 1); 516 pmap_cold_mapident(vm86pa, 3); 517 518 /* Map page 0 into the vm86 page table */ 519 *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V; 520 521 /* ...likewise for the ISA hole for vm86 */ 522 for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0; 523 a < atop(ISA_HOLE_LENGTH); a++, pt++) 524 *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A | 525 PG_M | PG_V; 526 527 /* Enable PSE, PGE, VME, and PAE if configured. */ 528 ncr4 = 0; 529 if ((cpu_feature & CPUID_PSE) != 0) { 530 ncr4 |= CR4_PSE; 531 pseflag = PG_PS; 532 /* 533 * Superpage mapping of the kernel text. Existing 4k 534 * page table pages are wasted. 535 */ 536 for (a = KERNBASE; a < KERNend; a += NBPDR) 537 IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M | 538 PG_RW | PG_V; 539 } 540 if ((cpu_feature & CPUID_PGE) != 0) { 541 ncr4 |= CR4_PGE; 542 pgeflag = PG_G; 543 } 544 ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0; 545 #ifdef PMAP_PAE_COMP 546 ncr4 |= CR4_PAE; 547 #endif 548 if (ncr4 != 0) 549 load_cr4(rcr4() | ncr4); 550 551 /* Now enable paging */ 552 #ifdef PMAP_PAE_COMP 553 cr3 = (u_int)IdlePDPT; 554 if ((cpu_feature & CPUID_PAT) == 0) 555 wbinvd(); 556 #else 557 cr3 = (u_int)IdlePTD; 558 #endif 559 tramp_idleptd = cr3; 560 load_cr3(cr3); 561 load_cr0(rcr0() | CR0_PG); 562 563 /* 564 * Now running relocated at KERNBASE where the system is 565 * linked to run. 
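	 * (Illustrative note, not part of the original comment: the switch is
	 * seamless because the page tables built above map the kernel text
	 * with physical address equal to virtual address, so the instruction
	 * fetch that follows load_cr0(rcr0() | CR0_PG) still hits a valid
	 * mapping and execution simply continues here.)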
566 */ 567 568 /* 569 * Remove the lowest part of the double mapping of low memory 570 * to get some null pointer checks. 571 */ 572 __CONCAT(PMTYPE, remap_lower)(false); 573 574 kernel_vm_end = /* 0 + */ NKPT * NBPDR; 575 #ifdef PMAP_PAE_COMP 576 i386_pmap_VM_NFREEORDER = VM_NFREEORDER_PAE; 577 i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_PAE; 578 i386_pmap_PDRSHIFT = PDRSHIFT_PAE; 579 #else 580 i386_pmap_VM_NFREEORDER = VM_NFREEORDER_NOPAE; 581 i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_NOPAE; 582 i386_pmap_PDRSHIFT = PDRSHIFT_NOPAE; 583 #endif 584 } 585 586 static void 587 __CONCAT(PMTYPE, set_nx)(void) 588 { 589 590 #ifdef PMAP_PAE_COMP 591 if ((amd_feature & AMDID_NX) == 0) 592 return; 593 pg_nx = PG_NX; 594 elf32_nxstack = 1; 595 /* EFER.EFER_NXE is set in initializecpu(). */ 596 #endif 597 } 598 599 /* 600 * Bootstrap the system enough to run with virtual memory. 601 * 602 * On the i386 this is called after pmap_cold() created initial 603 * kernel page table and enabled paging, and just syncs the pmap 604 * module with what has already been done. 605 */ 606 static void 607 __CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr) 608 { 609 vm_offset_t va; 610 pt_entry_t *pte, *unused __unused; 611 struct pcpu *pc; 612 u_long res; 613 int i; 614 615 res = atop(firstaddr - (vm_paddr_t)KERNLOAD); 616 617 /* 618 * Add a physical memory segment (vm_phys_seg) corresponding to the 619 * preallocated kernel page table pages so that vm_page structures 620 * representing these pages will be created. The vm_page structures 621 * are required for promotion of the corresponding kernel virtual 622 * addresses to superpage mappings. 623 */ 624 vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 625 626 /* 627 * Initialize the first available kernel virtual address. 628 * However, using "firstaddr" may waste a few pages of the 629 * kernel virtual address space, because pmap_cold() may not 630 * have mapped every physical page that it allocated. 631 * Preferably, pmap_cold() would provide a first unused 632 * virtual address in addition to "firstaddr". 633 */ 634 virtual_avail = (vm_offset_t)firstaddr; 635 virtual_end = VM_MAX_KERNEL_ADDRESS; 636 637 /* 638 * Initialize the kernel pmap (which is statically allocated). 639 * Count bootstrap data as being resident in case any of this data is 640 * later unmapped (using pmap_remove()) and freed. 641 */ 642 PMAP_LOCK_INIT(kernel_pmap); 643 kernel_pmap->pm_pdir = IdlePTD; 644 #ifdef PMAP_PAE_COMP 645 kernel_pmap->pm_pdpt = IdlePDPT; 646 #endif 647 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 648 kernel_pmap->pm_stats.resident_count = res; 649 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 650 vm_radix_init(&kernel_pmap->pm_root); 651 652 /* 653 * Initialize the global pv list lock. 654 */ 655 rw_init(&pvh_global_lock, "pmap pv global"); 656 657 /* 658 * Reserve some special page table entries/VA space for temporary 659 * mapping of pages. 660 */ 661 #define SYSMAP(c, p, v, n) \ 662 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 663 664 va = virtual_avail; 665 pte = vtopte(va); 666 667 /* 668 * Initialize temporary map objects on the current CPU for use 669 * during early boot. 670 * CMAP1/CMAP2 are used for zeroing and copying pages. 671 * CMAP3 is used for the boot-time memory test. 
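	 *
	 * (Illustrative expansion, not part of the original comment: a use
	 * such as SYSMAP(caddr_t, CMAP3, CADDR3, 1) below becomes
	 *
	 *	CADDR3 = (caddr_t)va; va += 1 * PAGE_SIZE; CMAP3 = pte; pte += 1;
	 *
	 * i.e. it reserves one page of KVA starting at "va" and records the
	 * kernel PTE that will map it, without installing any mapping yet.)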
672 */ 673 pc = get_pcpu(); 674 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 675 SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) 676 SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) 677 SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) 678 679 SYSMAP(caddr_t, CMAP3, CADDR3, 1); 680 681 /* 682 * Crashdump maps. 683 */ 684 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 685 686 /* 687 * ptvmmap is used for reading arbitrary physical pages via /dev/mem. 688 */ 689 SYSMAP(caddr_t, unused, ptvmmap, 1) 690 691 /* 692 * msgbufp is used to map the system message buffer. 693 */ 694 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) 695 696 /* 697 * KPTmap is used by pmap_kextract(). 698 * 699 * KPTmap is first initialized by pmap_cold(). However, that initial 700 * KPTmap can only support NKPT page table pages. Here, a larger 701 * KPTmap is created that can support KVA_PAGES page table pages. 702 */ 703 SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) 704 705 for (i = 0; i < NKPT; i++) 706 KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; 707 708 /* 709 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), 710 * respectively. 711 */ 712 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) 713 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) 714 SYSMAP(pt_entry_t *, PMAP3, PADDR3, 1) 715 716 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 717 718 virtual_avail = va; 719 720 /* 721 * Initialize the PAT MSR if present. 722 * pmap_init_pat() clears and sets CR4_PGE, which, as a 723 * side-effect, invalidates stale PG_G TLB entries that might 724 * have been created in our pre-boot environment. We assume 725 * that PAT support implies PGE and in reverse, PGE presence 726 * comes with PAT. Both features were added for Pentium Pro. 727 */ 728 pmap_init_pat(); 729 } 730 731 static void 732 pmap_init_reserved_pages(void) 733 { 734 struct pcpu *pc; 735 vm_offset_t pages; 736 int i; 737 738 #ifdef PMAP_PAE_COMP 739 if (!pae_mode) 740 return; 741 #else 742 if (pae_mode) 743 return; 744 #endif 745 CPU_FOREACH(i) { 746 pc = pcpu_find(i); 747 mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF | 748 MTX_NEW); 749 pc->pc_copyout_maddr = kva_alloc(ptoa(2)); 750 if (pc->pc_copyout_maddr == 0) 751 panic("unable to allocate non-sleepable copyout KVA"); 752 sx_init(&pc->pc_copyout_slock, "cpslk"); 753 pc->pc_copyout_saddr = kva_alloc(ptoa(2)); 754 if (pc->pc_copyout_saddr == 0) 755 panic("unable to allocate sleepable copyout KVA"); 756 pc->pc_pmap_eh_va = kva_alloc(ptoa(1)); 757 if (pc->pc_pmap_eh_va == 0) 758 panic("unable to allocate pmap_extract_and_hold KVA"); 759 pc->pc_pmap_eh_ptep = (char *)vtopte(pc->pc_pmap_eh_va); 760 761 /* 762 * Skip if the mappings have already been initialized, 763 * i.e. this is the BSP. 764 */ 765 if (pc->pc_cmap_addr1 != 0) 766 continue; 767 768 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 769 pages = kva_alloc(PAGE_SIZE * 3); 770 if (pages == 0) 771 panic("unable to allocate CMAP KVA"); 772 pc->pc_cmap_pte1 = vtopte(pages); 773 pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); 774 pc->pc_cmap_addr1 = (caddr_t)pages; 775 pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); 776 pc->pc_qmap_addr = pages + ptoa(2); 777 } 778 } 779 780 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 781 782 /* 783 * Setup the PAT MSR. 784 */ 785 static void 786 __CONCAT(PMTYPE, init_pat)(void) 787 { 788 int pat_table[PAT_INDEX_SIZE]; 789 uint64_t pat_msr; 790 u_long cr0, cr4; 791 int i; 792 793 /* Set default PAT index table. 
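	 * (Illustrative note added here, not part of the original comment:
	 * each pat_table[] entry maps a VM caching mode to a 3-bit PAT index,
	 * or to -1 when the mode is unusable; pmap_cache_bits() later encodes
	 * an index into PTE bits as bit 0 -> PG_NC_PWT, bit 1 -> PG_NC_PCD and
	 * bit 2 -> the PAT bit.  The defaults below stay within indices 0-3,
	 * e.g. write-combining falls back to index 3, plain uncacheable, when
	 * PAT cannot be used.)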
*/ 794 for (i = 0; i < PAT_INDEX_SIZE; i++) 795 pat_table[i] = -1; 796 pat_table[PAT_WRITE_BACK] = 0; 797 pat_table[PAT_WRITE_THROUGH] = 1; 798 pat_table[PAT_UNCACHEABLE] = 3; 799 pat_table[PAT_WRITE_COMBINING] = 3; 800 pat_table[PAT_WRITE_PROTECTED] = 3; 801 pat_table[PAT_UNCACHED] = 3; 802 803 /* 804 * Bail if this CPU doesn't implement PAT. 805 * We assume that PAT support implies PGE. 806 */ 807 if ((cpu_feature & CPUID_PAT) == 0) { 808 for (i = 0; i < PAT_INDEX_SIZE; i++) 809 pat_index[i] = pat_table[i]; 810 pat_works = 0; 811 return; 812 } 813 814 /* 815 * Due to some Intel errata, we can only safely use the lower 4 816 * PAT entries. 817 * 818 * Intel Pentium III Processor Specification Update 819 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B 820 * or Mode C Paging) 821 * 822 * Intel Pentium IV Processor Specification Update 823 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) 824 */ 825 if (cpu_vendor_id == CPU_VENDOR_INTEL && 826 !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) 827 pat_works = 0; 828 829 /* Initialize default PAT entries. */ 830 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 831 PAT_VALUE(1, PAT_WRITE_THROUGH) | 832 PAT_VALUE(2, PAT_UNCACHED) | 833 PAT_VALUE(3, PAT_UNCACHEABLE) | 834 PAT_VALUE(4, PAT_WRITE_BACK) | 835 PAT_VALUE(5, PAT_WRITE_THROUGH) | 836 PAT_VALUE(6, PAT_UNCACHED) | 837 PAT_VALUE(7, PAT_UNCACHEABLE); 838 839 if (pat_works) { 840 /* 841 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 842 * Program 5 and 6 as WP and WC. 843 * Leave 4 and 7 as WB and UC. 844 */ 845 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 846 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 847 PAT_VALUE(6, PAT_WRITE_COMBINING); 848 pat_table[PAT_UNCACHED] = 2; 849 pat_table[PAT_WRITE_PROTECTED] = 5; 850 pat_table[PAT_WRITE_COMBINING] = 6; 851 } else { 852 /* 853 * Just replace PAT Index 2 with WC instead of UC-. 854 */ 855 pat_msr &= ~PAT_MASK(2); 856 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 857 pat_table[PAT_WRITE_COMBINING] = 2; 858 } 859 860 /* Disable PGE. */ 861 cr4 = rcr4(); 862 load_cr4(cr4 & ~CR4_PGE); 863 864 /* Disable caches (CD = 1, NW = 0). */ 865 cr0 = rcr0(); 866 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 867 868 /* Flushes caches and TLBs. */ 869 wbinvd(); 870 invltlb(); 871 872 /* Update PAT and index table. */ 873 wrmsr(MSR_PAT, pat_msr); 874 for (i = 0; i < PAT_INDEX_SIZE; i++) 875 pat_index[i] = pat_table[i]; 876 877 /* Flush caches and TLBs again. */ 878 wbinvd(); 879 invltlb(); 880 881 /* Restore caches and PGE. */ 882 load_cr0(cr0); 883 load_cr4(cr4); 884 } 885 886 #ifdef PMAP_PAE_COMP 887 static void * 888 pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, 889 int wait) 890 { 891 892 /* Inform UMA that this allocator uses kernel_map/object. */ 893 *flags = UMA_SLAB_KERNEL; 894 return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), 895 bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); 896 } 897 #endif 898 899 /* 900 * Abuse the pte nodes for unmapped kva to thread a kva freelist through. 901 * Requirements: 902 * - Must deal with pages in order to ensure that none of the PG_* bits 903 * are ever set, PG_V in particular. 904 * - Assumes we can write to ptes without pte_store() atomic ops, even 905 * on PAE systems. This should be ok. 906 * - Assumes nothing will ever test these addresses for 0 to indicate 907 * no mapping instead of correctly checking PG_V. 908 * - Assumes a vm_offset_t will fit in a pte (true for i386). 
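 * - (Illustrative aside, not part of the original list: conceptually this is
 *   a singly linked free list of unmapped KVA pages in which each free
 *   page's PTE slot stores the VA of the next free page.  Allocation does,
 *   in effect,
 *	va = *head; *head = *vtopte(va); *vtopte(va) = 0;
 *   and freeing stores the old head into the page's PTE and makes the page
 *   the new head, as pmap_ptelist_alloc() and pmap_ptelist_free() below do.)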
909 * Because PG_V is never set, there can be no mappings to invalidate. 910 */ 911 static vm_offset_t 912 pmap_ptelist_alloc(vm_offset_t *head) 913 { 914 pt_entry_t *pte; 915 vm_offset_t va; 916 917 va = *head; 918 if (va == 0) 919 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 920 pte = vtopte(va); 921 *head = *pte; 922 if (*head & PG_V) 923 panic("pmap_ptelist_alloc: va with PG_V set!"); 924 *pte = 0; 925 return (va); 926 } 927 928 static void 929 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) 930 { 931 pt_entry_t *pte; 932 933 if (va & PG_V) 934 panic("pmap_ptelist_free: freeing va with PG_V set!"); 935 pte = vtopte(va); 936 *pte = *head; /* virtual! PG_V is 0 though */ 937 *head = va; 938 } 939 940 static void 941 pmap_ptelist_init(vm_offset_t *head, void *base, int npages) 942 { 943 int i; 944 vm_offset_t va; 945 946 *head = 0; 947 for (i = npages - 1; i >= 0; i--) { 948 va = (vm_offset_t)base + i * PAGE_SIZE; 949 pmap_ptelist_free(head, va); 950 } 951 } 952 953 /* 954 * Initialize the pmap module. 955 * Called by vm_init, to initialize any structures that the pmap 956 * system needs to map virtual memory. 957 */ 958 static void 959 __CONCAT(PMTYPE, init)(void) 960 { 961 struct pmap_preinit_mapping *ppim; 962 vm_page_t mpte; 963 vm_size_t s; 964 int i, pv_npg; 965 966 /* 967 * Initialize the vm page array entries for the kernel pmap's 968 * page table pages. 969 */ 970 PMAP_LOCK(kernel_pmap); 971 for (i = 0; i < NKPT; i++) { 972 mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i)); 973 KASSERT(mpte >= vm_page_array && 974 mpte < &vm_page_array[vm_page_array_size], 975 ("pmap_init: page table page is out of range")); 976 mpte->pindex = i + KPTDI; 977 mpte->phys_addr = KPTphys + ptoa(i); 978 mpte->ref_count = 1; 979 980 /* 981 * Collect the page table pages that were replaced by a 2/4MB 982 * page. They are filled with equivalent 4KB page mappings. 983 */ 984 if (pseflag != 0 && 985 KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend && 986 pmap_insert_pt_page(kernel_pmap, mpte, true, true)) 987 panic("pmap_init: pmap_insert_pt_page failed"); 988 } 989 PMAP_UNLOCK(kernel_pmap); 990 vm_wire_add(NKPT); 991 992 /* 993 * Initialize the address space (zone) for the pv entries. Set a 994 * high water mark so that the system can recover from excessive 995 * numbers of pv entries. 996 */ 997 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 998 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 999 TUNABLE_INT_FETCH("vm.pmap.pv_entry_max", &pv_entry_max); 1000 pv_entry_max = roundup(pv_entry_max, _NPCPV); 1001 pv_entry_high_water = 9 * (pv_entry_max / 10); 1002 1003 /* 1004 * If the kernel is running on a virtual machine, then it must assume 1005 * that MCA is enabled by the hypervisor. Moreover, the kernel must 1006 * be prepared for the hypervisor changing the vendor and family that 1007 * are reported by CPUID. Consequently, the workaround for AMD Family 1008 * 10h Erratum 383 is enabled if the processor's feature set does not 1009 * include at least one feature that is only supported by older Intel 1010 * or newer AMD processors. 1011 */ 1012 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 1013 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 1014 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 1015 AMDID2_FMA4)) == 0) 1016 workaround_erratum383 = 1; 1017 1018 /* 1019 * Are large page mappings supported and enabled? 
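	 *
	 * (Illustrative usage note, not part of the original comment:
	 * superpages require both hardware support, i.e. pseflag set from
	 * CPUID_PSE in pmap_cold(), and the vm.pmap.pg_ps_enabled loader
	 * tunable fetched below; for example, setting vm.pmap.pg_ps_enabled=0
	 * at the loader prompt disables 2/4MB mappings even on capable
	 * hardware.)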
1020 */ 1021 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 1022 if (pseflag == 0) 1023 pg_ps_enabled = 0; 1024 else if (pg_ps_enabled) { 1025 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1026 ("pmap_init: can't assign to pagesizes[1]")); 1027 pagesizes[1] = NBPDR; 1028 } 1029 1030 /* 1031 * Calculate the size of the pv head table for superpages. 1032 * Handle the possibility that "vm_phys_segs[...].end" is zero. 1033 */ 1034 pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - 1035 PAGE_SIZE) / NBPDR + 1; 1036 1037 /* 1038 * Allocate memory for the pv head table for superpages. 1039 */ 1040 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1041 s = round_page(s); 1042 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 1043 for (i = 0; i < pv_npg; i++) 1044 TAILQ_INIT(&pv_table[i].pv_list); 1045 1046 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 1047 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 1048 if (pv_chunkbase == NULL) 1049 panic("pmap_init: not enough kvm for pv chunks"); 1050 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 1051 #ifdef PMAP_PAE_COMP 1052 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, 1053 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 1054 UMA_ZONE_CONTIG | UMA_ZONE_VM | UMA_ZONE_NOFREE); 1055 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); 1056 #endif 1057 1058 pmap_initialized = 1; 1059 pmap_init_trm(); 1060 1061 if (!bootverbose) 1062 return; 1063 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 1064 ppim = pmap_preinit_mapping + i; 1065 if (ppim->va == 0) 1066 continue; 1067 printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, 1068 (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); 1069 } 1070 1071 } 1072 1073 extern u_long pmap_pde_demotions; 1074 extern u_long pmap_pde_mappings; 1075 extern u_long pmap_pde_p_failures; 1076 extern u_long pmap_pde_promotions; 1077 1078 /*************************************************** 1079 * Low level helper routines..... 1080 ***************************************************/ 1081 1082 static bool 1083 __CONCAT(PMTYPE, is_valid_memattr)(pmap_t pmap __unused, vm_memattr_t mode) 1084 { 1085 1086 return (mode >= 0 && mode < PAT_INDEX_SIZE && 1087 pat_index[(int)mode] >= 0); 1088 } 1089 1090 /* 1091 * Determine the appropriate bits to set in a PTE or PDE for a specified 1092 * caching mode. 1093 */ 1094 static int 1095 __CONCAT(PMTYPE, cache_bits)(pmap_t pmap, int mode, bool is_pde) 1096 { 1097 int cache_bits, pat_flag, pat_idx; 1098 1099 if (!pmap_is_valid_memattr(pmap, mode)) 1100 panic("Unknown caching mode %d\n", mode); 1101 1102 /* The PAT bit is different for PTE's and PDE's. */ 1103 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; 1104 1105 /* Map the caching mode to a PAT index. */ 1106 pat_idx = pat_index[mode]; 1107 1108 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 1109 cache_bits = 0; 1110 if (pat_idx & 0x4) 1111 cache_bits |= pat_flag; 1112 if (pat_idx & 0x2) 1113 cache_bits |= PG_NC_PCD; 1114 if (pat_idx & 0x1) 1115 cache_bits |= PG_NC_PWT; 1116 return (cache_bits); 1117 } 1118 1119 static int 1120 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) 1121 { 1122 int pat_flag, pat_idx; 1123 1124 if ((cpu_feature & CPUID_PAT) == 0) 1125 return (0); 1126 1127 pat_idx = 0; 1128 /* The PAT bit is different for PTE's and PDE's. */ 1129 pat_flag = is_pde ? 
PG_PDE_PAT : PG_PTE_PAT; 1130 1131 if ((pte & pat_flag) != 0) 1132 pat_idx |= 0x4; 1133 if ((pte & PG_NC_PCD) != 0) 1134 pat_idx |= 0x2; 1135 if ((pte & PG_NC_PWT) != 0) 1136 pat_idx |= 0x1; 1137 1138 /* See pmap_init_pat(). */ 1139 if (pat_works) { 1140 if (pat_idx == 4) 1141 pat_idx = 0; 1142 if (pat_idx == 7) 1143 pat_idx = 3; 1144 } else { 1145 /* XXXKIB */ 1146 } 1147 1148 return (pat_idx); 1149 } 1150 1151 static bool 1152 __CONCAT(PMTYPE, ps_enabled)(pmap_t pmap __unused) 1153 { 1154 1155 return (pg_ps_enabled); 1156 } 1157 1158 /* 1159 * The caller is responsible for maintaining TLB consistency. 1160 */ 1161 static void 1162 pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) 1163 { 1164 pd_entry_t *pde; 1165 1166 pde = pmap_pde(kernel_pmap, va); 1167 pde_store(pde, newpde); 1168 } 1169 1170 /* 1171 * After changing the page size for the specified virtual address in the page 1172 * table, flush the corresponding entries from the processor's TLB. Only the 1173 * calling processor's TLB is affected. 1174 * 1175 * The calling thread must be pinned to a processor. 1176 */ 1177 static void 1178 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) 1179 { 1180 1181 if ((newpde & PG_PS) == 0) 1182 /* Demotion: flush a specific 2MB page mapping. */ 1183 invlpg(va); 1184 else /* if ((newpde & PG_G) == 0) */ 1185 /* 1186 * Promotion: flush every 4KB page mapping from the TLB 1187 * because there are too many to flush individually. 1188 */ 1189 invltlb(); 1190 } 1191 1192 #ifdef SMP 1193 1194 static void 1195 pmap_curcpu_cb_dummy(pmap_t pmap __unused, vm_offset_t addr1 __unused, 1196 vm_offset_t addr2 __unused) 1197 { 1198 } 1199 1200 /* 1201 * For SMP, these functions have to use the IPI mechanism for coherence. 1202 * 1203 * N.B.: Before calling any of the following TLB invalidation functions, 1204 * the calling processor must ensure that all stores updating a non- 1205 * kernel page table are globally performed. Otherwise, another 1206 * processor could cache an old, pre-update entry without being 1207 * invalidated. This can happen one of two ways: (1) The pmap becomes 1208 * active on another processor after its pm_active field is checked by 1209 * one of the following functions but before a store updating the page 1210 * table is globally performed. (2) The pmap becomes active on another 1211 * processor before its pm_active field is checked but due to 1212 * speculative loads one of the following functions stills reads the 1213 * pmap as inactive on the other processor. 1214 * 1215 * The kernel page table is exempt because its pm_active field is 1216 * immutable. The kernel page table is always active on every 1217 * processor. 
1218 */ 1219 static void 1220 pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va) 1221 { 1222 cpuset_t *mask, other_cpus; 1223 u_int cpuid; 1224 1225 sched_pin(); 1226 if (pmap == kernel_pmap) { 1227 invlpg(va); 1228 mask = &all_cpus; 1229 } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { 1230 mask = &all_cpus; 1231 } else { 1232 cpuid = PCPU_GET(cpuid); 1233 other_cpus = all_cpus; 1234 CPU_CLR(cpuid, &other_cpus); 1235 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active); 1236 mask = &other_cpus; 1237 } 1238 smp_masked_invlpg(*mask, va, pmap, pmap_curcpu_cb_dummy); 1239 sched_unpin(); 1240 } 1241 1242 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 1243 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 1244 1245 static void 1246 pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1247 { 1248 cpuset_t *mask, other_cpus; 1249 vm_offset_t addr; 1250 u_int cpuid; 1251 1252 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 1253 pmap_invalidate_all_int(pmap); 1254 return; 1255 } 1256 1257 sched_pin(); 1258 if (pmap == kernel_pmap) { 1259 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1260 invlpg(addr); 1261 mask = &all_cpus; 1262 } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { 1263 mask = &all_cpus; 1264 } else { 1265 cpuid = PCPU_GET(cpuid); 1266 other_cpus = all_cpus; 1267 CPU_CLR(cpuid, &other_cpus); 1268 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active); 1269 mask = &other_cpus; 1270 } 1271 smp_masked_invlpg_range(*mask, sva, eva, pmap, pmap_curcpu_cb_dummy); 1272 sched_unpin(); 1273 } 1274 1275 static void 1276 pmap_invalidate_all_int(pmap_t pmap) 1277 { 1278 cpuset_t *mask, other_cpus; 1279 u_int cpuid; 1280 1281 sched_pin(); 1282 if (pmap == kernel_pmap) { 1283 invltlb(); 1284 mask = &all_cpus; 1285 } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { 1286 mask = &all_cpus; 1287 } else { 1288 cpuid = PCPU_GET(cpuid); 1289 other_cpus = all_cpus; 1290 CPU_CLR(cpuid, &other_cpus); 1291 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active); 1292 mask = &other_cpus; 1293 } 1294 smp_masked_invltlb(*mask, pmap, pmap_curcpu_cb_dummy); 1295 sched_unpin(); 1296 } 1297 1298 static void 1299 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, 1300 vm_offset_t addr1 __unused, vm_offset_t addr2 __unused) 1301 { 1302 wbinvd(); 1303 } 1304 1305 static void 1306 __CONCAT(PMTYPE, invalidate_cache)(void) 1307 { 1308 smp_cache_flush(pmap_invalidate_cache_curcpu_cb); 1309 } 1310 1311 struct pde_action { 1312 cpuset_t invalidate; /* processors that invalidate their TLB */ 1313 vm_offset_t va; 1314 pd_entry_t *pde; 1315 pd_entry_t newpde; 1316 u_int store; /* processor that updates the PDE */ 1317 }; 1318 1319 static void 1320 pmap_update_pde_kernel(void *arg) 1321 { 1322 struct pde_action *act = arg; 1323 pd_entry_t *pde; 1324 1325 if (act->store == PCPU_GET(cpuid)) { 1326 pde = pmap_pde(kernel_pmap, act->va); 1327 pde_store(pde, act->newpde); 1328 } 1329 } 1330 1331 static void 1332 pmap_update_pde_user(void *arg) 1333 { 1334 struct pde_action *act = arg; 1335 1336 if (act->store == PCPU_GET(cpuid)) 1337 pde_store(act->pde, act->newpde); 1338 } 1339 1340 static void 1341 pmap_update_pde_teardown(void *arg) 1342 { 1343 struct pde_action *act = arg; 1344 1345 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 1346 pmap_update_pde_invalidate(act->va, act->newpde); 1347 } 1348 1349 /* 1350 * Change the page size for the specified virtual address in a way that 1351 * prevents any possibility of the TLB ever having two entries that map the 1352 * same virtual address 
using different page sizes. This is the recommended 1353 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1354 * machine check exception for a TLB state that is improperly diagnosed as a 1355 * hardware error. 1356 */ 1357 static void 1358 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1359 { 1360 struct pde_action act; 1361 cpuset_t active, other_cpus; 1362 u_int cpuid; 1363 1364 sched_pin(); 1365 cpuid = PCPU_GET(cpuid); 1366 other_cpus = all_cpus; 1367 CPU_CLR(cpuid, &other_cpus); 1368 if (pmap == kernel_pmap) 1369 active = all_cpus; 1370 else 1371 active = pmap->pm_active; 1372 if (CPU_OVERLAP(&active, &other_cpus)) { 1373 act.store = cpuid; 1374 act.invalidate = active; 1375 act.va = va; 1376 act.pde = pde; 1377 act.newpde = newpde; 1378 CPU_SET(cpuid, &active); 1379 smp_rendezvous_cpus(active, 1380 smp_no_rendezvous_barrier, pmap == kernel_pmap ? 1381 pmap_update_pde_kernel : pmap_update_pde_user, 1382 pmap_update_pde_teardown, &act); 1383 } else { 1384 if (pmap == kernel_pmap) 1385 pmap_kenter_pde(va, newpde); 1386 else 1387 pde_store(pde, newpde); 1388 if (CPU_ISSET(cpuid, &active)) 1389 pmap_update_pde_invalidate(va, newpde); 1390 } 1391 sched_unpin(); 1392 } 1393 #else /* !SMP */ 1394 /* 1395 * Normal, non-SMP, 486+ invalidation functions. 1396 * We inline these within pmap.c for speed. 1397 */ 1398 static void 1399 pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va) 1400 { 1401 1402 if (pmap == kernel_pmap) 1403 invlpg(va); 1404 } 1405 1406 static void 1407 pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1408 { 1409 vm_offset_t addr; 1410 1411 if (pmap == kernel_pmap) 1412 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1413 invlpg(addr); 1414 } 1415 1416 static void 1417 pmap_invalidate_all_int(pmap_t pmap) 1418 { 1419 1420 if (pmap == kernel_pmap) 1421 invltlb(); 1422 } 1423 1424 static void 1425 __CONCAT(PMTYPE, invalidate_cache)(void) 1426 { 1427 1428 wbinvd(); 1429 } 1430 1431 static void 1432 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1433 { 1434 1435 if (pmap == kernel_pmap) 1436 pmap_kenter_pde(va, newpde); 1437 else 1438 pde_store(pde, newpde); 1439 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1440 pmap_update_pde_invalidate(va, newpde); 1441 } 1442 #endif /* !SMP */ 1443 1444 static void 1445 __CONCAT(PMTYPE, invalidate_page)(pmap_t pmap, vm_offset_t va) 1446 { 1447 1448 pmap_invalidate_page_int(pmap, va); 1449 } 1450 1451 static void 1452 __CONCAT(PMTYPE, invalidate_range)(pmap_t pmap, vm_offset_t sva, 1453 vm_offset_t eva) 1454 { 1455 1456 pmap_invalidate_range_int(pmap, sva, eva); 1457 } 1458 1459 static void 1460 __CONCAT(PMTYPE, invalidate_all)(pmap_t pmap) 1461 { 1462 1463 pmap_invalidate_all_int(pmap); 1464 } 1465 1466 static void 1467 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 1468 { 1469 1470 /* 1471 * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was 1472 * created by a promotion that did not invalidate the 512 or 1024 4KB 1473 * page mappings that might exist in the TLB. Consequently, at this 1474 * point, the TLB may hold both 4KB and 2- or 4MB page mappings for 1475 * the address range [va, va + NBPDR). Therefore, the entire range 1476 * must be invalidated here. 
In contrast, when PG_PROMOTED is clear, 1477 * the TLB will not hold any 4KB page mappings for the address range 1478 * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the 1479 * 2- or 4MB page mapping from the TLB. 1480 */ 1481 if ((pde & PG_PROMOTED) != 0) 1482 pmap_invalidate_range_int(pmap, va, va + NBPDR - 1); 1483 else 1484 pmap_invalidate_page_int(pmap, va); 1485 } 1486 1487 /* 1488 * Are we current address space or kernel? 1489 */ 1490 static __inline int 1491 pmap_is_current(pmap_t pmap) 1492 { 1493 1494 return (pmap == kernel_pmap); 1495 } 1496 1497 /* 1498 * If the given pmap is not the current or kernel pmap, the returned pte must 1499 * be released by passing it to pmap_pte_release(). 1500 */ 1501 static pt_entry_t * 1502 __CONCAT(PMTYPE, pte)(pmap_t pmap, vm_offset_t va) 1503 { 1504 pd_entry_t newpf; 1505 pd_entry_t *pde; 1506 1507 pde = pmap_pde(pmap, va); 1508 if (*pde & PG_PS) 1509 return (pde); 1510 if (*pde != 0) { 1511 /* are we current address space or kernel? */ 1512 if (pmap_is_current(pmap)) 1513 return (vtopte(va)); 1514 mtx_lock(&PMAP2mutex); 1515 newpf = *pde & PG_FRAME; 1516 if ((*PMAP2 & PG_FRAME) != newpf) { 1517 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; 1518 pmap_invalidate_page_int(kernel_pmap, 1519 (vm_offset_t)PADDR2); 1520 } 1521 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); 1522 } 1523 return (NULL); 1524 } 1525 1526 /* 1527 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte 1528 * being NULL. 1529 */ 1530 static __inline void 1531 pmap_pte_release(pt_entry_t *pte) 1532 { 1533 1534 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) 1535 mtx_unlock(&PMAP2mutex); 1536 } 1537 1538 /* 1539 * NB: The sequence of updating a page table followed by accesses to the 1540 * corresponding pages is subject to the situation described in the "AMD64 1541 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, 1542 * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG 1543 * right after modifying the PTE bits is crucial. 1544 */ 1545 static __inline void 1546 invlcaddr(void *caddr) 1547 { 1548 1549 invlpg((u_int)caddr); 1550 } 1551 1552 /* 1553 * Super fast pmap_pte routine best used when scanning 1554 * the pv lists. This eliminates many coarse-grained 1555 * invltlb calls. Note that many of the pv list 1556 * scans are across different pmaps. It is very wasteful 1557 * to do an entire invltlb for checking a single mapping. 1558 * 1559 * If the given pmap is not the current pmap, pvh_global_lock 1560 * must be held and curthread pinned to a CPU. 1561 */ 1562 static pt_entry_t * 1563 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 1564 { 1565 pd_entry_t newpf; 1566 pd_entry_t *pde; 1567 1568 pde = pmap_pde(pmap, va); 1569 if (*pde & PG_PS) 1570 return (pde); 1571 if (*pde != 0) { 1572 /* are we current address space or kernel? 
*/ 1573 if (pmap_is_current(pmap)) 1574 return (vtopte(va)); 1575 rw_assert(&pvh_global_lock, RA_WLOCKED); 1576 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 1577 newpf = *pde & PG_FRAME; 1578 if ((*PMAP1 & PG_FRAME) != newpf) { 1579 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; 1580 #ifdef SMP 1581 PMAP1cpu = PCPU_GET(cpuid); 1582 #endif 1583 invlcaddr(PADDR1); 1584 PMAP1changed++; 1585 } else 1586 #ifdef SMP 1587 if (PMAP1cpu != PCPU_GET(cpuid)) { 1588 PMAP1cpu = PCPU_GET(cpuid); 1589 invlcaddr(PADDR1); 1590 PMAP1changedcpu++; 1591 } else 1592 #endif 1593 PMAP1unchanged++; 1594 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); 1595 } 1596 return (0); 1597 } 1598 1599 static pt_entry_t * 1600 pmap_pte_quick3(pmap_t pmap, vm_offset_t va) 1601 { 1602 pd_entry_t newpf; 1603 pd_entry_t *pde; 1604 1605 pde = pmap_pde(pmap, va); 1606 if (*pde & PG_PS) 1607 return (pde); 1608 if (*pde != 0) { 1609 rw_assert(&pvh_global_lock, RA_WLOCKED); 1610 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 1611 newpf = *pde & PG_FRAME; 1612 if ((*PMAP3 & PG_FRAME) != newpf) { 1613 *PMAP3 = newpf | PG_RW | PG_V | PG_A | PG_M; 1614 #ifdef SMP 1615 PMAP3cpu = PCPU_GET(cpuid); 1616 #endif 1617 invlcaddr(PADDR3); 1618 PMAP1changed++; 1619 } else 1620 #ifdef SMP 1621 if (PMAP3cpu != PCPU_GET(cpuid)) { 1622 PMAP3cpu = PCPU_GET(cpuid); 1623 invlcaddr(PADDR3); 1624 PMAP1changedcpu++; 1625 } else 1626 #endif 1627 PMAP1unchanged++; 1628 return (PADDR3 + (i386_btop(va) & (NPTEPG - 1))); 1629 } 1630 return (0); 1631 } 1632 1633 static pt_entry_t 1634 pmap_pte_ufast(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 1635 { 1636 pt_entry_t *eh_ptep, pte, *ptep; 1637 1638 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1639 pde &= PG_FRAME; 1640 critical_enter(); 1641 eh_ptep = (pt_entry_t *)PCPU_GET(pmap_eh_ptep); 1642 if ((*eh_ptep & PG_FRAME) != pde) { 1643 *eh_ptep = pde | PG_RW | PG_V | PG_A | PG_M; 1644 invlcaddr((void *)PCPU_GET(pmap_eh_va)); 1645 } 1646 ptep = (pt_entry_t *)PCPU_GET(pmap_eh_va) + (i386_btop(va) & 1647 (NPTEPG - 1)); 1648 pte = *ptep; 1649 critical_exit(); 1650 return (pte); 1651 } 1652 1653 /* 1654 * Extract from the kernel page table the physical address that is mapped by 1655 * the given virtual address "va". 1656 * 1657 * This function may be used before pmap_bootstrap() is called. 1658 */ 1659 static vm_paddr_t 1660 __CONCAT(PMTYPE, kextract)(vm_offset_t va) 1661 { 1662 vm_paddr_t pa; 1663 1664 if ((pa = pte_load(&PTD[va >> PDRSHIFT])) & PG_PS) { 1665 pa = (pa & PG_PS_FRAME) | (va & PDRMASK); 1666 } else { 1667 /* 1668 * Beware of a concurrent promotion that changes the PDE at 1669 * this point! For example, vtopte() must not be used to 1670 * access the PTE because it would use the new PDE. It is, 1671 * however, safe to use the old PDE because the page table 1672 * page is preserved by the promotion. 1673 */ 1674 pa = KPTmap[i386_btop(va)]; 1675 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1676 } 1677 return (pa); 1678 } 1679 1680 /* 1681 * Routine: pmap_extract 1682 * Function: 1683 * Extract the physical page address associated 1684 * with the given map/virtual_address pair. 
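 *
 *	(Illustrative usage sketch, not part of the original comment:
 *
 *		pa = pmap_extract(vmspace_pmap(p->p_vmspace), va);
 *		if (pa == 0)
 *			the address is not currently mapped;
 *
 *	unlike pmap_kextract() above, this takes the pmap lock, works on an
 *	arbitrary pmap, handles both 4KB and 2/4MB mappings, and returns 0
 *	when no valid mapping exists.)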
1685 */ 1686 static vm_paddr_t 1687 __CONCAT(PMTYPE, extract)(pmap_t pmap, vm_offset_t va) 1688 { 1689 vm_paddr_t rtval; 1690 pt_entry_t pte; 1691 pd_entry_t pde; 1692 1693 rtval = 0; 1694 PMAP_LOCK(pmap); 1695 pde = pmap->pm_pdir[va >> PDRSHIFT]; 1696 if (pde != 0) { 1697 if ((pde & PG_PS) != 0) 1698 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 1699 else { 1700 pte = pmap_pte_ufast(pmap, va, pde); 1701 rtval = (pte & PG_FRAME) | (va & PAGE_MASK); 1702 } 1703 } 1704 PMAP_UNLOCK(pmap); 1705 return (rtval); 1706 } 1707 1708 /* 1709 * Routine: pmap_extract_and_hold 1710 * Function: 1711 * Atomically extract and hold the physical page 1712 * with the given pmap and virtual address pair 1713 * if that mapping permits the given protection. 1714 */ 1715 static vm_page_t 1716 __CONCAT(PMTYPE, extract_and_hold)(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1717 { 1718 pd_entry_t pde; 1719 pt_entry_t pte; 1720 vm_page_t m; 1721 1722 m = NULL; 1723 PMAP_LOCK(pmap); 1724 pde = *pmap_pde(pmap, va); 1725 if (pde != 0) { 1726 if (pde & PG_PS) { 1727 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) 1728 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 1729 (va & PDRMASK)); 1730 } else { 1731 pte = pmap_pte_ufast(pmap, va, pde); 1732 if (pte != 0 && 1733 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) 1734 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 1735 } 1736 if (m != NULL && !vm_page_wire_mapped(m)) 1737 m = NULL; 1738 } 1739 PMAP_UNLOCK(pmap); 1740 return (m); 1741 } 1742 1743 /*************************************************** 1744 * Low level mapping routines..... 1745 ***************************************************/ 1746 1747 /* 1748 * Add a wired page to the kva. 1749 * Note: not SMP coherent. 1750 * 1751 * This function may be used before pmap_bootstrap() is called. 1752 */ 1753 static void 1754 __CONCAT(PMTYPE, kenter)(vm_offset_t va, vm_paddr_t pa) 1755 { 1756 pt_entry_t *pte; 1757 1758 pte = vtopte(va); 1759 pte_store(pte, pa | PG_RW | PG_V); 1760 } 1761 1762 static __inline void 1763 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1764 { 1765 pt_entry_t *pte; 1766 1767 pte = vtopte(va); 1768 pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, 1769 mode, 0)); 1770 } 1771 1772 /* 1773 * Remove a page from the kernel pagetables. 1774 * Note: not SMP coherent. 1775 * 1776 * This function may be used before pmap_bootstrap() is called. 1777 */ 1778 static void 1779 __CONCAT(PMTYPE, kremove)(vm_offset_t va) 1780 { 1781 pt_entry_t *pte; 1782 1783 pte = vtopte(va); 1784 pte_clear(pte); 1785 } 1786 1787 /* 1788 * Used to map a range of physical addresses into kernel 1789 * virtual address space. 1790 * 1791 * The value passed in '*virt' is a suggested virtual address for 1792 * the mapping. Architectures which can support a direct-mapped 1793 * physical to virtual region can return the appropriate address 1794 * within that region, leaving '*virt' unchanged. Other 1795 * architectures should map the pages starting at '*virt' and 1796 * update '*virt' with the first usable address after the mapped 1797 * region. 1798 */ 1799 static vm_offset_t 1800 __CONCAT(PMTYPE, map)(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, 1801 int prot) 1802 { 1803 vm_offset_t va, sva; 1804 vm_paddr_t superpage_offset; 1805 pd_entry_t newpde; 1806 1807 va = *virt; 1808 /* 1809 * Does the physical address range's size and alignment permit at 1810 * least one superpage mapping to be created? 
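	 *
	 * (Worked example, not part of the original comment, assuming the
	 * non-PAE NBPDR of 4MB: for start = 0x00401000 and end = 0x00c00000,
	 * the leading partial chunk is 0x3ff000 bytes, and (end - start) -
	 * 0x3ff000 = 0x400000 = NBPDR, so exactly one superpage, covering
	 * [0x00800000, 0x00c00000), can be created once "va" below is
	 * realigned to match start's offset within a superpage.)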
1811 */ 1812 superpage_offset = start & PDRMASK; 1813 if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { 1814 /* 1815 * Increase the starting virtual address so that its alignment 1816 * does not preclude the use of superpage mappings. 1817 */ 1818 if ((va & PDRMASK) < superpage_offset) 1819 va = (va & ~PDRMASK) + superpage_offset; 1820 else if ((va & PDRMASK) > superpage_offset) 1821 va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; 1822 } 1823 sva = va; 1824 while (start < end) { 1825 if ((start & PDRMASK) == 0 && end - start >= NBPDR && 1826 pseflag != 0) { 1827 KASSERT((va & PDRMASK) == 0, 1828 ("pmap_map: misaligned va %#x", va)); 1829 newpde = start | PG_PS | PG_RW | PG_V; 1830 pmap_kenter_pde(va, newpde); 1831 va += NBPDR; 1832 start += NBPDR; 1833 } else { 1834 pmap_kenter(va, start); 1835 va += PAGE_SIZE; 1836 start += PAGE_SIZE; 1837 } 1838 } 1839 pmap_invalidate_range_int(kernel_pmap, sva, va); 1840 *virt = va; 1841 return (sva); 1842 } 1843 1844 /* 1845 * Add a list of wired pages to the kva 1846 * this routine is only used for temporary 1847 * kernel mappings that do not need to have 1848 * page modification or references recorded. 1849 * Note that old mappings are simply written 1850 * over. The page *must* be wired. 1851 * Note: SMP coherent. Uses a ranged shootdown IPI. 1852 */ 1853 static void 1854 __CONCAT(PMTYPE, qenter)(vm_offset_t sva, vm_page_t *ma, int count) 1855 { 1856 pt_entry_t *endpte, oldpte, pa, *pte; 1857 vm_page_t m; 1858 1859 oldpte = 0; 1860 pte = vtopte(sva); 1861 endpte = pte + count; 1862 while (pte < endpte) { 1863 m = *ma++; 1864 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(kernel_pmap, 1865 m->md.pat_mode, 0); 1866 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { 1867 oldpte |= *pte; 1868 pte_store(pte, pa | pg_nx | PG_RW | PG_V); 1869 } 1870 pte++; 1871 } 1872 if (__predict_false((oldpte & PG_V) != 0)) 1873 pmap_invalidate_range_int(kernel_pmap, sva, sva + count * 1874 PAGE_SIZE); 1875 } 1876 1877 /* 1878 * This routine tears out page mappings from the 1879 * kernel -- it is meant only for temporary mappings. 1880 * Note: SMP coherent. Uses a ranged shootdown IPI. 1881 */ 1882 static void 1883 __CONCAT(PMTYPE, qremove)(vm_offset_t sva, int count) 1884 { 1885 vm_offset_t va; 1886 1887 va = sva; 1888 while (count-- > 0) { 1889 pmap_kremove(va); 1890 va += PAGE_SIZE; 1891 } 1892 pmap_invalidate_range_int(kernel_pmap, sva, va); 1893 } 1894 1895 /*************************************************** 1896 * Page table page management routines..... 1897 ***************************************************/ 1898 /* 1899 * Schedule the specified unused page table page to be freed. Specifically, 1900 * add the page to the specified list of pages that will be released to the 1901 * physical memory manager after the TLB has been updated. 1902 */ 1903 static __inline void 1904 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) 1905 { 1906 1907 if (set_PG_ZERO) 1908 m->flags |= PG_ZERO; 1909 else 1910 m->flags &= ~PG_ZERO; 1911 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1912 } 1913 1914 /* 1915 * Inserts the specified page table page into the specified pmap's collection 1916 * of idle page table pages. Each of a pmap's page table pages is responsible 1917 * for mapping a distinct range of virtual addresses. The pmap's collection is 1918 * ordered by this virtual address range. 1919 * 1920 * If "promoted" is false, then the page table page "mpte" must be zero filled; 1921 * "mpte"'s valid field will be set to 0. 
1922 * 1923 * If "promoted" is true and "allpte_PG_A_set" is false, then "mpte" must 1924 * contain valid mappings with identical attributes except for PG_A; "mpte"'s 1925 * valid field will be set to 1. 1926 * 1927 * If "promoted" and "allpte_PG_A_set" are both true, then "mpte" must contain 1928 * valid mappings with identical attributes including PG_A; "mpte"'s valid 1929 * field will be set to VM_PAGE_BITS_ALL. 1930 */ 1931 static __inline int 1932 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 1933 bool allpte_PG_A_set) 1934 { 1935 1936 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1937 KASSERT(promoted || !allpte_PG_A_set, 1938 ("a zero-filled PTP can't have PG_A set in every PTE")); 1939 mpte->valid = promoted ? (allpte_PG_A_set ? VM_PAGE_BITS_ALL : 1) : 0; 1940 return (vm_radix_insert(&pmap->pm_root, mpte)); 1941 } 1942 1943 /* 1944 * Removes the page table page mapping the specified virtual address from the 1945 * specified pmap's collection of idle page table pages, and returns it. 1946 * Otherwise, returns NULL if there is no page table page corresponding to the 1947 * specified virtual address. 1948 */ 1949 static __inline vm_page_t 1950 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1951 { 1952 1953 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1954 return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); 1955 } 1956 1957 /* 1958 * Decrements a page table page's reference count, which is used to record the 1959 * number of valid page table entries within the page. If the reference count 1960 * drops to zero, then the page table page is unmapped. Returns true if the 1961 * page table page was unmapped and false otherwise. 1962 */ 1963 static inline bool 1964 pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1965 { 1966 1967 --m->ref_count; 1968 if (m->ref_count == 0) { 1969 _pmap_unwire_ptp(pmap, m, free); 1970 return (true); 1971 } else 1972 return (false); 1973 } 1974 1975 static void 1976 _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1977 { 1978 1979 /* 1980 * unmap the page table page 1981 */ 1982 pmap->pm_pdir[m->pindex] = 0; 1983 --pmap->pm_stats.resident_count; 1984 1985 /* 1986 * There is not need to invalidate the recursive mapping since 1987 * we never instantiate such mapping for the usermode pmaps, 1988 * and never remove page table pages from the kernel pmap. 1989 * Put page on a list so that it is released since all TLB 1990 * shootdown is done. 1991 */ 1992 MPASS(pmap != kernel_pmap); 1993 pmap_add_delayed_free_list(m, free, true); 1994 } 1995 1996 /* 1997 * After removing a page table entry, this routine is used to 1998 * conditionally free the page, and manage the reference count. 1999 */ 2000 static int 2001 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) 2002 { 2003 pd_entry_t ptepde; 2004 vm_page_t mpte; 2005 2006 if (pmap == kernel_pmap) 2007 return (0); 2008 ptepde = *pmap_pde(pmap, va); 2009 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 2010 return (pmap_unwire_ptp(pmap, mpte, free)); 2011 } 2012 2013 /* 2014 * Release a page table page reference after a failed attempt to create a 2015 * mapping. 2016 */ 2017 static void 2018 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 2019 { 2020 struct spglist free; 2021 2022 SLIST_INIT(&free); 2023 if (pmap_unwire_ptp(pmap, mpte, &free)) { 2024 /* 2025 * Although "va" was never mapped, paging-structure caches 2026 * could nonetheless have entries that refer to the freed 2027 * page table pages. Invalidate those entries. 
2028 */ 2029 pmap_invalidate_page_int(pmap, va); 2030 vm_page_free_pages_toq(&free, true); 2031 } 2032 } 2033 2034 /* 2035 * Initialize the pmap for the swapper process. 2036 */ 2037 static void 2038 __CONCAT(PMTYPE, pinit0)(pmap_t pmap) 2039 { 2040 2041 PMAP_LOCK_INIT(pmap); 2042 pmap->pm_pdir = IdlePTD; 2043 #ifdef PMAP_PAE_COMP 2044 pmap->pm_pdpt = IdlePDPT; 2045 #endif 2046 vm_radix_init(&pmap->pm_root); 2047 CPU_ZERO(&pmap->pm_active); 2048 TAILQ_INIT(&pmap->pm_pvchunk); 2049 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2050 pmap_activate_boot(pmap); 2051 } 2052 2053 /* 2054 * Initialize a preallocated and zeroed pmap structure, 2055 * such as one in a vmspace structure. 2056 */ 2057 static int 2058 __CONCAT(PMTYPE, pinit)(pmap_t pmap) 2059 { 2060 int i; 2061 2062 /* 2063 * No need to allocate page table space yet but we do need a valid 2064 * page directory table. 2065 */ 2066 if (pmap->pm_pdir == NULL) { 2067 pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); 2068 if (pmap->pm_pdir == NULL) 2069 return (0); 2070 #ifdef PMAP_PAE_COMP 2071 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 2072 KASSERT(((vm_offset_t)pmap->pm_pdpt & 2073 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 2074 ("pmap_pinit: pdpt misaligned")); 2075 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 2076 ("pmap_pinit: pdpt above 4g")); 2077 #endif 2078 vm_radix_init(&pmap->pm_root); 2079 } 2080 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2081 ("pmap_pinit: pmap has reserved page table page(s)")); 2082 2083 /* 2084 * allocate the page directory page(s) 2085 */ 2086 for (i = 0; i < NPGPTD; i++) { 2087 pmap->pm_ptdpg[i] = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2088 VM_ALLOC_ZERO | VM_ALLOC_WAITOK); 2089 #ifdef PMAP_PAE_COMP 2090 pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(pmap->pm_ptdpg[i]) | PG_V; 2091 #endif 2092 } 2093 2094 pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD); 2095 #ifdef PMAP_PAE_COMP 2096 if ((cpu_feature & CPUID_PAT) == 0) { 2097 pmap_invalidate_cache_range( 2098 trunc_page((vm_offset_t)pmap->pm_pdpt), 2099 round_page((vm_offset_t)pmap->pm_pdpt + 2100 NPGPTD * sizeof(pdpt_entry_t))); 2101 } 2102 #endif 2103 2104 /* Install the trampoline mapping. */ 2105 pmap->pm_pdir[TRPTDI] = PTD[TRPTDI]; 2106 2107 CPU_ZERO(&pmap->pm_active); 2108 TAILQ_INIT(&pmap->pm_pvchunk); 2109 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2110 2111 return (1); 2112 } 2113 2114 /* 2115 * this routine is called if the page table page is not 2116 * mapped correctly. 2117 */ 2118 static vm_page_t 2119 _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) 2120 { 2121 vm_paddr_t ptepa; 2122 vm_page_t m; 2123 2124 /* 2125 * Allocate a page table page. 2126 */ 2127 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2128 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 2129 PMAP_UNLOCK(pmap); 2130 rw_wunlock(&pvh_global_lock); 2131 vm_wait(NULL); 2132 rw_wlock(&pvh_global_lock); 2133 PMAP_LOCK(pmap); 2134 } 2135 2136 /* 2137 * Indicate the need to retry. While waiting, the page table 2138 * page may have been allocated. 2139 */ 2140 return (NULL); 2141 } 2142 m->pindex = ptepindex; 2143 2144 /* 2145 * Map the pagetable page into the process address space, if 2146 * it isn't already there. 
2147 */ 2148 2149 pmap->pm_stats.resident_count++; 2150 2151 ptepa = VM_PAGE_TO_PHYS(m); 2152 KASSERT((pmap->pm_pdir[ptepindex] & PG_V) == 0, 2153 ("%s: page directory entry %#jx is valid", 2154 __func__, (uintmax_t)pmap->pm_pdir[ptepindex])); 2155 pmap->pm_pdir[ptepindex] = 2156 (pd_entry_t)(ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 2157 2158 return (m); 2159 } 2160 2161 static vm_page_t 2162 pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) 2163 { 2164 u_int ptepindex; 2165 pd_entry_t ptepa; 2166 vm_page_t m; 2167 2168 /* 2169 * Calculate pagetable page index 2170 */ 2171 ptepindex = va >> PDRSHIFT; 2172 retry: 2173 /* 2174 * Get the page directory entry 2175 */ 2176 ptepa = pmap->pm_pdir[ptepindex]; 2177 2178 /* 2179 * This supports switching from a 4MB page to a 2180 * normal 4K page. 2181 */ 2182 if (ptepa & PG_PS) { 2183 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); 2184 ptepa = pmap->pm_pdir[ptepindex]; 2185 } 2186 2187 /* 2188 * If the page table page is mapped, we just increment the 2189 * hold count, and activate it. 2190 */ 2191 if (ptepa) { 2192 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 2193 m->ref_count++; 2194 } else { 2195 /* 2196 * Here if the pte page isn't mapped, or if it has 2197 * been deallocated. 2198 */ 2199 m = _pmap_allocpte(pmap, ptepindex, flags); 2200 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2201 goto retry; 2202 } 2203 return (m); 2204 } 2205 2206 /*************************************************** 2207 * Pmap allocation/deallocation routines. 2208 ***************************************************/ 2209 2210 /* 2211 * Release any resources held by the given physical map. 2212 * Called when a pmap initialized by pmap_pinit is being released. 2213 * Should only be called if the map contains no valid mappings. 
2214 */ 2215 static void 2216 __CONCAT(PMTYPE, release)(pmap_t pmap) 2217 { 2218 vm_page_t m; 2219 int i; 2220 2221 KASSERT(pmap->pm_stats.resident_count == 0, 2222 ("pmap_release: pmap resident count %ld != 0", 2223 pmap->pm_stats.resident_count)); 2224 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2225 ("pmap_release: pmap has reserved page table page(s)")); 2226 KASSERT(CPU_EMPTY(&pmap->pm_active), 2227 ("releasing active pmap %p", pmap)); 2228 2229 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 2230 2231 for (i = 0; i < NPGPTD; i++) { 2232 m = pmap->pm_ptdpg[i]; 2233 #ifdef PMAP_PAE_COMP 2234 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 2235 ("pmap_release: got wrong ptd page")); 2236 #endif 2237 vm_page_unwire_noq(m); 2238 vm_page_free(m); 2239 } 2240 } 2241 2242 /* 2243 * grow the number of kernel page table entries, if needed 2244 */ 2245 static void 2246 __CONCAT(PMTYPE, growkernel)(vm_offset_t addr) 2247 { 2248 vm_paddr_t ptppaddr; 2249 vm_page_t nkpg; 2250 pd_entry_t newpdir; 2251 2252 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2253 addr = roundup2(addr, NBPDR); 2254 if (addr - 1 >= vm_map_max(kernel_map)) 2255 addr = vm_map_max(kernel_map); 2256 while (kernel_vm_end < addr) { 2257 if (pdir_pde(PTD, kernel_vm_end)) { 2258 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2259 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2260 kernel_vm_end = vm_map_max(kernel_map); 2261 break; 2262 } 2263 continue; 2264 } 2265 2266 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 2267 VM_ALLOC_ZERO); 2268 if (nkpg == NULL) 2269 panic("pmap_growkernel: no memory to grow kernel"); 2270 nkpg->pindex = kernel_vm_end >> PDRSHIFT; 2271 nkpt++; 2272 2273 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2274 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2275 pdir_pde(KPTD, kernel_vm_end) = newpdir; 2276 2277 pmap_kenter_pde(kernel_vm_end, newpdir); 2278 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2279 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2280 kernel_vm_end = vm_map_max(kernel_map); 2281 break; 2282 } 2283 } 2284 } 2285 2286 /*************************************************** 2287 * page management routines. 2288 ***************************************************/ 2289 2290 static const uint32_t pc_freemask[_NPCM] = { 2291 [0 ... _NPCM - 2] = PC_FREEN, 2292 [_NPCM - 1] = PC_FREEL 2293 }; 2294 2295 #ifdef PV_STATS 2296 extern int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2297 extern long pv_entry_frees, pv_entry_allocs; 2298 extern int pv_entry_spare; 2299 #endif 2300 2301 /* 2302 * We are in a serious low memory condition. Resort to 2303 * drastic measures to free some pages so we can allocate 2304 * another pv entry chunk. 
2305 */ 2306 static vm_page_t 2307 pmap_pv_reclaim(pmap_t locked_pmap) 2308 { 2309 struct pch newtail; 2310 struct pv_chunk *pc; 2311 struct md_page *pvh; 2312 pd_entry_t *pde; 2313 pmap_t pmap; 2314 pt_entry_t *pte, tpte; 2315 pv_entry_t pv; 2316 vm_offset_t va; 2317 vm_page_t m, m_pc; 2318 struct spglist free; 2319 uint32_t inuse; 2320 int bit, field, freed; 2321 2322 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2323 pmap = NULL; 2324 m_pc = NULL; 2325 SLIST_INIT(&free); 2326 TAILQ_INIT(&newtail); 2327 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2328 SLIST_EMPTY(&free))) { 2329 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2330 if (pmap != pc->pc_pmap) { 2331 if (pmap != NULL) { 2332 pmap_invalidate_all_int(pmap); 2333 if (pmap != locked_pmap) 2334 PMAP_UNLOCK(pmap); 2335 } 2336 pmap = pc->pc_pmap; 2337 /* Avoid deadlock and lock recursion. */ 2338 if (pmap > locked_pmap) 2339 PMAP_LOCK(pmap); 2340 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2341 pmap = NULL; 2342 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2343 continue; 2344 } 2345 } 2346 2347 /* 2348 * Destroy every non-wired, 4 KB page mapping in the chunk. 2349 */ 2350 freed = 0; 2351 for (field = 0; field < _NPCM; field++) { 2352 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2353 inuse != 0; inuse &= ~(1UL << bit)) { 2354 bit = bsfl(inuse); 2355 pv = &pc->pc_pventry[field * 32 + bit]; 2356 va = pv->pv_va; 2357 pde = pmap_pde(pmap, va); 2358 if ((*pde & PG_PS) != 0) 2359 continue; 2360 pte = __CONCAT(PMTYPE, pte)(pmap, va); 2361 tpte = *pte; 2362 if ((tpte & PG_W) == 0) 2363 tpte = pte_load_clear(pte); 2364 pmap_pte_release(pte); 2365 if ((tpte & PG_W) != 0) 2366 continue; 2367 KASSERT(tpte != 0, 2368 ("pmap_pv_reclaim: pmap %p va %x zero pte", 2369 pmap, va)); 2370 if ((tpte & PG_G) != 0) 2371 pmap_invalidate_page_int(pmap, va); 2372 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2373 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2374 vm_page_dirty(m); 2375 if ((tpte & PG_A) != 0) 2376 vm_page_aflag_set(m, PGA_REFERENCED); 2377 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2378 if (TAILQ_EMPTY(&m->md.pv_list) && 2379 (m->flags & PG_FICTITIOUS) == 0) { 2380 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2381 if (TAILQ_EMPTY(&pvh->pv_list)) { 2382 vm_page_aflag_clear(m, 2383 PGA_WRITEABLE); 2384 } 2385 } 2386 pc->pc_map[field] |= 1UL << bit; 2387 pmap_unuse_pt(pmap, va, &free); 2388 freed++; 2389 } 2390 } 2391 if (freed == 0) { 2392 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2393 continue; 2394 } 2395 /* Every freed mapping is for a 4 KB page. */ 2396 pmap->pm_stats.resident_count -= freed; 2397 PV_STAT(pv_entry_frees += freed); 2398 PV_STAT(pv_entry_spare += freed); 2399 pv_entry_count -= freed; 2400 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2401 for (field = 0; field < _NPCM; field++) 2402 if (pc->pc_map[field] != pc_freemask[field]) { 2403 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2404 pc_list); 2405 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2406 2407 /* 2408 * One freed pv entry in locked_pmap is 2409 * sufficient. 2410 */ 2411 if (pmap == locked_pmap) 2412 goto out; 2413 break; 2414 } 2415 if (field == _NPCM) { 2416 PV_STAT(pv_entry_spare -= _NPCPV); 2417 PV_STAT(pc_chunk_count--); 2418 PV_STAT(pc_chunk_frees++); 2419 /* Entire chunk is free; return it. 
*/ 2420 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2421 pmap_qremove((vm_offset_t)pc, 1); 2422 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2423 break; 2424 } 2425 } 2426 out: 2427 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2428 if (pmap != NULL) { 2429 pmap_invalidate_all_int(pmap); 2430 if (pmap != locked_pmap) 2431 PMAP_UNLOCK(pmap); 2432 } 2433 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2434 m_pc = SLIST_FIRST(&free); 2435 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2436 /* Recycle a freed page table page. */ 2437 m_pc->ref_count = 1; 2438 } 2439 vm_page_free_pages_toq(&free, true); 2440 return (m_pc); 2441 } 2442 2443 /* 2444 * free the pv_entry back to the free list 2445 */ 2446 static void 2447 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2448 { 2449 struct pv_chunk *pc; 2450 int idx, field, bit; 2451 2452 rw_assert(&pvh_global_lock, RA_WLOCKED); 2453 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2454 PV_STAT(pv_entry_frees++); 2455 PV_STAT(pv_entry_spare++); 2456 pv_entry_count--; 2457 pc = pv_to_chunk(pv); 2458 idx = pv - &pc->pc_pventry[0]; 2459 field = idx / 32; 2460 bit = idx % 32; 2461 pc->pc_map[field] |= 1ul << bit; 2462 for (idx = 0; idx < _NPCM; idx++) 2463 if (pc->pc_map[idx] != pc_freemask[idx]) { 2464 /* 2465 * 98% of the time, pc is already at the head of the 2466 * list. If it isn't already, move it to the head. 2467 */ 2468 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2469 pc)) { 2470 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2471 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2472 pc_list); 2473 } 2474 return; 2475 } 2476 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2477 free_pv_chunk(pc); 2478 } 2479 2480 static void 2481 free_pv_chunk(struct pv_chunk *pc) 2482 { 2483 vm_page_t m; 2484 2485 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2486 PV_STAT(pv_entry_spare -= _NPCPV); 2487 PV_STAT(pc_chunk_count--); 2488 PV_STAT(pc_chunk_frees++); 2489 /* entire chunk is free, return it */ 2490 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2491 pmap_qremove((vm_offset_t)pc, 1); 2492 vm_page_unwire_noq(m); 2493 vm_page_free(m); 2494 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2495 } 2496 2497 /* 2498 * get a new pv_entry, allocating a block from the system 2499 * when needed. 
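 *
 * Illustrative sketch (not part of this file): the common pattern for
 * recording a managed 4 KB mapping, mirroring pmap_insert_entry()
 * below; both the pvh global lock and the pmap lock must be held.
 *
 *     pv = get_pv_entry(pmap, false);
 *     pv->pv_va = va;
 *     TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);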
2500 */ 2501 static pv_entry_t 2502 get_pv_entry(pmap_t pmap, bool try) 2503 { 2504 static const struct timeval printinterval = { 60, 0 }; 2505 static struct timeval lastprint; 2506 int bit, field; 2507 pv_entry_t pv; 2508 struct pv_chunk *pc; 2509 vm_page_t m; 2510 2511 rw_assert(&pvh_global_lock, RA_WLOCKED); 2512 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2513 PV_STAT(pv_entry_allocs++); 2514 pv_entry_count++; 2515 if (pv_entry_count > pv_entry_high_water) 2516 if (ratecheck(&lastprint, &printinterval)) 2517 printf("Approaching the limit on PV entries, consider " 2518 "increasing either the vm.pmap.shpgperproc or the " 2519 "vm.pmap.pv_entry_max tunable.\n"); 2520 retry: 2521 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2522 if (pc != NULL) { 2523 for (field = 0; field < _NPCM; field++) { 2524 if (pc->pc_map[field]) { 2525 bit = bsfl(pc->pc_map[field]); 2526 break; 2527 } 2528 } 2529 if (field < _NPCM) { 2530 pv = &pc->pc_pventry[field * 32 + bit]; 2531 pc->pc_map[field] &= ~(1ul << bit); 2532 /* If this was the last item, move it to tail */ 2533 for (field = 0; field < _NPCM; field++) 2534 if (pc->pc_map[field] != 0) { 2535 PV_STAT(pv_entry_spare--); 2536 return (pv); /* not full, return */ 2537 } 2538 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2539 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2540 PV_STAT(pv_entry_spare--); 2541 return (pv); 2542 } 2543 } 2544 /* 2545 * Access to the ptelist "pv_vafree" is synchronized by the pvh 2546 * global lock. If "pv_vafree" is currently non-empty, it will 2547 * remain non-empty until pmap_ptelist_alloc() completes. 2548 */ 2549 if (pv_vafree == 0 || 2550 (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { 2551 if (try) { 2552 pv_entry_count--; 2553 PV_STAT(pc_chunk_tryfail++); 2554 return (NULL); 2555 } 2556 m = pmap_pv_reclaim(pmap); 2557 if (m == NULL) 2558 goto retry; 2559 } 2560 PV_STAT(pc_chunk_count++); 2561 PV_STAT(pc_chunk_allocs++); 2562 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2563 pmap_qenter((vm_offset_t)pc, &m, 1); 2564 pc->pc_pmap = pmap; 2565 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2566 for (field = 1; field < _NPCM; field++) 2567 pc->pc_map[field] = pc_freemask[field]; 2568 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2569 pv = &pc->pc_pventry[0]; 2570 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2571 PV_STAT(pv_entry_spare += _NPCPV - 1); 2572 return (pv); 2573 } 2574 2575 static __inline pv_entry_t 2576 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2577 { 2578 pv_entry_t pv; 2579 2580 rw_assert(&pvh_global_lock, RA_WLOCKED); 2581 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2582 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2583 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2584 break; 2585 } 2586 } 2587 return (pv); 2588 } 2589 2590 static void 2591 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2592 { 2593 struct md_page *pvh; 2594 pv_entry_t pv; 2595 vm_offset_t va_last; 2596 vm_page_t m; 2597 2598 rw_assert(&pvh_global_lock, RA_WLOCKED); 2599 KASSERT((pa & PDRMASK) == 0, 2600 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2601 2602 /* 2603 * Transfer the 4mpage's pv entry for this mapping to the first 2604 * page's pv list. 2605 */ 2606 pvh = pa_to_pvh(pa); 2607 va = trunc_4mpage(va); 2608 pv = pmap_pvh_remove(pvh, pmap, va); 2609 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2610 m = PHYS_TO_VM_PAGE(pa); 2611 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2612 /* Instantiate the remaining NPTEPG - 1 pv entries. 
*/ 2613 va_last = va + NBPDR - PAGE_SIZE; 2614 do { 2615 m++; 2616 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2617 ("pmap_pv_demote_pde: page %p is not managed", m)); 2618 va += PAGE_SIZE; 2619 pmap_insert_entry(pmap, va, m); 2620 } while (va < va_last); 2621 } 2622 2623 #if VM_NRESERVLEVEL > 0 2624 static void 2625 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2626 { 2627 struct md_page *pvh; 2628 pv_entry_t pv; 2629 vm_offset_t va_last; 2630 vm_page_t m; 2631 2632 rw_assert(&pvh_global_lock, RA_WLOCKED); 2633 KASSERT((pa & PDRMASK) == 0, 2634 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2635 2636 /* 2637 * Transfer the first page's pv entry for this mapping to the 2638 * 4mpage's pv list. Aside from avoiding the cost of a call 2639 * to get_pv_entry(), a transfer avoids the possibility that 2640 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2641 * removes one of the mappings that is being promoted. 2642 */ 2643 m = PHYS_TO_VM_PAGE(pa); 2644 va = trunc_4mpage(va); 2645 pv = pmap_pvh_remove(&m->md, pmap, va); 2646 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2647 pvh = pa_to_pvh(pa); 2648 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2649 /* Free the remaining NPTEPG - 1 pv entries. */ 2650 va_last = va + NBPDR - PAGE_SIZE; 2651 do { 2652 m++; 2653 va += PAGE_SIZE; 2654 pmap_pvh_free(&m->md, pmap, va); 2655 } while (va < va_last); 2656 } 2657 #endif /* VM_NRESERVLEVEL > 0 */ 2658 2659 static void 2660 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2661 { 2662 pv_entry_t pv; 2663 2664 pv = pmap_pvh_remove(pvh, pmap, va); 2665 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2666 free_pv_entry(pmap, pv); 2667 } 2668 2669 static void 2670 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2671 { 2672 struct md_page *pvh; 2673 2674 rw_assert(&pvh_global_lock, RA_WLOCKED); 2675 pmap_pvh_free(&m->md, pmap, va); 2676 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 2677 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2678 if (TAILQ_EMPTY(&pvh->pv_list)) 2679 vm_page_aflag_clear(m, PGA_WRITEABLE); 2680 } 2681 } 2682 2683 /* 2684 * Create a pv entry for page at pa for 2685 * (pmap, va). 2686 */ 2687 static void 2688 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2689 { 2690 pv_entry_t pv; 2691 2692 rw_assert(&pvh_global_lock, RA_WLOCKED); 2693 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2694 pv = get_pv_entry(pmap, false); 2695 pv->pv_va = va; 2696 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2697 } 2698 2699 /* 2700 * Conditionally create a pv entry. 2701 */ 2702 static bool 2703 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2704 { 2705 pv_entry_t pv; 2706 2707 rw_assert(&pvh_global_lock, RA_WLOCKED); 2708 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2709 if (pv_entry_count < pv_entry_high_water && 2710 (pv = get_pv_entry(pmap, true)) != NULL) { 2711 pv->pv_va = va; 2712 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2713 return (true); 2714 } else 2715 return (false); 2716 } 2717 2718 /* 2719 * Create the pv entries for each of the pages within a superpage. 
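 *
 * Illustrative note: the single pv entry created here lives on the
 * 4 MB page's list obtained via pa_to_pvh(pde & PG_PS_FRAME) and
 * stands in for all of the constituent 4 KB pages, so a superpage
 * mapping costs one pv entry rather than NPTEPG of them; per-page
 * entries on each vm_page's own md.pv_list appear only if the mapping
 * is later demoted.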
2720 */ 2721 static bool 2722 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags) 2723 { 2724 struct md_page *pvh; 2725 pv_entry_t pv; 2726 bool noreclaim; 2727 2728 rw_assert(&pvh_global_lock, RA_WLOCKED); 2729 noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; 2730 if ((noreclaim && pv_entry_count >= pv_entry_high_water) || 2731 (pv = get_pv_entry(pmap, noreclaim)) == NULL) 2732 return (false); 2733 pv->pv_va = va; 2734 pvh = pa_to_pvh(pde & PG_PS_FRAME); 2735 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2736 return (true); 2737 } 2738 2739 /* 2740 * Fills a page table page with mappings to consecutive physical pages. 2741 */ 2742 static void 2743 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2744 { 2745 pt_entry_t *pte; 2746 2747 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2748 *pte = newpte; 2749 newpte += PAGE_SIZE; 2750 } 2751 } 2752 2753 /* 2754 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2755 * 2- or 4MB page mapping is invalidated. 2756 */ 2757 static bool 2758 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2759 { 2760 pd_entry_t newpde, oldpde; 2761 pt_entry_t *firstpte, newpte; 2762 vm_paddr_t mptepa; 2763 vm_page_t mpte; 2764 struct spglist free; 2765 vm_offset_t sva; 2766 2767 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2768 oldpde = *pde; 2769 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2770 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2771 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2772 NULL) { 2773 KASSERT((oldpde & PG_W) == 0, 2774 ("pmap_demote_pde: page table page for a wired mapping" 2775 " is missing")); 2776 2777 /* 2778 * Invalidate the 2- or 4MB page mapping and return 2779 * "failure" if the mapping was never accessed or the 2780 * allocation of the new page table page fails. 2781 */ 2782 if ((oldpde & PG_A) == 0 || 2783 (mpte = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { 2784 SLIST_INIT(&free); 2785 sva = trunc_4mpage(va); 2786 pmap_remove_pde(pmap, pde, sva, &free); 2787 if ((oldpde & PG_G) == 0) 2788 pmap_invalidate_pde_page(pmap, sva, oldpde); 2789 vm_page_free_pages_toq(&free, true); 2790 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2791 " in pmap %p", va, pmap); 2792 return (false); 2793 } 2794 mpte->pindex = va >> PDRSHIFT; 2795 if (pmap != kernel_pmap) { 2796 mpte->ref_count = NPTEPG; 2797 pmap->pm_stats.resident_count++; 2798 } 2799 } 2800 mptepa = VM_PAGE_TO_PHYS(mpte); 2801 2802 /* 2803 * If the page mapping is in the kernel's address space, then the 2804 * KPTmap can provide access to the page table page. Otherwise, 2805 * temporarily map the page table page (mpte) into the kernel's 2806 * address space at either PADDR1 or PADDR2. 
2807 */ 2808 if (pmap == kernel_pmap) 2809 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2810 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 2811 if ((*PMAP1 & PG_FRAME) != mptepa) { 2812 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2813 #ifdef SMP 2814 PMAP1cpu = PCPU_GET(cpuid); 2815 #endif 2816 invlcaddr(PADDR1); 2817 PMAP1changed++; 2818 } else 2819 #ifdef SMP 2820 if (PMAP1cpu != PCPU_GET(cpuid)) { 2821 PMAP1cpu = PCPU_GET(cpuid); 2822 invlcaddr(PADDR1); 2823 PMAP1changedcpu++; 2824 } else 2825 #endif 2826 PMAP1unchanged++; 2827 firstpte = PADDR1; 2828 } else { 2829 mtx_lock(&PMAP2mutex); 2830 if ((*PMAP2 & PG_FRAME) != mptepa) { 2831 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2832 pmap_invalidate_page_int(kernel_pmap, 2833 (vm_offset_t)PADDR2); 2834 } 2835 firstpte = PADDR2; 2836 } 2837 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2838 KASSERT((oldpde & PG_A) != 0, 2839 ("pmap_demote_pde: oldpde is missing PG_A")); 2840 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2841 ("pmap_demote_pde: oldpde is missing PG_M")); 2842 newpte = oldpde & ~PG_PS; 2843 if ((newpte & PG_PDE_PAT) != 0) 2844 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2845 2846 /* 2847 * If the PTP is not leftover from an earlier promotion or it does not 2848 * have PG_A set in every PTE, then fill it. The new PTEs will all 2849 * have PG_A set. 2850 */ 2851 if (!vm_page_all_valid(mpte)) 2852 pmap_fill_ptp(firstpte, newpte); 2853 2854 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2855 ("pmap_demote_pde: firstpte and newpte map different physical" 2856 " addresses")); 2857 2858 /* 2859 * If the mapping has changed attributes, update the PTEs. 2860 */ 2861 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2862 pmap_fill_ptp(firstpte, newpte); 2863 2864 /* 2865 * Demote the mapping. This pmap is locked. The old PDE has 2866 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2867 * set. Thus, there is no danger of a race with another 2868 * processor changing the setting of PG_A and/or PG_M between 2869 * the read above and the store below. 2870 */ 2871 if (workaround_erratum383) 2872 pmap_update_pde(pmap, va, pde, newpde); 2873 else if (pmap == kernel_pmap) 2874 pmap_kenter_pde(va, newpde); 2875 else 2876 pde_store(pde, newpde); 2877 if (firstpte == PADDR2) 2878 mtx_unlock(&PMAP2mutex); 2879 2880 /* 2881 * Invalidate the recursive mapping of the page table page. 2882 */ 2883 pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); 2884 2885 /* 2886 * Demote the pv entry. This depends on the earlier demotion 2887 * of the mapping. Specifically, the (re)creation of a per- 2888 * page pv entry might trigger the execution of pmap_collect(), 2889 * which might reclaim a newly (re)created per-page pv entry 2890 * and destroy the associated mapping. In order to destroy 2891 * the mapping, the PDE must have already changed from mapping 2892 * the 2mpage to referencing the page table page. 2893 */ 2894 if ((oldpde & PG_MANAGED) != 0) 2895 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2896 2897 pmap_pde_demotions++; 2898 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" 2899 " in pmap %p", va, pmap); 2900 return (true); 2901 } 2902 2903 /* 2904 * Removes a 2- or 4MB page mapping from the kernel pmap. 
2905 */ 2906 static void 2907 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2908 { 2909 pd_entry_t newpde; 2910 vm_paddr_t mptepa; 2911 vm_page_t mpte; 2912 2913 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2914 mpte = pmap_remove_pt_page(pmap, va); 2915 if (mpte == NULL) 2916 panic("pmap_remove_kernel_pde: Missing pt page."); 2917 2918 mptepa = VM_PAGE_TO_PHYS(mpte); 2919 newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; 2920 2921 /* 2922 * If this page table page was unmapped by a promotion, then it 2923 * contains valid mappings. Zero it to invalidate those mappings. 2924 */ 2925 if (vm_page_any_valid(mpte)) 2926 pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); 2927 2928 /* 2929 * Remove the mapping. 2930 */ 2931 if (workaround_erratum383) 2932 pmap_update_pde(pmap, va, pde, newpde); 2933 else 2934 pmap_kenter_pde(va, newpde); 2935 2936 /* 2937 * Invalidate the recursive mapping of the page table page. 2938 */ 2939 pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); 2940 } 2941 2942 /* 2943 * pmap_remove_pde: do the things to unmap a superpage in a process 2944 */ 2945 static void 2946 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2947 struct spglist *free) 2948 { 2949 struct md_page *pvh; 2950 pd_entry_t oldpde; 2951 vm_offset_t eva, va; 2952 vm_page_t m, mpte; 2953 2954 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2955 KASSERT((sva & PDRMASK) == 0, 2956 ("pmap_remove_pde: sva is not 4mpage aligned")); 2957 oldpde = pte_load_clear(pdq); 2958 if (oldpde & PG_W) 2959 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2960 2961 /* 2962 * Machines that don't support invlpg, also don't support 2963 * PG_G. 2964 */ 2965 if ((oldpde & PG_G) != 0) 2966 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 2967 2968 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2969 if (oldpde & PG_MANAGED) { 2970 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2971 pmap_pvh_free(pvh, pmap, sva); 2972 eva = sva + NBPDR; 2973 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2974 va < eva; va += PAGE_SIZE, m++) { 2975 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2976 vm_page_dirty(m); 2977 if (oldpde & PG_A) 2978 vm_page_aflag_set(m, PGA_REFERENCED); 2979 if (TAILQ_EMPTY(&m->md.pv_list) && 2980 TAILQ_EMPTY(&pvh->pv_list)) 2981 vm_page_aflag_clear(m, PGA_WRITEABLE); 2982 } 2983 } 2984 if (pmap == kernel_pmap) { 2985 pmap_remove_kernel_pde(pmap, pdq, sva); 2986 } else { 2987 mpte = pmap_remove_pt_page(pmap, sva); 2988 if (mpte != NULL) { 2989 KASSERT(vm_page_any_valid(mpte), 2990 ("pmap_remove_pde: pte page not promoted")); 2991 pmap->pm_stats.resident_count--; 2992 KASSERT(mpte->ref_count == NPTEPG, 2993 ("pmap_remove_pde: pte page ref count error")); 2994 mpte->ref_count = 0; 2995 pmap_add_delayed_free_list(mpte, free, false); 2996 } 2997 } 2998 } 2999 3000 /* 3001 * pmap_remove_pte: do the things to unmap a page in a process 3002 */ 3003 static int 3004 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 3005 struct spglist *free) 3006 { 3007 pt_entry_t oldpte; 3008 vm_page_t m; 3009 3010 rw_assert(&pvh_global_lock, RA_WLOCKED); 3011 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3012 oldpte = pte_load_clear(ptq); 3013 KASSERT(oldpte != 0, 3014 ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); 3015 if (oldpte & PG_W) 3016 pmap->pm_stats.wired_count -= 1; 3017 /* 3018 * Machines that don't support invlpg, also don't support 3019 * PG_G. 
3020 */ 3021 if (oldpte & PG_G) 3022 pmap_invalidate_page_int(kernel_pmap, va); 3023 pmap->pm_stats.resident_count -= 1; 3024 if (oldpte & PG_MANAGED) { 3025 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 3026 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3027 vm_page_dirty(m); 3028 if (oldpte & PG_A) 3029 vm_page_aflag_set(m, PGA_REFERENCED); 3030 pmap_remove_entry(pmap, m, va); 3031 } 3032 return (pmap_unuse_pt(pmap, va, free)); 3033 } 3034 3035 /* 3036 * Remove a single page from a process address space 3037 */ 3038 static void 3039 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 3040 { 3041 pt_entry_t *pte; 3042 3043 rw_assert(&pvh_global_lock, RA_WLOCKED); 3044 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 3045 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3046 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 3047 return; 3048 pmap_remove_pte(pmap, pte, va, free); 3049 pmap_invalidate_page_int(pmap, va); 3050 } 3051 3052 /* 3053 * Removes the specified range of addresses from the page table page. 3054 */ 3055 static bool 3056 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3057 struct spglist *free) 3058 { 3059 pt_entry_t *pte; 3060 bool anyvalid; 3061 3062 rw_assert(&pvh_global_lock, RA_WLOCKED); 3063 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 3064 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3065 anyvalid = false; 3066 for (pte = pmap_pte_quick(pmap, sva); sva != eva; pte++, 3067 sva += PAGE_SIZE) { 3068 if (*pte == 0) 3069 continue; 3070 3071 /* 3072 * The TLB entry for a PG_G mapping is invalidated by 3073 * pmap_remove_pte(). 3074 */ 3075 if ((*pte & PG_G) == 0) 3076 anyvalid = true; 3077 3078 if (pmap_remove_pte(pmap, pte, sva, free)) 3079 break; 3080 } 3081 return (anyvalid); 3082 } 3083 3084 /* 3085 * Remove the given range of addresses from the specified map. 3086 * 3087 * It is assumed that the start and end are properly 3088 * rounded to the page size. 3089 */ 3090 static void 3091 __CONCAT(PMTYPE, remove)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3092 { 3093 vm_offset_t pdnxt; 3094 pd_entry_t ptpaddr; 3095 struct spglist free; 3096 int anyvalid; 3097 3098 /* 3099 * Perform an unsynchronized read. This is, however, safe. 3100 */ 3101 if (pmap->pm_stats.resident_count == 0) 3102 return; 3103 3104 anyvalid = 0; 3105 SLIST_INIT(&free); 3106 3107 rw_wlock(&pvh_global_lock); 3108 sched_pin(); 3109 PMAP_LOCK(pmap); 3110 3111 /* 3112 * special handling of removing one page. a very 3113 * common operation and easy to short circuit some 3114 * code. 3115 */ 3116 if ((sva + PAGE_SIZE == eva) && 3117 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 3118 pmap_remove_page(pmap, sva, &free); 3119 goto out; 3120 } 3121 3122 for (; sva < eva; sva = pdnxt) { 3123 u_int pdirindex; 3124 3125 /* 3126 * Calculate index for next page table. 3127 */ 3128 pdnxt = (sva + NBPDR) & ~PDRMASK; 3129 if (pdnxt < sva) 3130 pdnxt = eva; 3131 if (pmap->pm_stats.resident_count == 0) 3132 break; 3133 3134 pdirindex = sva >> PDRSHIFT; 3135 ptpaddr = pmap->pm_pdir[pdirindex]; 3136 3137 /* 3138 * Weed out invalid mappings. Note: we assume that the page 3139 * directory table is always allocated, and in kernel virtual. 3140 */ 3141 if (ptpaddr == 0) 3142 continue; 3143 3144 /* 3145 * Check for large page. 3146 */ 3147 if ((ptpaddr & PG_PS) != 0) { 3148 /* 3149 * Are we removing the entire large page? If not, 3150 * demote the mapping and fall through. 
3151 */ 3152 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3153 /* 3154 * The TLB entry for a PG_G mapping is 3155 * invalidated by pmap_remove_pde(). 3156 */ 3157 if ((ptpaddr & PG_G) == 0) 3158 anyvalid = 1; 3159 pmap_remove_pde(pmap, 3160 &pmap->pm_pdir[pdirindex], sva, &free); 3161 continue; 3162 } else if (!pmap_demote_pde(pmap, 3163 &pmap->pm_pdir[pdirindex], sva)) { 3164 /* The large page mapping was destroyed. */ 3165 continue; 3166 } 3167 } 3168 3169 /* 3170 * Limit our scan to either the end of the va represented 3171 * by the current page table page, or to the end of the 3172 * range being removed. 3173 */ 3174 if (pdnxt > eva) 3175 pdnxt = eva; 3176 3177 if (pmap_remove_ptes(pmap, sva, pdnxt, &free)) 3178 anyvalid = 1; 3179 } 3180 out: 3181 sched_unpin(); 3182 if (anyvalid) 3183 pmap_invalidate_all_int(pmap); 3184 rw_wunlock(&pvh_global_lock); 3185 PMAP_UNLOCK(pmap); 3186 vm_page_free_pages_toq(&free, true); 3187 } 3188 3189 /* 3190 * Routine: pmap_remove_all 3191 * Function: 3192 * Removes this physical page from 3193 * all physical maps in which it resides. 3194 * Reflects back modify bits to the pager. 3195 * 3196 * Notes: 3197 * Original versions of this routine were very 3198 * inefficient because they iteratively called 3199 * pmap_remove (slow...) 3200 */ 3201 3202 static void 3203 __CONCAT(PMTYPE, remove_all)(vm_page_t m) 3204 { 3205 struct md_page *pvh; 3206 pv_entry_t pv; 3207 pmap_t pmap; 3208 pt_entry_t *pte, tpte; 3209 pd_entry_t *pde; 3210 vm_offset_t va; 3211 struct spglist free; 3212 3213 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3214 ("pmap_remove_all: page %p is not managed", m)); 3215 SLIST_INIT(&free); 3216 rw_wlock(&pvh_global_lock); 3217 sched_pin(); 3218 if ((m->flags & PG_FICTITIOUS) != 0) 3219 goto small_mappings; 3220 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3221 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3222 va = pv->pv_va; 3223 pmap = PV_PMAP(pv); 3224 PMAP_LOCK(pmap); 3225 pde = pmap_pde(pmap, va); 3226 (void)pmap_demote_pde(pmap, pde, va); 3227 PMAP_UNLOCK(pmap); 3228 } 3229 small_mappings: 3230 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3231 pmap = PV_PMAP(pv); 3232 PMAP_LOCK(pmap); 3233 pmap->pm_stats.resident_count--; 3234 pde = pmap_pde(pmap, pv->pv_va); 3235 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3236 " a 4mpage in page %p's pv list", m)); 3237 pte = pmap_pte_quick(pmap, pv->pv_va); 3238 tpte = pte_load_clear(pte); 3239 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", 3240 pmap, pv->pv_va)); 3241 if (tpte & PG_W) 3242 pmap->pm_stats.wired_count--; 3243 if (tpte & PG_A) 3244 vm_page_aflag_set(m, PGA_REFERENCED); 3245 3246 /* 3247 * Update the vm_page_t clean and reference bits. 
3248 */ 3249 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3250 vm_page_dirty(m); 3251 pmap_unuse_pt(pmap, pv->pv_va, &free); 3252 pmap_invalidate_page_int(pmap, pv->pv_va); 3253 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3254 free_pv_entry(pmap, pv); 3255 PMAP_UNLOCK(pmap); 3256 } 3257 vm_page_aflag_clear(m, PGA_WRITEABLE); 3258 sched_unpin(); 3259 rw_wunlock(&pvh_global_lock); 3260 vm_page_free_pages_toq(&free, true); 3261 } 3262 3263 /* 3264 * pmap_protect_pde: do the things to protect a 4mpage in a process 3265 */ 3266 static bool 3267 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3268 { 3269 pd_entry_t newpde, oldpde; 3270 vm_page_t m, mt; 3271 bool anychanged; 3272 3273 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3274 KASSERT((sva & PDRMASK) == 0, 3275 ("pmap_protect_pde: sva is not 4mpage aligned")); 3276 anychanged = false; 3277 retry: 3278 oldpde = newpde = *pde; 3279 if ((prot & VM_PROT_WRITE) == 0) { 3280 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 3281 (PG_MANAGED | PG_M | PG_RW)) { 3282 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3283 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 3284 vm_page_dirty(mt); 3285 } 3286 newpde &= ~(PG_RW | PG_M); 3287 } 3288 #ifdef PMAP_PAE_COMP 3289 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 3290 newpde |= pg_nx; 3291 #endif 3292 if (newpde != oldpde) { 3293 /* 3294 * As an optimization to future operations on this PDE, clear 3295 * PG_PROMOTED. The impending invalidation will remove any 3296 * lingering 4KB page mappings from the TLB. 3297 */ 3298 if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) 3299 goto retry; 3300 if ((oldpde & PG_G) != 0) 3301 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 3302 else 3303 anychanged = true; 3304 } 3305 return (anychanged); 3306 } 3307 3308 /* 3309 * Set the physical protection on the 3310 * specified range of this map as requested. 3311 */ 3312 static void 3313 __CONCAT(PMTYPE, protect)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3314 vm_prot_t prot) 3315 { 3316 vm_offset_t pdnxt; 3317 pd_entry_t ptpaddr; 3318 pt_entry_t *pte; 3319 bool anychanged, pv_lists_locked; 3320 3321 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3322 if (prot == VM_PROT_NONE) { 3323 pmap_remove(pmap, sva, eva); 3324 return; 3325 } 3326 3327 #ifdef PMAP_PAE_COMP 3328 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 3329 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 3330 return; 3331 #else 3332 if (prot & VM_PROT_WRITE) 3333 return; 3334 #endif 3335 3336 if (pmap_is_current(pmap)) 3337 pv_lists_locked = false; 3338 else { 3339 pv_lists_locked = true; 3340 resume: 3341 rw_wlock(&pvh_global_lock); 3342 sched_pin(); 3343 } 3344 anychanged = false; 3345 3346 PMAP_LOCK(pmap); 3347 for (; sva < eva; sva = pdnxt) { 3348 pt_entry_t obits, pbits; 3349 u_int pdirindex; 3350 3351 pdnxt = (sva + NBPDR) & ~PDRMASK; 3352 if (pdnxt < sva) 3353 pdnxt = eva; 3354 3355 pdirindex = sva >> PDRSHIFT; 3356 ptpaddr = pmap->pm_pdir[pdirindex]; 3357 3358 /* 3359 * Weed out invalid mappings. Note: we assume that the page 3360 * directory table is always allocated, and in kernel virtual. 3361 */ 3362 if (ptpaddr == 0) 3363 continue; 3364 3365 /* 3366 * Check for large page. 3367 */ 3368 if ((ptpaddr & PG_PS) != 0) { 3369 /* 3370 * Are we protecting the entire large page? If not, 3371 * demote the mapping and fall through. 3372 */ 3373 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3374 /* 3375 * The TLB entry for a PG_G mapping is 3376 * invalidated by pmap_protect_pde(). 
3377 */ 3378 if (pmap_protect_pde(pmap, 3379 &pmap->pm_pdir[pdirindex], sva, prot)) 3380 anychanged = true; 3381 continue; 3382 } else { 3383 if (!pv_lists_locked) { 3384 pv_lists_locked = true; 3385 if (!rw_try_wlock(&pvh_global_lock)) { 3386 if (anychanged) 3387 pmap_invalidate_all_int( 3388 pmap); 3389 PMAP_UNLOCK(pmap); 3390 goto resume; 3391 } 3392 sched_pin(); 3393 } 3394 if (!pmap_demote_pde(pmap, 3395 &pmap->pm_pdir[pdirindex], sva)) { 3396 /* 3397 * The large page mapping was 3398 * destroyed. 3399 */ 3400 continue; 3401 } 3402 } 3403 } 3404 3405 if (pdnxt > eva) 3406 pdnxt = eva; 3407 3408 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3409 sva += PAGE_SIZE) { 3410 vm_page_t m; 3411 3412 retry: 3413 /* 3414 * Regardless of whether a pte is 32 or 64 bits in 3415 * size, PG_RW, PG_A, and PG_M are among the least 3416 * significant 32 bits. 3417 */ 3418 obits = pbits = *pte; 3419 if ((pbits & PG_V) == 0) 3420 continue; 3421 3422 if ((prot & VM_PROT_WRITE) == 0) { 3423 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3424 (PG_MANAGED | PG_M | PG_RW)) { 3425 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3426 vm_page_dirty(m); 3427 } 3428 pbits &= ~(PG_RW | PG_M); 3429 } 3430 #ifdef PMAP_PAE_COMP 3431 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 3432 pbits |= pg_nx; 3433 #endif 3434 3435 if (pbits != obits) { 3436 #ifdef PMAP_PAE_COMP 3437 if (!atomic_cmpset_64(pte, obits, pbits)) 3438 goto retry; 3439 #else 3440 if (!atomic_cmpset_int((u_int *)pte, obits, 3441 pbits)) 3442 goto retry; 3443 #endif 3444 if (obits & PG_G) 3445 pmap_invalidate_page_int(pmap, sva); 3446 else 3447 anychanged = true; 3448 } 3449 } 3450 } 3451 if (anychanged) 3452 pmap_invalidate_all_int(pmap); 3453 if (pv_lists_locked) { 3454 sched_unpin(); 3455 rw_wunlock(&pvh_global_lock); 3456 } 3457 PMAP_UNLOCK(pmap); 3458 } 3459 3460 #if VM_NRESERVLEVEL > 0 3461 /* 3462 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3463 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 3464 * For promotion to occur, two conditions must be met: (1) the 4KB page 3465 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3466 * mappings must have identical characteristics. 3467 * 3468 * Managed (PG_MANAGED) mappings within the kernel address space are not 3469 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3470 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3471 * pmap. 3472 */ 3473 static bool 3474 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte) 3475 { 3476 pd_entry_t newpde; 3477 pt_entry_t allpte_PG_A, *firstpte, oldpte, pa, *pte; 3478 #ifdef KTR 3479 vm_offset_t oldpteva; 3480 #endif 3481 3482 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3483 if (!pg_ps_enabled) 3484 return (false); 3485 3486 /* 3487 * Examine the first PTE in the specified PTP. Abort if this PTE is 3488 * either invalid or does not map the first 4KB physical page 3489 * within a 2- or 4MB page. 
3490 */ 3491 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3492 setpde: 3493 newpde = *firstpte; 3494 if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) { 3495 pmap_pde_p_failures++; 3496 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3497 " in pmap %p", va, pmap); 3498 return (false); 3499 } 3500 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3501 pmap_pde_p_failures++; 3502 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3503 " in pmap %p", va, pmap); 3504 return (false); 3505 } 3506 3507 /* 3508 * Both here and in the below "for" loop, to allow for repromotion 3509 * after MADV_FREE, conditionally write protect a clean PTE before 3510 * possibly aborting the promotion due to other PTE attributes. Why? 3511 * Suppose that MADV_FREE is applied to a part of a superpage, the 3512 * address range [S, E). pmap_advise() will demote the superpage 3513 * mapping, destroy the 4KB page mapping at the end of [S, E), and 3514 * clear PG_M and PG_A in the PTEs for the rest of [S, E). Later, 3515 * imagine that the memory in [S, E) is recycled, but the last 4KB 3516 * page in [S, E) is not the last to be rewritten, or simply accessed. 3517 * In other words, there is still a 4KB page in [S, E), call it P, 3518 * that is writeable but PG_M and PG_A are clear in P's PTE. Unless 3519 * we write protect P before aborting the promotion, if and when P is 3520 * finally rewritten, there won't be a page fault to trigger 3521 * repromotion. 3522 */ 3523 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3524 /* 3525 * When PG_M is already clear, PG_RW can be cleared without 3526 * a TLB invalidation. 3527 */ 3528 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3529 ~PG_RW)) 3530 goto setpde; 3531 newpde &= ~PG_RW; 3532 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 3533 " in pmap %p", va & ~PDRMASK, pmap); 3534 } 3535 3536 /* 3537 * Examine each of the other PTEs in the specified PTP. Abort if this 3538 * PTE maps an unexpected 4KB physical page or does not have identical 3539 * characteristics to the first PTE. 3540 */ 3541 allpte_PG_A = newpde & PG_A; 3542 pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE; 3543 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3544 setpte: 3545 oldpte = *pte; 3546 if ((oldpte & (PG_FRAME | PG_V)) != pa) { 3547 pmap_pde_p_failures++; 3548 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3549 " in pmap %p", va, pmap); 3550 return (false); 3551 } 3552 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3553 /* 3554 * When PG_M is already clear, PG_RW can be cleared 3555 * without a TLB invalidation. 3556 */ 3557 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3558 oldpte & ~PG_RW)) 3559 goto setpte; 3560 oldpte &= ~PG_RW; 3561 #ifdef KTR 3562 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3563 (va & ~PDRMASK); 3564 #endif 3565 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3566 " in pmap %p", oldpteva, pmap); 3567 } 3568 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3569 pmap_pde_p_failures++; 3570 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3571 " in pmap %p", va, pmap); 3572 return (false); 3573 } 3574 allpte_PG_A &= oldpte; 3575 pa -= PAGE_SIZE; 3576 } 3577 3578 /* 3579 * Unless all PTEs have PG_A set, clear it from the superpage mapping, 3580 * so that promotions triggered by speculative mappings, such as 3581 * pmap_enter_quick(), don't automatically mark the underlying pages 3582 * as referenced. 
3583 */ 3584 newpde &= ~PG_A | allpte_PG_A; 3585 3586 /* 3587 * Save the PTP in its current state until the PDE mapping the 3588 * superpage is demoted by pmap_demote_pde() or destroyed by 3589 * pmap_remove_pde(). If PG_A is not set in every PTE, then request 3590 * that the PTP be refilled on demotion. 3591 */ 3592 if (mpte == NULL) 3593 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3594 KASSERT(mpte >= vm_page_array && 3595 mpte < &vm_page_array[vm_page_array_size], 3596 ("pmap_promote_pde: page table page is out of range")); 3597 KASSERT(mpte->pindex == va >> PDRSHIFT, 3598 ("pmap_promote_pde: page table page's pindex is wrong")); 3599 if (pmap_insert_pt_page(pmap, mpte, true, allpte_PG_A != 0)) { 3600 pmap_pde_p_failures++; 3601 CTR2(KTR_PMAP, 3602 "pmap_promote_pde: failure for va %#x in pmap %p", va, 3603 pmap); 3604 return (false); 3605 } 3606 3607 /* 3608 * Promote the pv entries. 3609 */ 3610 if ((newpde & PG_MANAGED) != 0) 3611 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3612 3613 /* 3614 * Propagate the PAT index to its proper position. 3615 */ 3616 if ((newpde & PG_PTE_PAT) != 0) 3617 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3618 3619 /* 3620 * Map the superpage. 3621 */ 3622 if (workaround_erratum383) 3623 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3624 else if (pmap == kernel_pmap) 3625 pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); 3626 else 3627 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 3628 3629 pmap_pde_promotions++; 3630 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3631 " in pmap %p", va, pmap); 3632 return (true); 3633 } 3634 #endif /* VM_NRESERVLEVEL > 0 */ 3635 3636 /* 3637 * Insert the given physical page (p) at 3638 * the specified virtual address (v) in the 3639 * target physical map with the protection requested. 3640 * 3641 * If specified, the page will be wired down, meaning 3642 * that the related pte can not be reclaimed. 3643 * 3644 * NB: This is the only routine which MAY NOT lazy-evaluate 3645 * or lose information. That is, this routine must actually 3646 * insert this page into the given map NOW. 
3647 */ 3648 static int 3649 __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m, 3650 vm_prot_t prot, u_int flags, int8_t psind) 3651 { 3652 pd_entry_t *pde; 3653 pt_entry_t *pte; 3654 pt_entry_t newpte, origpte; 3655 pv_entry_t pv; 3656 vm_paddr_t opa, pa; 3657 vm_page_t mpte, om; 3658 int rv; 3659 3660 va = trunc_page(va); 3661 KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) || 3662 (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS), 3663 ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va)); 3664 KASSERT(va < PMAP_TRM_MIN_ADDRESS, 3665 ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)", 3666 va)); 3667 KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 || 3668 !VA_IS_CLEANMAP(va), 3669 ("pmap_enter: managed mapping within the clean submap")); 3670 if ((m->oflags & VPO_UNMANAGED) == 0) 3671 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3672 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 3673 ("pmap_enter: flags %u has reserved bits set", flags)); 3674 pa = VM_PAGE_TO_PHYS(m); 3675 newpte = (pt_entry_t)(pa | PG_A | PG_V); 3676 if ((flags & VM_PROT_WRITE) != 0) 3677 newpte |= PG_M; 3678 if ((prot & VM_PROT_WRITE) != 0) 3679 newpte |= PG_RW; 3680 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 3681 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 3682 #ifdef PMAP_PAE_COMP 3683 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 3684 newpte |= pg_nx; 3685 #endif 3686 if ((flags & PMAP_ENTER_WIRED) != 0) 3687 newpte |= PG_W; 3688 if (pmap != kernel_pmap) 3689 newpte |= PG_U; 3690 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 3691 if ((m->oflags & VPO_UNMANAGED) == 0) 3692 newpte |= PG_MANAGED; 3693 3694 rw_wlock(&pvh_global_lock); 3695 PMAP_LOCK(pmap); 3696 sched_pin(); 3697 if (psind == 1) { 3698 /* Assert the required virtual and physical alignment. */ 3699 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 3700 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 3701 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m); 3702 goto out; 3703 } 3704 3705 pde = pmap_pde(pmap, va); 3706 if (pmap != kernel_pmap) { 3707 /* 3708 * va is for UVA. 3709 * In the case that a page table page is not resident, 3710 * we are creating it here. pmap_allocpte() handles 3711 * demotion. 3712 */ 3713 mpte = pmap_allocpte(pmap, va, flags); 3714 if (mpte == NULL) { 3715 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3716 ("pmap_allocpte failed with sleep allowed")); 3717 rv = KERN_RESOURCE_SHORTAGE; 3718 goto out; 3719 } 3720 } else { 3721 /* 3722 * va is for KVA, so pmap_demote_pde() will never fail 3723 * to install a page table page. PG_V is also 3724 * asserted by pmap_demote_pde(). 3725 */ 3726 mpte = NULL; 3727 KASSERT(pde != NULL && (*pde & PG_V) != 0, 3728 ("KVA %#x invalid pde pdir %#jx", va, 3729 (uintmax_t)pmap->pm_pdir[PTDPTDI])); 3730 if ((*pde & PG_PS) != 0) 3731 pmap_demote_pde(pmap, pde, va); 3732 } 3733 pte = pmap_pte_quick(pmap, va); 3734 3735 /* 3736 * Page Directory table entry is not valid, which should not 3737 * happen. We should have either allocated the page table 3738 * page or demoted the existing mapping above. 3739 */ 3740 if (pte == NULL) { 3741 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3742 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3743 } 3744 3745 origpte = *pte; 3746 pv = NULL; 3747 3748 /* 3749 * Is the specified virtual address already mapped? 3750 */ 3751 if ((origpte & PG_V) != 0) { 3752 /* 3753 * Wiring change, just update stats. 
We don't worry about 3754 * wiring PT pages as they remain resident as long as there 3755 * are valid mappings in them. Hence, if a user page is wired, 3756 * the PT page will be also. 3757 */ 3758 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 3759 pmap->pm_stats.wired_count++; 3760 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 3761 pmap->pm_stats.wired_count--; 3762 3763 /* 3764 * Remove the extra PT page reference. 3765 */ 3766 if (mpte != NULL) { 3767 mpte->ref_count--; 3768 KASSERT(mpte->ref_count > 0, 3769 ("pmap_enter: missing reference to page table page," 3770 " va: 0x%x", va)); 3771 } 3772 3773 /* 3774 * Has the physical page changed? 3775 */ 3776 opa = origpte & PG_FRAME; 3777 if (opa == pa) { 3778 /* 3779 * No, might be a protection or wiring change. 3780 */ 3781 if ((origpte & PG_MANAGED) != 0 && 3782 (newpte & PG_RW) != 0) 3783 vm_page_aflag_set(m, PGA_WRITEABLE); 3784 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 3785 goto unchanged; 3786 goto validate; 3787 } 3788 3789 /* 3790 * The physical page has changed. Temporarily invalidate 3791 * the mapping. This ensures that all threads sharing the 3792 * pmap keep a consistent view of the mapping, which is 3793 * necessary for the correct handling of COW faults. It 3794 * also permits reuse of the old mapping's PV entry, 3795 * avoiding an allocation. 3796 * 3797 * For consistency, handle unmanaged mappings the same way. 3798 */ 3799 origpte = pte_load_clear(pte); 3800 KASSERT((origpte & PG_FRAME) == opa, 3801 ("pmap_enter: unexpected pa update for %#x", va)); 3802 if ((origpte & PG_MANAGED) != 0) { 3803 om = PHYS_TO_VM_PAGE(opa); 3804 3805 /* 3806 * The pmap lock is sufficient to synchronize with 3807 * concurrent calls to pmap_page_test_mappings() and 3808 * pmap_ts_referenced(). 3809 */ 3810 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3811 vm_page_dirty(om); 3812 if ((origpte & PG_A) != 0) { 3813 pmap_invalidate_page_int(pmap, va); 3814 vm_page_aflag_set(om, PGA_REFERENCED); 3815 } 3816 pv = pmap_pvh_remove(&om->md, pmap, va); 3817 KASSERT(pv != NULL, 3818 ("pmap_enter: no PV entry for %#x", va)); 3819 if ((newpte & PG_MANAGED) == 0) 3820 free_pv_entry(pmap, pv); 3821 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3822 TAILQ_EMPTY(&om->md.pv_list) && 3823 ((om->flags & PG_FICTITIOUS) != 0 || 3824 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3825 vm_page_aflag_clear(om, PGA_WRITEABLE); 3826 } else { 3827 /* 3828 * Since this mapping is unmanaged, assume that PG_A 3829 * is set. 3830 */ 3831 pmap_invalidate_page_int(pmap, va); 3832 } 3833 origpte = 0; 3834 } else { 3835 /* 3836 * Increment the counters. 3837 */ 3838 if ((newpte & PG_W) != 0) 3839 pmap->pm_stats.wired_count++; 3840 pmap->pm_stats.resident_count++; 3841 } 3842 3843 /* 3844 * Enter on the PV list if part of our managed memory. 3845 */ 3846 if ((newpte & PG_MANAGED) != 0) { 3847 if (pv == NULL) { 3848 pv = get_pv_entry(pmap, false); 3849 pv->pv_va = va; 3850 } 3851 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3852 if ((newpte & PG_RW) != 0) 3853 vm_page_aflag_set(m, PGA_WRITEABLE); 3854 } 3855 3856 /* 3857 * Update the PTE. 
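 * A still-valid old mapping is replaced with pte_load_store() so that
 * any PG_A/PG_M bits the MMU set in the meantime are observed before
 * being overwritten; a previously invalid slot is simply stored.  The
 * TLB only needs to be invalidated when the old PTE had PG_A set,
 * i.e., when it could have been cached by the TLB.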
3858 */ 3859 if ((origpte & PG_V) != 0) { 3860 validate: 3861 origpte = pte_load_store(pte, newpte); 3862 KASSERT((origpte & PG_FRAME) == pa, 3863 ("pmap_enter: unexpected pa update for %#x", va)); 3864 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 3865 (PG_M | PG_RW)) { 3866 if ((origpte & PG_MANAGED) != 0) 3867 vm_page_dirty(m); 3868 3869 /* 3870 * Although the PTE may still have PG_RW set, TLB 3871 * invalidation may nonetheless be required because 3872 * the PTE no longer has PG_M set. 3873 */ 3874 } 3875 #ifdef PMAP_PAE_COMP 3876 else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 3877 /* 3878 * This PTE change does not require TLB invalidation. 3879 */ 3880 goto unchanged; 3881 } 3882 #endif 3883 if ((origpte & PG_A) != 0) 3884 pmap_invalidate_page_int(pmap, va); 3885 } else 3886 pte_store_zero(pte, newpte); 3887 3888 unchanged: 3889 3890 #if VM_NRESERVLEVEL > 0 3891 /* 3892 * If both the page table page and the reservation are fully 3893 * populated, then attempt promotion. 3894 */ 3895 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 3896 (m->flags & PG_FICTITIOUS) == 0 && 3897 vm_reserv_level_iffullpop(m) == 0) 3898 (void)pmap_promote_pde(pmap, pde, va, mpte); 3899 #endif 3900 3901 rv = KERN_SUCCESS; 3902 out: 3903 sched_unpin(); 3904 rw_wunlock(&pvh_global_lock); 3905 PMAP_UNLOCK(pmap); 3906 return (rv); 3907 } 3908 3909 /* 3910 * Tries to create a read- and/or execute-only 2 or 4 MB page mapping. Returns 3911 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 3912 * value. See pmap_enter_pde() for the possible error values when "no sleep", 3913 * "no replace", and "no reclaim" are specified. 3914 */ 3915 static int 3916 pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3917 { 3918 pd_entry_t newpde; 3919 3920 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3921 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 3922 PG_PS | PG_V; 3923 if ((m->oflags & VPO_UNMANAGED) == 0) 3924 newpde |= PG_MANAGED; 3925 #ifdef PMAP_PAE_COMP 3926 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 3927 newpde |= pg_nx; 3928 #endif 3929 if (pmap != kernel_pmap) 3930 newpde |= PG_U; 3931 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 3932 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL)); 3933 } 3934 3935 /* 3936 * Returns true if every page table entry in the page table page that maps 3937 * the specified kernel virtual address is zero. 3938 */ 3939 static bool 3940 pmap_every_pte_zero(vm_offset_t va) 3941 { 3942 pt_entry_t *pt_end, *pte; 3943 3944 KASSERT((va & PDRMASK) == 0, ("va is misaligned")); 3945 pte = vtopte(va); 3946 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { 3947 if (*pte != 0) 3948 return (false); 3949 } 3950 return (true); 3951 } 3952 3953 /* 3954 * Tries to create the specified 2 or 4 MB page mapping. Returns KERN_SUCCESS 3955 * if the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, 3956 * or KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 3957 * PMAP_ENTER_NOREPLACE was specified and a 4 KB page mapping already exists 3958 * within the 2 or 4 MB virtual address range starting at the specified virtual 3959 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 3960 * 2 or 4 MB page mapping already exists at the specified virtual address. 3961 * Returns KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NORECLAIM was specified and a 3962 * PV entry allocation failed. 
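 *
 * The caller must hold the pmap lock and the global pv list lock; both
 * are asserted on entry.  When an existing mapping is replaced, any 4KB
 * mappings underneath it are removed first and, for user pmaps, their
 * page table page is freed.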
3963 * 3964 * The parameter "m" is only used when creating a managed, writeable mapping. 3965 */ 3966 static int 3967 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 3968 vm_page_t m) 3969 { 3970 struct spglist free; 3971 pd_entry_t oldpde, *pde; 3972 vm_page_t mt; 3973 vm_page_t uwptpg; 3974 3975 rw_assert(&pvh_global_lock, RA_WLOCKED); 3976 KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, 3977 ("pmap_enter_pde: newpde is missing PG_M")); 3978 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3979 pde = pmap_pde(pmap, va); 3980 oldpde = *pde; 3981 if ((oldpde & PG_V) != 0) { 3982 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3983 if ((oldpde & PG_PS) != 0) { 3984 CTR2(KTR_PMAP, 3985 "pmap_enter_pde: no space for va %#lx" 3986 " in pmap %p", va, pmap); 3987 return (KERN_NO_SPACE); 3988 } else if (pmap != kernel_pmap || 3989 !pmap_every_pte_zero(va)) { 3990 CTR2(KTR_PMAP, 3991 "pmap_enter_pde: failure for va %#lx" 3992 " in pmap %p", va, pmap); 3993 return (KERN_FAILURE); 3994 } 3995 } 3996 /* Break the existing mapping(s). */ 3997 SLIST_INIT(&free); 3998 if ((oldpde & PG_PS) != 0) { 3999 /* 4000 * If the PDE resulted from a promotion, then a 4001 * reserved PT page could be freed. 4002 */ 4003 (void)pmap_remove_pde(pmap, pde, va, &free); 4004 if ((oldpde & PG_G) == 0) 4005 pmap_invalidate_pde_page(pmap, va, oldpde); 4006 } else { 4007 if (pmap_remove_ptes(pmap, va, va + NBPDR, &free)) 4008 pmap_invalidate_all_int(pmap); 4009 } 4010 if (pmap != kernel_pmap) { 4011 vm_page_free_pages_toq(&free, true); 4012 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 4013 pde)); 4014 } else { 4015 KASSERT(SLIST_EMPTY(&free), 4016 ("pmap_enter_pde: freed kernel page table page")); 4017 4018 /* 4019 * Both pmap_remove_pde() and pmap_remove_ptes() will 4020 * leave the kernel page table page zero filled. 4021 */ 4022 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4023 if (pmap_insert_pt_page(pmap, mt, false, false)) 4024 panic("pmap_enter_pde: trie insert failed"); 4025 } 4026 } 4027 4028 /* 4029 * Allocate a leaf ptpage for wired userspace pages. 4030 */ 4031 uwptpg = NULL; 4032 if ((newpde & PG_W) != 0 && pmap != kernel_pmap) { 4033 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); 4034 if (uwptpg == NULL) { 4035 return (KERN_RESOURCE_SHORTAGE); 4036 } 4037 uwptpg->pindex = va >> PDRSHIFT; 4038 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 4039 vm_page_unwire_noq(uwptpg); 4040 vm_page_free(uwptpg); 4041 return (KERN_RESOURCE_SHORTAGE); 4042 } 4043 pmap->pm_stats.resident_count++; 4044 uwptpg->ref_count = NPTEPG; 4045 } 4046 if ((newpde & PG_MANAGED) != 0) { 4047 /* 4048 * Abort this mapping if its PV entry could not be created. 4049 */ 4050 if (!pmap_pv_insert_pde(pmap, va, newpde, flags)) { 4051 if (uwptpg != NULL) { 4052 mt = pmap_remove_pt_page(pmap, va); 4053 KASSERT(mt == uwptpg, 4054 ("removed pt page %p, expected %p", mt, 4055 uwptpg)); 4056 pmap->pm_stats.resident_count--; 4057 uwptpg->ref_count = 1; 4058 vm_page_unwire_noq(uwptpg); 4059 vm_page_free(uwptpg); 4060 } 4061 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4062 " in pmap %p", va, pmap); 4063 return (KERN_RESOURCE_SHORTAGE); 4064 } 4065 if ((newpde & PG_RW) != 0) { 4066 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4067 vm_page_aflag_set(mt, PGA_WRITEABLE); 4068 } 4069 } 4070 4071 /* 4072 * Increment counters. 4073 */ 4074 if ((newpde & PG_W) != 0) 4075 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 4076 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 4077 4078 /* 4079 * Map the superpage. 
(This is not a promoted mapping; there will not 4080 * be any lingering 4KB page mappings in the TLB.) 4081 */ 4082 pde_store(pde, newpde); 4083 4084 pmap_pde_mappings++; 4085 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", 4086 va, pmap); 4087 return (KERN_SUCCESS); 4088 } 4089 4090 /* 4091 * Maps a sequence of resident pages belonging to the same object. 4092 * The sequence begins with the given page m_start. This page is 4093 * mapped at the given virtual address start. Each subsequent page is 4094 * mapped at a virtual address that is offset from start by the same 4095 * amount as the page is offset from m_start within the object. The 4096 * last page in the sequence is the page with the largest offset from 4097 * m_start that can be mapped at a virtual address less than the given 4098 * virtual address end. Not every virtual page between start and end 4099 * is mapped; only those for which a resident page exists with the 4100 * corresponding offset from m_start are mapped. 4101 */ 4102 static void 4103 __CONCAT(PMTYPE, enter_object)(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4104 vm_page_t m_start, vm_prot_t prot) 4105 { 4106 vm_offset_t va; 4107 vm_page_t m, mpte; 4108 vm_pindex_t diff, psize; 4109 int rv; 4110 4111 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4112 4113 psize = atop(end - start); 4114 mpte = NULL; 4115 m = m_start; 4116 rw_wlock(&pvh_global_lock); 4117 PMAP_LOCK(pmap); 4118 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4119 va = start + ptoa(diff); 4120 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 4121 m->psind == 1 && pg_ps_enabled && 4122 ((rv = pmap_enter_4mpage(pmap, va, m, prot)) == 4123 KERN_SUCCESS || rv == KERN_NO_SPACE)) 4124 m = &m[NBPDR / PAGE_SIZE - 1]; 4125 else 4126 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 4127 mpte); 4128 m = TAILQ_NEXT(m, listq); 4129 } 4130 rw_wunlock(&pvh_global_lock); 4131 PMAP_UNLOCK(pmap); 4132 } 4133 4134 /* 4135 * this code makes some *MAJOR* assumptions: 4136 * 1. Current pmap & pmap exists. 4137 * 2. Not wired. 4138 * 3. Read access. 4139 * 4. No page table pages. 4140 * but is *MUCH* faster than pmap_enter... 4141 */ 4142 4143 static void 4144 __CONCAT(PMTYPE, enter_quick)(pmap_t pmap, vm_offset_t va, vm_page_t m, 4145 vm_prot_t prot) 4146 { 4147 4148 rw_wlock(&pvh_global_lock); 4149 PMAP_LOCK(pmap); 4150 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 4151 rw_wunlock(&pvh_global_lock); 4152 PMAP_UNLOCK(pmap); 4153 } 4154 4155 static vm_page_t 4156 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4157 vm_prot_t prot, vm_page_t mpte) 4158 { 4159 pt_entry_t newpte, *pte; 4160 pd_entry_t *pde; 4161 4162 KASSERT(pmap != kernel_pmap || !VA_IS_CLEANMAP(va) || 4163 (m->oflags & VPO_UNMANAGED) != 0, 4164 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4165 rw_assert(&pvh_global_lock, RA_WLOCKED); 4166 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4167 pde = NULL; 4168 4169 /* 4170 * In the case that a page table page is not 4171 * resident, we are creating it here. 4172 */ 4173 if (pmap != kernel_pmap) { 4174 u_int ptepindex; 4175 pd_entry_t ptepa; 4176 4177 /* 4178 * Calculate pagetable page index 4179 */ 4180 ptepindex = va >> PDRSHIFT; 4181 if (mpte && (mpte->pindex == ptepindex)) { 4182 mpte->ref_count++; 4183 } else { 4184 /* 4185 * Get the page directory entry 4186 */ 4187 pde = &pmap->pm_pdir[ptepindex]; 4188 ptepa = *pde; 4189 4190 /* 4191 * If the page table page is mapped, we just increment 4192 * the hold count, and activate it. 
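 * If the directory entry maps a 2/4MB superpage instead, the attempt
 * is abandoned and NULL is returned; this "quick" path never demotes
 * an existing superpage.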
4193 */ 4194 if (ptepa) { 4195 if (ptepa & PG_PS) 4196 return (NULL); 4197 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 4198 mpte->ref_count++; 4199 } else { 4200 mpte = _pmap_allocpte(pmap, ptepindex, 4201 PMAP_ENTER_NOSLEEP); 4202 if (mpte == NULL) 4203 return (mpte); 4204 } 4205 } 4206 } else { 4207 mpte = NULL; 4208 } 4209 4210 sched_pin(); 4211 pte = pmap_pte_quick(pmap, va); 4212 if (*pte) { 4213 if (mpte != NULL) 4214 mpte->ref_count--; 4215 sched_unpin(); 4216 return (NULL); 4217 } 4218 4219 /* 4220 * Enter on the PV list if part of our managed memory. 4221 */ 4222 if ((m->oflags & VPO_UNMANAGED) == 0 && 4223 !pmap_try_insert_pv_entry(pmap, va, m)) { 4224 if (mpte != NULL) 4225 pmap_abort_ptp(pmap, va, mpte); 4226 sched_unpin(); 4227 return (NULL); 4228 } 4229 4230 /* 4231 * Increment counters 4232 */ 4233 pmap->pm_stats.resident_count++; 4234 4235 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 4236 pmap_cache_bits(pmap, m->md.pat_mode, 0); 4237 if ((m->oflags & VPO_UNMANAGED) == 0) 4238 newpte |= PG_MANAGED; 4239 #ifdef PMAP_PAE_COMP 4240 if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) 4241 newpte |= pg_nx; 4242 #endif 4243 if (pmap != kernel_pmap) 4244 newpte |= PG_U; 4245 pte_store_zero(pte, newpte); 4246 4247 #if VM_NRESERVLEVEL > 0 4248 /* 4249 * If both the PTP and the reservation are fully populated, then 4250 * attempt promotion. 4251 */ 4252 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 4253 (m->flags & PG_FICTITIOUS) == 0 && 4254 vm_reserv_level_iffullpop(m) == 0) { 4255 if (pde == NULL) 4256 pde = pmap_pde(pmap, va); 4257 4258 /* 4259 * If promotion succeeds, then the next call to this function 4260 * should not be given the unmapped PTP as a hint. 4261 */ 4262 if (pmap_promote_pde(pmap, pde, va, mpte)) 4263 mpte = NULL; 4264 } 4265 #endif 4266 4267 sched_unpin(); 4268 return (mpte); 4269 } 4270 4271 /* 4272 * Make a temporary mapping for a physical address. This is only intended 4273 * to be used for panic dumps. 4274 */ 4275 static void * 4276 __CONCAT(PMTYPE, kenter_temporary)(vm_paddr_t pa, int i) 4277 { 4278 vm_offset_t va; 4279 4280 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 4281 pmap_kenter(va, pa); 4282 invlpg(va); 4283 return ((void *)crashdumpmap); 4284 } 4285 4286 /* 4287 * This code maps large physical mmap regions into the 4288 * processor address space. Note that some shortcuts 4289 * are taken, but the code works. 4290 */ 4291 static void 4292 __CONCAT(PMTYPE, object_init_pt)(pmap_t pmap, vm_offset_t addr, 4293 vm_object_t object, vm_pindex_t pindex, vm_size_t size) 4294 { 4295 pd_entry_t *pde; 4296 vm_paddr_t pa, ptepa; 4297 vm_page_t p; 4298 int pat_mode; 4299 4300 VM_OBJECT_ASSERT_WLOCKED(object); 4301 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4302 ("pmap_object_init_pt: non-device object")); 4303 if (pg_ps_enabled && 4304 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 4305 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4306 return; 4307 p = vm_page_lookup(object, pindex); 4308 KASSERT(vm_page_all_valid(p), 4309 ("pmap_object_init_pt: invalid page %p", p)); 4310 pat_mode = p->md.pat_mode; 4311 4312 /* 4313 * Abort the mapping if the first page is not physically 4314 * aligned to a 2/4MB page boundary. 4315 */ 4316 ptepa = VM_PAGE_TO_PHYS(p); 4317 if (ptepa & (NBPDR - 1)) 4318 return; 4319 4320 /* 4321 * Skip the first page. Abort the mapping if the rest of 4322 * the pages are not physically contiguous or have differing 4323 * memory attributes. 
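 * Only then is it safe to cover the whole range with 2/4MB page
 * directory entries carrying a single PAT setting.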
4324 */ 4325 p = TAILQ_NEXT(p, listq); 4326 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4327 pa += PAGE_SIZE) { 4328 KASSERT(vm_page_all_valid(p), 4329 ("pmap_object_init_pt: invalid page %p", p)); 4330 if (pa != VM_PAGE_TO_PHYS(p) || 4331 pat_mode != p->md.pat_mode) 4332 return; 4333 p = TAILQ_NEXT(p, listq); 4334 } 4335 4336 /* 4337 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 4338 * "size" is a multiple of 2/4M, adding the PAT setting to 4339 * "pa" will not affect the termination of this loop. 4340 */ 4341 PMAP_LOCK(pmap); 4342 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 4343 pa < ptepa + size; pa += NBPDR) { 4344 pde = pmap_pde(pmap, addr); 4345 if (*pde == 0) { 4346 pde_store(pde, pa | PG_PS | PG_M | PG_A | 4347 PG_U | PG_RW | PG_V); 4348 pmap->pm_stats.resident_count += NBPDR / 4349 PAGE_SIZE; 4350 pmap_pde_mappings++; 4351 } 4352 /* Else continue on if the PDE is already valid. */ 4353 addr += NBPDR; 4354 } 4355 PMAP_UNLOCK(pmap); 4356 } 4357 } 4358 4359 /* 4360 * Clear the wired attribute from the mappings for the specified range of 4361 * addresses in the given pmap. Every valid mapping within that range 4362 * must have the wired attribute set. In contrast, invalid mappings 4363 * cannot have the wired attribute set, so they are ignored. 4364 * 4365 * The wired attribute of the page table entry is not a hardware feature, 4366 * so there is no need to invalidate any TLB entries. 4367 */ 4368 static void 4369 __CONCAT(PMTYPE, unwire)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4370 { 4371 vm_offset_t pdnxt; 4372 pd_entry_t *pde; 4373 pt_entry_t *pte; 4374 bool pv_lists_locked; 4375 4376 if (pmap_is_current(pmap)) 4377 pv_lists_locked = false; 4378 else { 4379 pv_lists_locked = true; 4380 resume: 4381 rw_wlock(&pvh_global_lock); 4382 sched_pin(); 4383 } 4384 PMAP_LOCK(pmap); 4385 for (; sva < eva; sva = pdnxt) { 4386 pdnxt = (sva + NBPDR) & ~PDRMASK; 4387 if (pdnxt < sva) 4388 pdnxt = eva; 4389 pde = pmap_pde(pmap, sva); 4390 if ((*pde & PG_V) == 0) 4391 continue; 4392 if ((*pde & PG_PS) != 0) { 4393 if ((*pde & PG_W) == 0) 4394 panic("pmap_unwire: pde %#jx is missing PG_W", 4395 (uintmax_t)*pde); 4396 4397 /* 4398 * Are we unwiring the entire large page? If not, 4399 * demote the mapping and fall through. 4400 */ 4401 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 4402 /* 4403 * Regardless of whether a pde (or pte) is 32 4404 * or 64 bits in size, PG_W is among the least 4405 * significant 32 bits. 4406 */ 4407 atomic_clear_int((u_int *)pde, PG_W); 4408 pmap->pm_stats.wired_count -= NBPDR / 4409 PAGE_SIZE; 4410 continue; 4411 } else { 4412 if (!pv_lists_locked) { 4413 pv_lists_locked = true; 4414 if (!rw_try_wlock(&pvh_global_lock)) { 4415 PMAP_UNLOCK(pmap); 4416 /* Repeat sva. */ 4417 goto resume; 4418 } 4419 sched_pin(); 4420 } 4421 if (!pmap_demote_pde(pmap, pde, sva)) 4422 panic("pmap_unwire: demotion failed"); 4423 } 4424 } 4425 if (pdnxt > eva) 4426 pdnxt = eva; 4427 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 4428 sva += PAGE_SIZE) { 4429 if ((*pte & PG_V) == 0) 4430 continue; 4431 if ((*pte & PG_W) == 0) 4432 panic("pmap_unwire: pte %#jx is missing PG_W", 4433 (uintmax_t)*pte); 4434 4435 /* 4436 * PG_W must be cleared atomically. Although the pmap 4437 * lock synchronizes access to PG_W, another processor 4438 * could be setting PG_M and/or PG_A concurrently. 4439 * 4440 * PG_W is among the least significant 32 bits. 
4441 */ 4442 atomic_clear_int((u_int *)pte, PG_W); 4443 pmap->pm_stats.wired_count--; 4444 } 4445 } 4446 if (pv_lists_locked) { 4447 sched_unpin(); 4448 rw_wunlock(&pvh_global_lock); 4449 } 4450 PMAP_UNLOCK(pmap); 4451 } 4452 4453 /* 4454 * Copy the range specified by src_addr/len 4455 * from the source map to the range dst_addr/len 4456 * in the destination map. 4457 * 4458 * This routine is only advisory and need not do anything. Since 4459 * current pmap is always the kernel pmap when executing in 4460 * kernel, and we do not copy from the kernel pmap to a user 4461 * pmap, this optimization is not usable in 4/4G full split i386 4462 * world. 4463 */ 4464 4465 static void 4466 __CONCAT(PMTYPE, copy)(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 4467 vm_size_t len, vm_offset_t src_addr) 4468 { 4469 pt_entry_t *src_pte, *dst_pte, ptetemp; 4470 pd_entry_t srcptepaddr; 4471 vm_page_t dstmpte, srcmpte; 4472 vm_offset_t addr, end_addr, pdnxt; 4473 u_int ptepindex; 4474 4475 if (dst_addr != src_addr) 4476 return; 4477 4478 end_addr = src_addr + len; 4479 4480 rw_wlock(&pvh_global_lock); 4481 if (dst_pmap < src_pmap) { 4482 PMAP_LOCK(dst_pmap); 4483 PMAP_LOCK(src_pmap); 4484 } else { 4485 PMAP_LOCK(src_pmap); 4486 PMAP_LOCK(dst_pmap); 4487 } 4488 sched_pin(); 4489 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 4490 KASSERT(addr < PMAP_TRM_MIN_ADDRESS, 4491 ("pmap_copy: invalid to pmap_copy the trampoline")); 4492 4493 pdnxt = (addr + NBPDR) & ~PDRMASK; 4494 if (pdnxt < addr) 4495 pdnxt = end_addr; 4496 ptepindex = addr >> PDRSHIFT; 4497 4498 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 4499 if (srcptepaddr == 0) 4500 continue; 4501 4502 if (srcptepaddr & PG_PS) { 4503 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4504 continue; 4505 if (dst_pmap->pm_pdir[ptepindex] == 0 && 4506 ((srcptepaddr & PG_MANAGED) == 0 || 4507 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 4508 PMAP_ENTER_NORECLAIM))) { 4509 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 4510 ~PG_W; 4511 dst_pmap->pm_stats.resident_count += 4512 NBPDR / PAGE_SIZE; 4513 pmap_pde_mappings++; 4514 } 4515 continue; 4516 } 4517 4518 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 4519 KASSERT(srcmpte->ref_count > 0, 4520 ("pmap_copy: source page table page is unused")); 4521 4522 if (pdnxt > end_addr) 4523 pdnxt = end_addr; 4524 4525 src_pte = pmap_pte_quick3(src_pmap, addr); 4526 while (addr < pdnxt) { 4527 ptetemp = *src_pte; 4528 /* 4529 * we only virtual copy managed pages 4530 */ 4531 if ((ptetemp & PG_MANAGED) != 0) { 4532 dstmpte = pmap_allocpte(dst_pmap, addr, 4533 PMAP_ENTER_NOSLEEP); 4534 if (dstmpte == NULL) 4535 goto out; 4536 dst_pte = pmap_pte_quick(dst_pmap, addr); 4537 if (*dst_pte == 0 && 4538 pmap_try_insert_pv_entry(dst_pmap, addr, 4539 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 4540 /* 4541 * Clear the wired, modified, and 4542 * accessed (referenced) bits 4543 * during the copy. 4544 */ 4545 *dst_pte = ptetemp & ~(PG_W | PG_M | 4546 PG_A); 4547 dst_pmap->pm_stats.resident_count++; 4548 } else { 4549 pmap_abort_ptp(dst_pmap, addr, dstmpte); 4550 goto out; 4551 } 4552 if (dstmpte->ref_count >= srcmpte->ref_count) 4553 break; 4554 } 4555 addr += PAGE_SIZE; 4556 src_pte++; 4557 } 4558 } 4559 out: 4560 sched_unpin(); 4561 rw_wunlock(&pvh_global_lock); 4562 PMAP_UNLOCK(src_pmap); 4563 PMAP_UNLOCK(dst_pmap); 4564 } 4565 4566 /* 4567 * Zero 1 page of virtual memory mapped from a hardware page by the caller. 
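 *
 * The implementation is chosen at run time: sse2_pagezero() when a
 * 686-class CPU advertises SSE2, i686_pagezero() on other 686-class
 * CPUs, and plain bzero() otherwise.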
4568 */ 4569 static __inline void 4570 pagezero(void *page) 4571 { 4572 #if defined(I686_CPU) 4573 if (cpu_class == CPUCLASS_686) { 4574 if (cpu_feature & CPUID_SSE2) 4575 sse2_pagezero(page); 4576 else 4577 i686_pagezero(page); 4578 } else 4579 #endif 4580 bzero(page, PAGE_SIZE); 4581 } 4582 4583 /* 4584 * Zero the specified hardware page. 4585 */ 4586 static void 4587 __CONCAT(PMTYPE, zero_page)(vm_page_t m) 4588 { 4589 pt_entry_t *cmap_pte2; 4590 struct pcpu *pc; 4591 4592 sched_pin(); 4593 pc = get_pcpu(); 4594 cmap_pte2 = pc->pc_cmap_pte2; 4595 mtx_lock(&pc->pc_cmap_lock); 4596 if (*cmap_pte2) 4597 panic("pmap_zero_page: CMAP2 busy"); 4598 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4599 pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 4600 invlcaddr(pc->pc_cmap_addr2); 4601 pagezero(pc->pc_cmap_addr2); 4602 *cmap_pte2 = 0; 4603 4604 /* 4605 * Unpin the thread before releasing the lock. Otherwise the thread 4606 * could be rescheduled while still bound to the current CPU, only 4607 * to unpin itself immediately upon resuming execution. 4608 */ 4609 sched_unpin(); 4610 mtx_unlock(&pc->pc_cmap_lock); 4611 } 4612 4613 /* 4614 * Zero an area within a single hardware page. off and size must not 4615 * cover an area beyond a single hardware page. 4616 */ 4617 static void 4618 __CONCAT(PMTYPE, zero_page_area)(vm_page_t m, int off, int size) 4619 { 4620 pt_entry_t *cmap_pte2; 4621 struct pcpu *pc; 4622 4623 sched_pin(); 4624 pc = get_pcpu(); 4625 cmap_pte2 = pc->pc_cmap_pte2; 4626 mtx_lock(&pc->pc_cmap_lock); 4627 if (*cmap_pte2) 4628 panic("pmap_zero_page_area: CMAP2 busy"); 4629 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4630 pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 4631 invlcaddr(pc->pc_cmap_addr2); 4632 if (off == 0 && size == PAGE_SIZE) 4633 pagezero(pc->pc_cmap_addr2); 4634 else 4635 bzero(pc->pc_cmap_addr2 + off, size); 4636 *cmap_pte2 = 0; 4637 sched_unpin(); 4638 mtx_unlock(&pc->pc_cmap_lock); 4639 } 4640 4641 /* 4642 * Copy 1 specified hardware page to another. 
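 *
 * The copy goes through the per-CPU CMAP1 (source, read-only) and
 * CMAP2 (destination, writable) transient kernel mappings, with the
 * thread pinned to its CPU for the duration, so no permanent KVA is
 * consumed.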
4643 */ 4644 static void 4645 __CONCAT(PMTYPE, copy_page)(vm_page_t src, vm_page_t dst) 4646 { 4647 pt_entry_t *cmap_pte1, *cmap_pte2; 4648 struct pcpu *pc; 4649 4650 sched_pin(); 4651 pc = get_pcpu(); 4652 cmap_pte1 = pc->pc_cmap_pte1; 4653 cmap_pte2 = pc->pc_cmap_pte2; 4654 mtx_lock(&pc->pc_cmap_lock); 4655 if (*cmap_pte1) 4656 panic("pmap_copy_page: CMAP1 busy"); 4657 if (*cmap_pte2) 4658 panic("pmap_copy_page: CMAP2 busy"); 4659 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4660 pmap_cache_bits(kernel_pmap, src->md.pat_mode, 0); 4661 invlcaddr(pc->pc_cmap_addr1); 4662 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4663 pmap_cache_bits(kernel_pmap, dst->md.pat_mode, 0); 4664 invlcaddr(pc->pc_cmap_addr2); 4665 bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); 4666 *cmap_pte1 = 0; 4667 *cmap_pte2 = 0; 4668 sched_unpin(); 4669 mtx_unlock(&pc->pc_cmap_lock); 4670 } 4671 4672 static void 4673 __CONCAT(PMTYPE, copy_pages)(vm_page_t ma[], vm_offset_t a_offset, 4674 vm_page_t mb[], vm_offset_t b_offset, int xfersize) 4675 { 4676 vm_page_t a_pg, b_pg; 4677 char *a_cp, *b_cp; 4678 vm_offset_t a_pg_offset, b_pg_offset; 4679 pt_entry_t *cmap_pte1, *cmap_pte2; 4680 struct pcpu *pc; 4681 int cnt; 4682 4683 sched_pin(); 4684 pc = get_pcpu(); 4685 cmap_pte1 = pc->pc_cmap_pte1; 4686 cmap_pte2 = pc->pc_cmap_pte2; 4687 mtx_lock(&pc->pc_cmap_lock); 4688 if (*cmap_pte1 != 0) 4689 panic("pmap_copy_pages: CMAP1 busy"); 4690 if (*cmap_pte2 != 0) 4691 panic("pmap_copy_pages: CMAP2 busy"); 4692 while (xfersize > 0) { 4693 a_pg = ma[a_offset >> PAGE_SHIFT]; 4694 a_pg_offset = a_offset & PAGE_MASK; 4695 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4696 b_pg = mb[b_offset >> PAGE_SHIFT]; 4697 b_pg_offset = b_offset & PAGE_MASK; 4698 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4699 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | 4700 pmap_cache_bits(kernel_pmap, a_pg->md.pat_mode, 0); 4701 invlcaddr(pc->pc_cmap_addr1); 4702 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | 4703 PG_M | pmap_cache_bits(kernel_pmap, b_pg->md.pat_mode, 0); 4704 invlcaddr(pc->pc_cmap_addr2); 4705 a_cp = pc->pc_cmap_addr1 + a_pg_offset; 4706 b_cp = pc->pc_cmap_addr2 + b_pg_offset; 4707 bcopy(a_cp, b_cp, cnt); 4708 a_offset += cnt; 4709 b_offset += cnt; 4710 xfersize -= cnt; 4711 } 4712 *cmap_pte1 = 0; 4713 *cmap_pte2 = 0; 4714 sched_unpin(); 4715 mtx_unlock(&pc->pc_cmap_lock); 4716 } 4717 4718 /* 4719 * Returns true if the pmap's pv is one of the first 4720 * 16 pvs linked to from this page. This count may 4721 * be changed upwards or downwards in the future; it 4722 * is only necessary that true be returned for a small 4723 * subset of pmaps for proper page aging. 
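 *
 * Both 4KB mappings (the page's own pv list) and 2/4MB mappings (the
 * pv list of the containing superpage) are scanned, and the 16-entry
 * limit is applied across the two lists combined.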
4724 */ 4725 static bool 4726 __CONCAT(PMTYPE, page_exists_quick)(pmap_t pmap, vm_page_t m) 4727 { 4728 struct md_page *pvh; 4729 pv_entry_t pv; 4730 int loops = 0; 4731 bool rv; 4732 4733 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4734 ("pmap_page_exists_quick: page %p is not managed", m)); 4735 rv = false; 4736 rw_wlock(&pvh_global_lock); 4737 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4738 if (PV_PMAP(pv) == pmap) { 4739 rv = true; 4740 break; 4741 } 4742 loops++; 4743 if (loops >= 16) 4744 break; 4745 } 4746 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4747 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4748 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4749 if (PV_PMAP(pv) == pmap) { 4750 rv = true; 4751 break; 4752 } 4753 loops++; 4754 if (loops >= 16) 4755 break; 4756 } 4757 } 4758 rw_wunlock(&pvh_global_lock); 4759 return (rv); 4760 } 4761 4762 /* 4763 * pmap_page_wired_mappings: 4764 * 4765 * Return the number of managed mappings to the given physical page 4766 * that are wired. 4767 */ 4768 static int 4769 __CONCAT(PMTYPE, page_wired_mappings)(vm_page_t m) 4770 { 4771 int count; 4772 4773 count = 0; 4774 if ((m->oflags & VPO_UNMANAGED) != 0) 4775 return (count); 4776 rw_wlock(&pvh_global_lock); 4777 count = pmap_pvh_wired_mappings(&m->md, count); 4778 if ((m->flags & PG_FICTITIOUS) == 0) { 4779 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 4780 count); 4781 } 4782 rw_wunlock(&pvh_global_lock); 4783 return (count); 4784 } 4785 4786 /* 4787 * pmap_pvh_wired_mappings: 4788 * 4789 * Return the updated number "count" of managed mappings that are wired. 4790 */ 4791 static int 4792 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4793 { 4794 pmap_t pmap; 4795 pt_entry_t *pte; 4796 pv_entry_t pv; 4797 4798 rw_assert(&pvh_global_lock, RA_WLOCKED); 4799 sched_pin(); 4800 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4801 pmap = PV_PMAP(pv); 4802 PMAP_LOCK(pmap); 4803 pte = pmap_pte_quick(pmap, pv->pv_va); 4804 if ((*pte & PG_W) != 0) 4805 count++; 4806 PMAP_UNLOCK(pmap); 4807 } 4808 sched_unpin(); 4809 return (count); 4810 } 4811 4812 /* 4813 * Returns true if the given page is mapped individually or as part of 4814 * a 4mpage. Otherwise, returns false. 4815 */ 4816 static bool 4817 __CONCAT(PMTYPE, page_is_mapped)(vm_page_t m) 4818 { 4819 bool rv; 4820 4821 if ((m->oflags & VPO_UNMANAGED) != 0) 4822 return (false); 4823 rw_wlock(&pvh_global_lock); 4824 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4825 ((m->flags & PG_FICTITIOUS) == 0 && 4826 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4827 rw_wunlock(&pvh_global_lock); 4828 return (rv); 4829 } 4830 4831 /* 4832 * Remove all pages from specified address space 4833 * this aids process exit speeds. Also, this code 4834 * is special cased for current process only, but 4835 * can have the more generic (and slightly slower) 4836 * mode enabled. This is much faster than pmap_remove 4837 * in the case of running down an entire address space. 
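 *
 * Rather than walking the page tables, this routine iterates over the
 * pmap's pv chunks, skips wired mappings, and defers TLB invalidation
 * to a single pmap_invalidate_all_int() once everything has been torn
 * down.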
4838 */ 4839 static void 4840 __CONCAT(PMTYPE, remove_pages)(pmap_t pmap) 4841 { 4842 pt_entry_t *pte, tpte; 4843 vm_page_t m, mpte, mt; 4844 pv_entry_t pv; 4845 struct md_page *pvh; 4846 struct pv_chunk *pc, *npc; 4847 struct spglist free; 4848 int field, idx; 4849 int32_t bit; 4850 uint32_t inuse, bitmask; 4851 int allfree; 4852 4853 if (pmap != PCPU_GET(curpmap)) { 4854 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4855 return; 4856 } 4857 SLIST_INIT(&free); 4858 rw_wlock(&pvh_global_lock); 4859 PMAP_LOCK(pmap); 4860 sched_pin(); 4861 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4862 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, 4863 pc->pc_pmap)); 4864 allfree = 1; 4865 for (field = 0; field < _NPCM; field++) { 4866 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4867 while (inuse != 0) { 4868 bit = bsfl(inuse); 4869 bitmask = 1UL << bit; 4870 idx = field * 32 + bit; 4871 pv = &pc->pc_pventry[idx]; 4872 inuse &= ~bitmask; 4873 4874 pte = pmap_pde(pmap, pv->pv_va); 4875 tpte = *pte; 4876 if ((tpte & PG_PS) == 0) { 4877 pte = pmap_pte_quick(pmap, pv->pv_va); 4878 tpte = *pte & ~PG_PTE_PAT; 4879 } 4880 4881 if (tpte == 0) { 4882 printf( 4883 "TPTE at %p IS ZERO @ VA %08x\n", 4884 pte, pv->pv_va); 4885 panic("bad pte"); 4886 } 4887 4888 /* 4889 * We cannot remove wired pages from a process' mapping at this time 4890 */ 4891 if (tpte & PG_W) { 4892 allfree = 0; 4893 continue; 4894 } 4895 4896 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4897 KASSERT(m->phys_addr == (tpte & PG_FRAME), 4898 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4899 m, (uintmax_t)m->phys_addr, 4900 (uintmax_t)tpte)); 4901 4902 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4903 m < &vm_page_array[vm_page_array_size], 4904 ("pmap_remove_pages: bad tpte %#jx", 4905 (uintmax_t)tpte)); 4906 4907 pte_clear(pte); 4908 4909 /* 4910 * Update the vm_page_t clean/reference bits. 
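 * (For a 2/4MB mapping the dirty bit applies to every constituent 4KB
 * page, so each of them is marked dirty.)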
4911 */ 4912 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4913 if ((tpte & PG_PS) != 0) { 4914 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4915 vm_page_dirty(mt); 4916 } else 4917 vm_page_dirty(m); 4918 } 4919 4920 /* Mark free */ 4921 PV_STAT(pv_entry_frees++); 4922 PV_STAT(pv_entry_spare++); 4923 pv_entry_count--; 4924 pc->pc_map[field] |= bitmask; 4925 if ((tpte & PG_PS) != 0) { 4926 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 4927 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4928 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4929 if (TAILQ_EMPTY(&pvh->pv_list)) { 4930 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4931 if (TAILQ_EMPTY(&mt->md.pv_list)) 4932 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4933 } 4934 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 4935 if (mpte != NULL) { 4936 KASSERT(vm_page_any_valid(mpte), 4937 ("pmap_remove_pages: pte page not promoted")); 4938 pmap->pm_stats.resident_count--; 4939 KASSERT(mpte->ref_count == NPTEPG, 4940 ("pmap_remove_pages: pte page ref count error")); 4941 mpte->ref_count = 0; 4942 pmap_add_delayed_free_list(mpte, &free, false); 4943 } 4944 } else { 4945 pmap->pm_stats.resident_count--; 4946 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4947 if (TAILQ_EMPTY(&m->md.pv_list) && 4948 (m->flags & PG_FICTITIOUS) == 0) { 4949 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4950 if (TAILQ_EMPTY(&pvh->pv_list)) 4951 vm_page_aflag_clear(m, PGA_WRITEABLE); 4952 } 4953 pmap_unuse_pt(pmap, pv->pv_va, &free); 4954 } 4955 } 4956 } 4957 if (allfree) { 4958 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4959 free_pv_chunk(pc); 4960 } 4961 } 4962 sched_unpin(); 4963 pmap_invalidate_all_int(pmap); 4964 rw_wunlock(&pvh_global_lock); 4965 PMAP_UNLOCK(pmap); 4966 vm_page_free_pages_toq(&free, true); 4967 } 4968 4969 /* 4970 * pmap_is_modified: 4971 * 4972 * Return whether or not the specified physical page was modified 4973 * in any physical maps. 4974 */ 4975 static bool 4976 __CONCAT(PMTYPE, is_modified)(vm_page_t m) 4977 { 4978 bool rv; 4979 4980 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4981 ("pmap_is_modified: page %p is not managed", m)); 4982 4983 /* 4984 * If the page is not busied then this check is racy. 4985 */ 4986 if (!pmap_page_is_write_mapped(m)) 4987 return (false); 4988 rw_wlock(&pvh_global_lock); 4989 rv = pmap_is_modified_pvh(&m->md) || 4990 ((m->flags & PG_FICTITIOUS) == 0 && 4991 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4992 rw_wunlock(&pvh_global_lock); 4993 return (rv); 4994 } 4995 4996 /* 4997 * Returns true if any of the given mappings were used to modify 4998 * physical memory. Otherwise, returns false. Both page and 2mpage 4999 * mappings are supported. 5000 */ 5001 static bool 5002 pmap_is_modified_pvh(struct md_page *pvh) 5003 { 5004 pv_entry_t pv; 5005 pt_entry_t *pte; 5006 pmap_t pmap; 5007 bool rv; 5008 5009 rw_assert(&pvh_global_lock, RA_WLOCKED); 5010 rv = false; 5011 sched_pin(); 5012 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5013 pmap = PV_PMAP(pv); 5014 PMAP_LOCK(pmap); 5015 pte = pmap_pte_quick(pmap, pv->pv_va); 5016 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); 5017 PMAP_UNLOCK(pmap); 5018 if (rv) 5019 break; 5020 } 5021 sched_unpin(); 5022 return (rv); 5023 } 5024 5025 /* 5026 * pmap_is_prefaultable: 5027 * 5028 * Return whether or not the specified virtual address is elgible 5029 * for prefault. 
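 * In other words, true is returned only when the page directory entry
 * for the address is valid, is not a 2/4MB superpage, and no 4KB
 * mapping currently exists at the address.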
5030 */ 5031 static bool 5032 __CONCAT(PMTYPE, is_prefaultable)(pmap_t pmap, vm_offset_t addr) 5033 { 5034 pd_entry_t pde; 5035 bool rv; 5036 5037 rv = false; 5038 PMAP_LOCK(pmap); 5039 pde = *pmap_pde(pmap, addr); 5040 if (pde != 0 && (pde & PG_PS) == 0) 5041 rv = pmap_pte_ufast(pmap, addr, pde) == 0; 5042 PMAP_UNLOCK(pmap); 5043 return (rv); 5044 } 5045 5046 /* 5047 * pmap_is_referenced: 5048 * 5049 * Return whether or not the specified physical page was referenced 5050 * in any physical maps. 5051 */ 5052 static bool 5053 __CONCAT(PMTYPE, is_referenced)(vm_page_t m) 5054 { 5055 bool rv; 5056 5057 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5058 ("pmap_is_referenced: page %p is not managed", m)); 5059 rw_wlock(&pvh_global_lock); 5060 rv = pmap_is_referenced_pvh(&m->md) || 5061 ((m->flags & PG_FICTITIOUS) == 0 && 5062 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5063 rw_wunlock(&pvh_global_lock); 5064 return (rv); 5065 } 5066 5067 /* 5068 * Returns true if any of the given mappings were referenced and false 5069 * otherwise. Both page and 4mpage mappings are supported. 5070 */ 5071 static bool 5072 pmap_is_referenced_pvh(struct md_page *pvh) 5073 { 5074 pv_entry_t pv; 5075 pt_entry_t *pte; 5076 pmap_t pmap; 5077 bool rv; 5078 5079 rw_assert(&pvh_global_lock, RA_WLOCKED); 5080 rv = false; 5081 sched_pin(); 5082 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5083 pmap = PV_PMAP(pv); 5084 PMAP_LOCK(pmap); 5085 pte = pmap_pte_quick(pmap, pv->pv_va); 5086 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); 5087 PMAP_UNLOCK(pmap); 5088 if (rv) 5089 break; 5090 } 5091 sched_unpin(); 5092 return (rv); 5093 } 5094 5095 /* 5096 * Clear the write and modified bits in each of the given page's mappings. 5097 */ 5098 static void 5099 __CONCAT(PMTYPE, remove_write)(vm_page_t m) 5100 { 5101 struct md_page *pvh; 5102 pv_entry_t next_pv, pv; 5103 pmap_t pmap; 5104 pd_entry_t *pde; 5105 pt_entry_t oldpte, *pte; 5106 vm_offset_t va; 5107 5108 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5109 ("pmap_remove_write: page %p is not managed", m)); 5110 vm_page_assert_busied(m); 5111 5112 if (!pmap_page_is_write_mapped(m)) 5113 return; 5114 rw_wlock(&pvh_global_lock); 5115 sched_pin(); 5116 if ((m->flags & PG_FICTITIOUS) != 0) 5117 goto small_mappings; 5118 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5119 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5120 va = pv->pv_va; 5121 pmap = PV_PMAP(pv); 5122 PMAP_LOCK(pmap); 5123 pde = pmap_pde(pmap, va); 5124 if ((*pde & PG_RW) != 0) 5125 (void)pmap_demote_pde(pmap, pde, va); 5126 PMAP_UNLOCK(pmap); 5127 } 5128 small_mappings: 5129 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5130 pmap = PV_PMAP(pv); 5131 PMAP_LOCK(pmap); 5132 pde = pmap_pde(pmap, pv->pv_va); 5133 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" 5134 " a 4mpage in page %p's pv list", m)); 5135 pte = pmap_pte_quick(pmap, pv->pv_va); 5136 retry: 5137 oldpte = *pte; 5138 if ((oldpte & PG_RW) != 0) { 5139 /* 5140 * Regardless of whether a pte is 32 or 64 bits 5141 * in size, PG_RW and PG_M are among the least 5142 * significant 32 bits. 5143 */ 5144 if (!atomic_cmpset_int((u_int *)pte, oldpte, 5145 oldpte & ~(PG_RW | PG_M))) 5146 goto retry; 5147 if ((oldpte & PG_M) != 0) 5148 vm_page_dirty(m); 5149 pmap_invalidate_page_int(pmap, pv->pv_va); 5150 } 5151 PMAP_UNLOCK(pmap); 5152 } 5153 vm_page_aflag_clear(m, PGA_WRITEABLE); 5154 sched_unpin(); 5155 rw_wunlock(&pvh_global_lock); 5156 } 5157 5158 /* 5159 * pmap_ts_referenced: 5160 * 5161 * Return a count of reference bits for a page, clearing those bits. 
5162 * It is not necessary for every reference bit to be cleared, but it 5163 * is necessary that 0 only be returned when there are truly no 5164 * reference bits set. 5165 * 5166 * As an optimization, update the page's dirty field if a modified bit is 5167 * found while counting reference bits. This opportunistic update can be 5168 * performed at low cost and can eliminate the need for some future calls 5169 * to pmap_is_modified(). However, since this function stops after 5170 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5171 * dirty pages. Those dirty pages will only be detected by a future call 5172 * to pmap_is_modified(). 5173 */ 5174 static int 5175 __CONCAT(PMTYPE, ts_referenced)(vm_page_t m) 5176 { 5177 struct md_page *pvh; 5178 pv_entry_t pv, pvf; 5179 pmap_t pmap; 5180 pd_entry_t *pde; 5181 pt_entry_t *pte; 5182 vm_paddr_t pa; 5183 int rtval = 0; 5184 5185 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5186 ("pmap_ts_referenced: page %p is not managed", m)); 5187 pa = VM_PAGE_TO_PHYS(m); 5188 pvh = pa_to_pvh(pa); 5189 rw_wlock(&pvh_global_lock); 5190 sched_pin(); 5191 if ((m->flags & PG_FICTITIOUS) != 0 || 5192 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5193 goto small_mappings; 5194 pv = pvf; 5195 do { 5196 pmap = PV_PMAP(pv); 5197 PMAP_LOCK(pmap); 5198 pde = pmap_pde(pmap, pv->pv_va); 5199 if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5200 /* 5201 * Although "*pde" is mapping a 2/4MB page, because 5202 * this function is called at a 4KB page granularity, 5203 * we only update the 4KB page under test. 5204 */ 5205 vm_page_dirty(m); 5206 } 5207 if ((*pde & PG_A) != 0) { 5208 /* 5209 * Since this reference bit is shared by either 1024 5210 * or 512 4KB pages, it should not be cleared every 5211 * time it is tested. Apply a simple "hash" function 5212 * on the physical page number, the virtual superpage 5213 * number, and the pmap address to select one 4KB page 5214 * out of the 1024 or 512 on which testing the 5215 * reference bit will result in clearing that bit. 5216 * This function is designed to avoid the selection of 5217 * the same 4KB page for every 2- or 4MB page mapping. 5218 * 5219 * On demotion, a mapping that hasn't been referenced 5220 * is simply destroyed. To avoid the possibility of a 5221 * subsequent page fault on a demoted wired mapping, 5222 * always leave its reference bit set. Moreover, 5223 * since the superpage is wired, the current state of 5224 * its reference bit won't affect page replacement. 5225 */ 5226 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 5227 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 5228 (*pde & PG_W) == 0) { 5229 atomic_clear_int((u_int *)pde, PG_A); 5230 pmap_invalidate_page_int(pmap, pv->pv_va); 5231 } 5232 rtval++; 5233 } 5234 PMAP_UNLOCK(pmap); 5235 /* Rotate the PV list if it has more than one entry. 
*/ 5236 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5237 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5238 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5239 } 5240 if (rtval >= PMAP_TS_REFERENCED_MAX) 5241 goto out; 5242 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5243 small_mappings: 5244 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5245 goto out; 5246 pv = pvf; 5247 do { 5248 pmap = PV_PMAP(pv); 5249 PMAP_LOCK(pmap); 5250 pde = pmap_pde(pmap, pv->pv_va); 5251 KASSERT((*pde & PG_PS) == 0, 5252 ("pmap_ts_referenced: found a 4mpage in page %p's pv list", 5253 m)); 5254 pte = pmap_pte_quick(pmap, pv->pv_va); 5255 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5256 vm_page_dirty(m); 5257 if ((*pte & PG_A) != 0) { 5258 atomic_clear_int((u_int *)pte, PG_A); 5259 pmap_invalidate_page_int(pmap, pv->pv_va); 5260 rtval++; 5261 } 5262 PMAP_UNLOCK(pmap); 5263 /* Rotate the PV list if it has more than one entry. */ 5264 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5265 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5266 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5267 } 5268 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5269 PMAP_TS_REFERENCED_MAX); 5270 out: 5271 sched_unpin(); 5272 rw_wunlock(&pvh_global_lock); 5273 return (rtval); 5274 } 5275 5276 /* 5277 * Apply the given advice to the specified range of addresses within the 5278 * given pmap. Depending on the advice, clear the referenced and/or 5279 * modified flags in each mapping and set the mapped page's dirty field. 5280 */ 5281 static void 5282 __CONCAT(PMTYPE, advise)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 5283 int advice) 5284 { 5285 pd_entry_t oldpde, *pde; 5286 pt_entry_t *pte; 5287 vm_offset_t va, pdnxt; 5288 vm_page_t m; 5289 bool anychanged, pv_lists_locked; 5290 5291 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5292 return; 5293 if (pmap_is_current(pmap)) 5294 pv_lists_locked = false; 5295 else { 5296 pv_lists_locked = true; 5297 resume: 5298 rw_wlock(&pvh_global_lock); 5299 sched_pin(); 5300 } 5301 anychanged = false; 5302 PMAP_LOCK(pmap); 5303 for (; sva < eva; sva = pdnxt) { 5304 pdnxt = (sva + NBPDR) & ~PDRMASK; 5305 if (pdnxt < sva) 5306 pdnxt = eva; 5307 pde = pmap_pde(pmap, sva); 5308 oldpde = *pde; 5309 if ((oldpde & PG_V) == 0) 5310 continue; 5311 else if ((oldpde & PG_PS) != 0) { 5312 if ((oldpde & PG_MANAGED) == 0) 5313 continue; 5314 if (!pv_lists_locked) { 5315 pv_lists_locked = true; 5316 if (!rw_try_wlock(&pvh_global_lock)) { 5317 if (anychanged) 5318 pmap_invalidate_all_int(pmap); 5319 PMAP_UNLOCK(pmap); 5320 goto resume; 5321 } 5322 sched_pin(); 5323 } 5324 if (!pmap_demote_pde(pmap, pde, sva)) { 5325 /* 5326 * The large page mapping was destroyed. 5327 */ 5328 continue; 5329 } 5330 5331 /* 5332 * Unless the page mappings are wired, remove the 5333 * mapping to a single page so that a subsequent 5334 * access may repromote. Choosing the last page 5335 * within the address range [sva, min(pdnxt, eva)) 5336 * generally results in more repromotions. Since the 5337 * underlying page table page is fully populated, this 5338 * removal never frees a page table page. 
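 * Removing that one mapping forces a soft fault on the next access,
 * which re-enters the page and gives the promotion code another chance
 * to recreate the superpage.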
5339 */ 5340 if ((oldpde & PG_W) == 0) { 5341 va = eva; 5342 if (va > pdnxt) 5343 va = pdnxt; 5344 va -= PAGE_SIZE; 5345 KASSERT(va >= sva, 5346 ("pmap_advise: no address gap")); 5347 pte = pmap_pte_quick(pmap, va); 5348 KASSERT((*pte & PG_V) != 0, 5349 ("pmap_advise: invalid PTE")); 5350 pmap_remove_pte(pmap, pte, va, NULL); 5351 anychanged = true; 5352 } 5353 } 5354 if (pdnxt > eva) 5355 pdnxt = eva; 5356 va = pdnxt; 5357 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 5358 sva += PAGE_SIZE) { 5359 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 5360 goto maybe_invlrng; 5361 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5362 if (advice == MADV_DONTNEED) { 5363 /* 5364 * Future calls to pmap_is_modified() 5365 * can be avoided by making the page 5366 * dirty now. 5367 */ 5368 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 5369 vm_page_dirty(m); 5370 } 5371 atomic_clear_int((u_int *)pte, PG_M | PG_A); 5372 } else if ((*pte & PG_A) != 0) 5373 atomic_clear_int((u_int *)pte, PG_A); 5374 else 5375 goto maybe_invlrng; 5376 if ((*pte & PG_G) != 0) { 5377 if (va == pdnxt) 5378 va = sva; 5379 } else 5380 anychanged = true; 5381 continue; 5382 maybe_invlrng: 5383 if (va != pdnxt) { 5384 pmap_invalidate_range_int(pmap, va, sva); 5385 va = pdnxt; 5386 } 5387 } 5388 if (va != pdnxt) 5389 pmap_invalidate_range_int(pmap, va, sva); 5390 } 5391 if (anychanged) 5392 pmap_invalidate_all_int(pmap); 5393 if (pv_lists_locked) { 5394 sched_unpin(); 5395 rw_wunlock(&pvh_global_lock); 5396 } 5397 PMAP_UNLOCK(pmap); 5398 } 5399 5400 /* 5401 * Clear the modify bits on the specified physical page. 5402 */ 5403 static void 5404 __CONCAT(PMTYPE, clear_modify)(vm_page_t m) 5405 { 5406 struct md_page *pvh; 5407 pv_entry_t next_pv, pv; 5408 pmap_t pmap; 5409 pd_entry_t oldpde, *pde; 5410 pt_entry_t *pte; 5411 vm_offset_t va; 5412 5413 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5414 ("pmap_clear_modify: page %p is not managed", m)); 5415 vm_page_assert_busied(m); 5416 5417 if (!pmap_page_is_write_mapped(m)) 5418 return; 5419 rw_wlock(&pvh_global_lock); 5420 sched_pin(); 5421 if ((m->flags & PG_FICTITIOUS) != 0) 5422 goto small_mappings; 5423 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5424 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5425 va = pv->pv_va; 5426 pmap = PV_PMAP(pv); 5427 PMAP_LOCK(pmap); 5428 pde = pmap_pde(pmap, va); 5429 oldpde = *pde; 5430 /* If oldpde has PG_RW set, then it also has PG_M set. */ 5431 if ((oldpde & PG_RW) != 0 && 5432 pmap_demote_pde(pmap, pde, va) && 5433 (oldpde & PG_W) == 0) { 5434 /* 5435 * Write protect the mapping to a single page so that 5436 * a subsequent write access may repromote. 5437 */ 5438 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 5439 pte = pmap_pte_quick(pmap, va); 5440 /* 5441 * Regardless of whether a pte is 32 or 64 bits 5442 * in size, PG_RW and PG_M are among the least 5443 * significant 32 bits. 5444 */ 5445 atomic_clear_int((u_int *)pte, PG_M | PG_RW); 5446 vm_page_dirty(m); 5447 pmap_invalidate_page_int(pmap, va); 5448 } 5449 PMAP_UNLOCK(pmap); 5450 } 5451 small_mappings: 5452 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5453 pmap = PV_PMAP(pv); 5454 PMAP_LOCK(pmap); 5455 pde = pmap_pde(pmap, pv->pv_va); 5456 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 5457 " a 4mpage in page %p's pv list", m)); 5458 pte = pmap_pte_quick(pmap, pv->pv_va); 5459 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5460 /* 5461 * Regardless of whether a pte is 32 or 64 bits 5462 * in size, PG_M is among the least significant 5463 * 32 bits. 
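 * Only PG_M is cleared here; PG_RW is left intact, so the next store
 * to the page simply sets PG_M again without taking a write fault.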
5464 */ 5465 atomic_clear_int((u_int *)pte, PG_M); 5466 pmap_invalidate_page_int(pmap, pv->pv_va); 5467 } 5468 PMAP_UNLOCK(pmap); 5469 } 5470 sched_unpin(); 5471 rw_wunlock(&pvh_global_lock); 5472 } 5473 5474 /* 5475 * Miscellaneous support routines follow 5476 */ 5477 5478 /* Adjust the cache mode for a 4KB page mapped via a PTE. */ 5479 static __inline void 5480 pmap_pte_attr(pt_entry_t *pte, int cache_bits) 5481 { 5482 u_int opte, npte; 5483 5484 /* 5485 * The cache mode bits are all in the low 32-bits of the 5486 * PTE, so we can just spin on updating the low 32-bits. 5487 */ 5488 do { 5489 opte = *(u_int *)pte; 5490 npte = opte & ~PG_PTE_CACHE; 5491 npte |= cache_bits; 5492 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 5493 } 5494 5495 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 5496 static __inline void 5497 pmap_pde_attr(pd_entry_t *pde, int cache_bits) 5498 { 5499 u_int opde, npde; 5500 5501 /* 5502 * The cache mode bits are all in the low 32-bits of the 5503 * PDE, so we can just spin on updating the low 32-bits. 5504 */ 5505 do { 5506 opde = *(u_int *)pde; 5507 npde = opde & ~PG_PDE_CACHE; 5508 npde |= cache_bits; 5509 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 5510 } 5511 5512 /* 5513 * Map a set of physical memory pages into the kernel virtual 5514 * address space. Return a pointer to where it is mapped. This 5515 * routine is intended to be used for mapping device memory, 5516 * NOT real memory. 5517 */ 5518 static void * 5519 __CONCAT(PMTYPE, mapdev_attr)(vm_paddr_t pa, vm_size_t size, int mode, 5520 int flags) 5521 { 5522 struct pmap_preinit_mapping *ppim; 5523 vm_offset_t va, offset; 5524 vm_page_t m; 5525 vm_size_t tmpsize; 5526 int i; 5527 5528 offset = pa & PAGE_MASK; 5529 size = round_page(offset + size); 5530 pa = pa & PG_FRAME; 5531 5532 if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) { 5533 va = pa + PMAP_MAP_LOW; 5534 if ((flags & MAPDEV_SETATTR) == 0) 5535 return ((void *)(va + offset)); 5536 } else if (!pmap_initialized) { 5537 va = 0; 5538 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5539 ppim = pmap_preinit_mapping + i; 5540 if (ppim->va == 0) { 5541 ppim->pa = pa; 5542 ppim->sz = size; 5543 ppim->mode = mode; 5544 ppim->va = virtual_avail; 5545 virtual_avail += size; 5546 va = ppim->va; 5547 break; 5548 } 5549 } 5550 if (va == 0) 5551 panic("%s: too many preinit mappings", __func__); 5552 } else { 5553 /* 5554 * If we have a preinit mapping, re-use it. 
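 * (Such a mapping can exist when the same device range was first
 * mapped before the VM system finished initializing.)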
5555 */ 5556 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5557 ppim = pmap_preinit_mapping + i; 5558 if (ppim->pa == pa && ppim->sz == size && 5559 (ppim->mode == mode || 5560 (flags & MAPDEV_SETATTR) == 0)) 5561 return ((void *)(ppim->va + offset)); 5562 } 5563 va = kva_alloc(size); 5564 if (va == 0) 5565 panic("%s: Couldn't allocate KVA", __func__); 5566 } 5567 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) { 5568 if ((flags & MAPDEV_SETATTR) == 0 && pmap_initialized) { 5569 m = PHYS_TO_VM_PAGE(pa); 5570 if (m != NULL && VM_PAGE_TO_PHYS(m) == pa) { 5571 pmap_kenter_attr(va + tmpsize, pa + tmpsize, 5572 m->md.pat_mode); 5573 continue; 5574 } 5575 } 5576 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 5577 } 5578 pmap_invalidate_range_int(kernel_pmap, va, va + tmpsize); 5579 pmap_invalidate_cache_range(va, va + size); 5580 return ((void *)(va + offset)); 5581 } 5582 5583 static void 5584 __CONCAT(PMTYPE, unmapdev)(void *p, vm_size_t size) 5585 { 5586 struct pmap_preinit_mapping *ppim; 5587 vm_offset_t offset, va; 5588 int i; 5589 5590 va = (vm_offset_t)p; 5591 if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE) 5592 return; 5593 offset = va & PAGE_MASK; 5594 size = round_page(offset + size); 5595 va = trunc_page(va); 5596 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5597 ppim = pmap_preinit_mapping + i; 5598 if (ppim->va == va && ppim->sz == size) { 5599 if (pmap_initialized) 5600 return; 5601 ppim->pa = 0; 5602 ppim->va = 0; 5603 ppim->sz = 0; 5604 ppim->mode = 0; 5605 if (va + size == virtual_avail) 5606 virtual_avail = va; 5607 return; 5608 } 5609 } 5610 if (pmap_initialized) { 5611 pmap_qremove(va, atop(size)); 5612 kva_free(va, size); 5613 } 5614 } 5615 5616 /* 5617 * Sets the memory attribute for the specified page. 5618 */ 5619 static void 5620 __CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma) 5621 { 5622 5623 m->md.pat_mode = ma; 5624 if ((m->flags & PG_FICTITIOUS) != 0) 5625 return; 5626 5627 /* 5628 * If "m" is a normal page, flush it from the cache. 5629 * See pmap_invalidate_cache_range(). 5630 * 5631 * First, try to find an existing mapping of the page by sf 5632 * buffer. sf_buf_invalidate_cache() modifies mapping and 5633 * flushes the cache. 5634 */ 5635 if (sf_buf_invalidate_cache(m)) 5636 return; 5637 5638 /* 5639 * If page is not mapped by sf buffer, but CPU does not 5640 * support self snoop, map the page transient and do 5641 * invalidation. In the worst case, whole cache is flushed by 5642 * pmap_invalidate_cache_range(). 
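 * CPUs that advertise self snoop (CPUID_SS) keep their caches coherent
 * across aliased mappings with different memory types, so the explicit
 * flush is skipped for them.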
5643 */ 5644 if ((cpu_feature & CPUID_SS) == 0) 5645 pmap_flush_page(m); 5646 } 5647 5648 static void 5649 __CONCAT(PMTYPE, flush_page)(vm_page_t m) 5650 { 5651 pt_entry_t *cmap_pte2; 5652 struct pcpu *pc; 5653 vm_offset_t sva, eva; 5654 bool useclflushopt; 5655 5656 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 5657 if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { 5658 sched_pin(); 5659 pc = get_pcpu(); 5660 cmap_pte2 = pc->pc_cmap_pte2; 5661 mtx_lock(&pc->pc_cmap_lock); 5662 if (*cmap_pte2) 5663 panic("pmap_flush_page: CMAP2 busy"); 5664 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | 5665 PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 5666 0); 5667 invlcaddr(pc->pc_cmap_addr2); 5668 sva = (vm_offset_t)pc->pc_cmap_addr2; 5669 eva = sva + PAGE_SIZE; 5670 5671 /* 5672 * Use mfence or sfence despite the ordering implied by 5673 * mtx_{un,}lock() because clflush on non-Intel CPUs 5674 * and clflushopt are not guaranteed to be ordered by 5675 * any other instruction. 5676 */ 5677 if (useclflushopt) 5678 sfence(); 5679 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5680 mfence(); 5681 for (; sva < eva; sva += cpu_clflush_line_size) { 5682 if (useclflushopt) 5683 clflushopt(sva); 5684 else 5685 clflush(sva); 5686 } 5687 if (useclflushopt) 5688 sfence(); 5689 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5690 mfence(); 5691 *cmap_pte2 = 0; 5692 sched_unpin(); 5693 mtx_unlock(&pc->pc_cmap_lock); 5694 } else 5695 pmap_invalidate_cache(); 5696 } 5697 5698 /* 5699 * Changes the specified virtual address range's memory type to that given by 5700 * the parameter "mode". The specified virtual address range must be 5701 * completely contained within either the kernel map. 5702 * 5703 * Returns zero if the change completed successfully, and either EINVAL or 5704 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5705 * of the virtual address range was not mapped, and ENOMEM is returned if 5706 * there was insufficient memory available to complete the change. 5707 */ 5708 static int 5709 __CONCAT(PMTYPE, change_attr)(vm_offset_t va, vm_size_t size, int mode) 5710 { 5711 vm_offset_t base, offset, tmpva; 5712 pd_entry_t *pde; 5713 pt_entry_t *pte; 5714 int cache_bits_pte, cache_bits_pde; 5715 bool changed; 5716 5717 base = trunc_page(va); 5718 offset = va & PAGE_MASK; 5719 size = round_page(offset + size); 5720 5721 /* 5722 * Only supported on kernel virtual addresses above the recursive map. 5723 */ 5724 if (base < VM_MIN_KERNEL_ADDRESS) 5725 return (EINVAL); 5726 5727 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 5728 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 5729 changed = false; 5730 5731 /* 5732 * Pages that aren't mapped aren't supported. Also break down 5733 * 2/4MB pages into 4KB pages if required. 5734 */ 5735 PMAP_LOCK(kernel_pmap); 5736 for (tmpva = base; tmpva < base + size; ) { 5737 pde = pmap_pde(kernel_pmap, tmpva); 5738 if (*pde == 0) { 5739 PMAP_UNLOCK(kernel_pmap); 5740 return (EINVAL); 5741 } 5742 if (*pde & PG_PS) { 5743 /* 5744 * If the current 2/4MB page already has 5745 * the required memory type, then we need not 5746 * demote this page. Just increment tmpva to 5747 * the next 2/4MB page frame. 
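 *
 * This first pass only validates the range and demotes superpages
 * where necessary; the cache bits themselves are rewritten in the
 * second pass below.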
5748 */ 5749 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 5750 tmpva = trunc_4mpage(tmpva) + NBPDR; 5751 continue; 5752 } 5753 5754 /* 5755 * If the current offset aligns with a 2/4MB 5756 * page frame and there is at least 2/4MB left 5757 * within the range, then we need not break 5758 * down this page into 4KB pages. 5759 */ 5760 if ((tmpva & PDRMASK) == 0 && 5761 tmpva + PDRMASK < base + size) { 5762 tmpva += NBPDR; 5763 continue; 5764 } 5765 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 5766 PMAP_UNLOCK(kernel_pmap); 5767 return (ENOMEM); 5768 } 5769 } 5770 pte = vtopte(tmpva); 5771 if (*pte == 0) { 5772 PMAP_UNLOCK(kernel_pmap); 5773 return (EINVAL); 5774 } 5775 tmpva += PAGE_SIZE; 5776 } 5777 PMAP_UNLOCK(kernel_pmap); 5778 5779 /* 5780 * All the pages exist, so run through them, updating their 5781 * cache mode where required. 5782 */ 5783 for (tmpva = base; tmpva < base + size; ) { 5784 pde = pmap_pde(kernel_pmap, tmpva); 5785 if (*pde & PG_PS) { 5786 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5787 pmap_pde_attr(pde, cache_bits_pde); 5788 changed = true; 5789 } 5790 tmpva = trunc_4mpage(tmpva) + NBPDR; 5791 } else { 5792 pte = vtopte(tmpva); 5793 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5794 pmap_pte_attr(pte, cache_bits_pte); 5795 changed = true; 5796 } 5797 tmpva += PAGE_SIZE; 5798 } 5799 } 5800 5801 /* 5802 * Flush the CPU caches so that no data remains cached with the 5803 * old memory type. 5804 */ 5805 if (changed) { 5806 pmap_invalidate_range_int(kernel_pmap, base, tmpva); 5807 pmap_invalidate_cache_range(base, tmpva); 5808 } 5809 return (0); 5810 } 5811 5812 /* 5813 * Perform the pmap work for mincore(2). If the page is not both referenced and 5814 * modified by this pmap, its physical address is returned so that the caller 5815 * can find other mappings. 5816 */ 5817 static int 5818 __CONCAT(PMTYPE, mincore)(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 5819 { 5820 pd_entry_t pde; 5821 pt_entry_t pte; 5822 vm_paddr_t pa; 5823 int val; 5824 5825 PMAP_LOCK(pmap); 5826 pde = *pmap_pde(pmap, addr); 5827 if (pde != 0) { 5828 if ((pde & PG_PS) != 0) { 5829 pte = pde; 5830 /* Compute the physical address of the 4KB page.
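 * The PDE maps an entire 2/4MB frame, so OR-ing in the low
 * bits of "addr" (addr & PDRMASK) and masking with PG_FRAME
 * selects the 4KB page within that superpage which backs
 * "addr".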
*/ 5831 pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) & 5832 PG_FRAME; 5833 val = MINCORE_PSIND(1); 5834 } else { 5835 pte = pmap_pte_ufast(pmap, addr, pde); 5836 pa = pte & PG_FRAME; 5837 val = 0; 5838 } 5839 } else { 5840 pte = 0; 5841 pa = 0; 5842 val = 0; 5843 } 5844 if ((pte & PG_V) != 0) { 5845 val |= MINCORE_INCORE; 5846 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5847 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5848 if ((pte & PG_A) != 0) 5849 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5850 } 5851 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5852 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5853 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5854 *pap = pa; 5855 } 5856 PMAP_UNLOCK(pmap); 5857 return (val); 5858 } 5859 5860 static void 5861 __CONCAT(PMTYPE, activate)(struct thread *td) 5862 { 5863 pmap_t pmap, oldpmap; 5864 u_int cpuid; 5865 u_int32_t cr3; 5866 5867 critical_enter(); 5868 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5869 oldpmap = PCPU_GET(curpmap); 5870 cpuid = PCPU_GET(cpuid); 5871 #if defined(SMP) 5872 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5873 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5874 #else 5875 CPU_CLR(cpuid, &oldpmap->pm_active); 5876 CPU_SET(cpuid, &pmap->pm_active); 5877 #endif 5878 #ifdef PMAP_PAE_COMP 5879 cr3 = vtophys(pmap->pm_pdpt); 5880 #else 5881 cr3 = vtophys(pmap->pm_pdir); 5882 #endif 5883 /* 5884 * pmap_activate is for the current thread on the current cpu 5885 */ 5886 td->td_pcb->pcb_cr3 = cr3; 5887 PCPU_SET(curpmap, pmap); 5888 critical_exit(); 5889 } 5890 5891 static void 5892 __CONCAT(PMTYPE, activate_boot)(pmap_t pmap) 5893 { 5894 u_int cpuid; 5895 5896 cpuid = PCPU_GET(cpuid); 5897 #if defined(SMP) 5898 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5899 #else 5900 CPU_SET(cpuid, &pmap->pm_active); 5901 #endif 5902 PCPU_SET(curpmap, pmap); 5903 } 5904 5905 /* 5906 * Increase the starting virtual address of the given mapping if a 5907 * different alignment might result in more superpage mappings. 
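 *
 * Worked example (illustrative numbers only, assuming non-PAE 4MB
 * superpages, i.e. NBPDR == 4MB): mapping 16MB of an object starting
 * at object offset 1MB with a 4MB-aligned address hint leaves
 * (*addr & PDRMASK) == 0 while the offset's superpage residue is 1MB,
 * so *addr is advanced by 1MB; virtual and object offsets then agree
 * modulo NBPDR, which is what allows the interior of the mapping to
 * be backed by superpage mappings.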
5908 */ 5909 static void 5910 __CONCAT(PMTYPE, align_superpage)(vm_object_t object, vm_ooffset_t offset, 5911 vm_offset_t *addr, vm_size_t size) 5912 { 5913 vm_offset_t superpage_offset; 5914 5915 if (size < NBPDR) 5916 return; 5917 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5918 offset += ptoa(object->pg_color); 5919 superpage_offset = offset & PDRMASK; 5920 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5921 (*addr & PDRMASK) == superpage_offset) 5922 return; 5923 if ((*addr & PDRMASK) < superpage_offset) 5924 *addr = (*addr & ~PDRMASK) + superpage_offset; 5925 else 5926 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5927 } 5928 5929 static vm_offset_t 5930 __CONCAT(PMTYPE, quick_enter_page)(vm_page_t m) 5931 { 5932 vm_offset_t qaddr; 5933 pt_entry_t *pte; 5934 5935 critical_enter(); 5936 qaddr = PCPU_GET(qmap_addr); 5937 pte = vtopte(qaddr); 5938 5939 KASSERT(*pte == 0, 5940 ("pmap_quick_enter_page: PTE busy %#jx", (uintmax_t)*pte)); 5941 *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 5942 pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(m), 0); 5943 invlpg(qaddr); 5944 5945 return (qaddr); 5946 } 5947 5948 static void 5949 __CONCAT(PMTYPE, quick_remove_page)(vm_offset_t addr) 5950 { 5951 vm_offset_t qaddr; 5952 pt_entry_t *pte; 5953 5954 qaddr = PCPU_GET(qmap_addr); 5955 pte = vtopte(qaddr); 5956 5957 KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); 5958 KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); 5959 5960 *pte = 0; 5961 critical_exit(); 5962 } 5963 5964 static vmem_t *pmap_trm_arena; 5965 static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS; 5966 static int trm_guard = PAGE_SIZE; 5967 5968 static int 5969 pmap_trm_import(void *unused __unused, vmem_size_t size, int flags, 5970 vmem_addr_t *addrp) 5971 { 5972 vm_page_t m; 5973 vmem_addr_t af, addr, prev_addr; 5974 pt_entry_t *trm_pte; 5975 5976 prev_addr = atomic_load_int(&pmap_trm_arena_last); 5977 size = round_page(size) + trm_guard; 5978 for (;;) { 5979 if (prev_addr + size < prev_addr || prev_addr + size < size || 5980 prev_addr + size > PMAP_TRM_MAX_ADDRESS) 5981 return (ENOMEM); 5982 addr = prev_addr + size; 5983 if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr)) 5984 break; 5985 } 5986 prev_addr += trm_guard; 5987 trm_pte = PTmap + atop(prev_addr); 5988 for (af = prev_addr; af < addr; af += PAGE_SIZE) { 5989 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 5990 pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) | 5991 PG_M | PG_A | PG_RW | PG_V | pgeflag | 5992 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, false)); 5993 } 5994 *addrp = prev_addr; 5995 return (0); 5996 } 5997 5998 void 5999 pmap_init_trm(void) 6000 { 6001 vm_page_t pd_m; 6002 6003 TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard); 6004 if ((trm_guard & PAGE_MASK) != 0) 6005 trm_guard = 0; 6006 pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK); 6007 vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE); 6008 pd_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_WAITOK | 6009 VM_ALLOC_ZERO); 6010 PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V | 6011 pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, true); 6012 } 6013 6014 static void * 6015 __CONCAT(PMTYPE, trm_alloc)(size_t size, int flags) 6016 { 6017 vmem_addr_t res; 6018 int error; 6019 6020 MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0); 6021 error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int), 6022 
0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res); 6023 if (error != 0) 6024 return (NULL); 6025 if ((flags & M_ZERO) != 0) 6026 bzero((void *)res, size); 6027 return ((void *)res); 6028 } 6029 6030 static void 6031 __CONCAT(PMTYPE, trm_free)(void *addr, size_t size) 6032 { 6033 6034 vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4)); 6035 } 6036 6037 static void 6038 __CONCAT(PMTYPE, ksetrw)(vm_offset_t va) 6039 { 6040 6041 *vtopte(va) |= PG_RW; 6042 } 6043 6044 static void 6045 __CONCAT(PMTYPE, remap_lowptdi)(bool enable) 6046 { 6047 6048 PTD[KPTDI] = enable ? PTD[LOWPTDI] : 0; 6049 invltlb_glob(); 6050 } 6051 6052 static vm_offset_t 6053 __CONCAT(PMTYPE, get_map_low)(void) 6054 { 6055 6056 return (PMAP_MAP_LOW); 6057 } 6058 6059 static vm_offset_t 6060 __CONCAT(PMTYPE, get_vm_maxuser_address)(void) 6061 { 6062 6063 return (VM_MAXUSER_ADDRESS); 6064 } 6065 6066 static vm_paddr_t 6067 __CONCAT(PMTYPE, pg_frame)(vm_paddr_t pa) 6068 { 6069 6070 return (pa & PG_FRAME); 6071 } 6072 6073 static void 6074 __CONCAT(PMTYPE, sf_buf_map)(struct sf_buf *sf) 6075 { 6076 pt_entry_t opte, *ptep; 6077 6078 /* 6079 * Update the sf_buf's virtual-to-physical mapping, flushing the 6080 * virtual address from the TLB. Since the reference count for 6081 * the sf_buf's old mapping was zero, that mapping is not 6082 * currently in use. Consequently, there is no need to exchange 6083 * the old and new PTEs atomically, even under PAE. 6084 */ 6085 ptep = vtopte(sf->kva); 6086 opte = *ptep; 6087 *ptep = VM_PAGE_TO_PHYS(sf->m) | PG_RW | PG_V | 6088 pmap_cache_bits(kernel_pmap, sf->m->md.pat_mode, 0); 6089 6090 /* 6091 * Avoid unnecessary TLB invalidations: If the sf_buf's old 6092 * virtual-to-physical mapping was not used, then any processor 6093 * that has invalidated the sf_buf's virtual address from its TLB 6094 * since the last used mapping need not invalidate again. 6095 */ 6096 #ifdef SMP 6097 if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) 6098 CPU_ZERO(&sf->cpumask); 6099 #else 6100 if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) 6101 pmap_invalidate_page_int(kernel_pmap, sf->kva); 6102 #endif 6103 } 6104 6105 static void 6106 __CONCAT(PMTYPE, cp_slow0_map)(vm_offset_t kaddr, int plen, vm_page_t *ma) 6107 { 6108 pt_entry_t *pte; 6109 int i; 6110 6111 for (i = 0, pte = vtopte(kaddr); i < plen; i++, pte++) { 6112 *pte = PG_V | PG_RW | PG_A | PG_M | VM_PAGE_TO_PHYS(ma[i]) | 6113 pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(ma[i]), 6114 false); 6115 invlpg(kaddr + ptoa(i)); 6116 } 6117 } 6118 6119 static u_int 6120 __CONCAT(PMTYPE, get_kcr3)(void) 6121 { 6122 6123 #ifdef PMAP_PAE_COMP 6124 return ((u_int)IdlePDPT); 6125 #else 6126 return ((u_int)IdlePTD); 6127 #endif 6128 } 6129 6130 static u_int 6131 __CONCAT(PMTYPE, get_cr3)(pmap_t pmap) 6132 { 6133 6134 #ifdef PMAP_PAE_COMP 6135 return ((u_int)vtophys(pmap->pm_pdpt)); 6136 #else 6137 return ((u_int)vtophys(pmap->pm_pdir)); 6138 #endif 6139 } 6140 6141 static caddr_t 6142 __CONCAT(PMTYPE, cmap3)(vm_paddr_t pa, u_int pte_bits) 6143 { 6144 pt_entry_t *pte; 6145 6146 pte = CMAP3; 6147 *pte = pa | pte_bits; 6148 invltlb(); 6149 return (CADDR3); 6150 } 6151 6152 static void 6153 __CONCAT(PMTYPE, basemem_setup)(u_int basemem) 6154 { 6155 pt_entry_t *pte; 6156 int i; 6157 6158 /* 6159 * Map pages between basemem and ISA_HOLE_START, if any, r/w into 6160 * the vm86 page table so that vm86 can scribble on them using 6161 * the vm86 map too. XXX: why 2 ways for this and only 1 way for 6162 * page 0, at least as initialized here? 
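 *
 * (For reference: "basemem" is in kilobytes, so basemem / 4 is the
 * first 4KB page above it, and the loop stops at page 160, i.e. at
 * 640KB == ISA_HOLE_START; each entry identity-maps its page with
 * PG_RW | PG_U so vm86 can reach it.)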
6163 */ 6164 pte = (pt_entry_t *)vm86paddr; 6165 for (i = basemem / 4; i < 160; i++) 6166 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; 6167 } 6168 6169 struct bios16_pmap_handle { 6170 pt_entry_t *pte; 6171 pd_entry_t *ptd; 6172 pt_entry_t orig_ptd; 6173 }; 6174 6175 static void * 6176 __CONCAT(PMTYPE, bios16_enter)(void) 6177 { 6178 struct bios16_pmap_handle *h; 6179 6180 /* 6181 * no page table, so create one and install it. 6182 */ 6183 h = malloc(sizeof(struct bios16_pmap_handle), M_TEMP, M_WAITOK); 6184 h->pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); 6185 h->ptd = IdlePTD; 6186 *h->pte = vm86phystk | PG_RW | PG_V; 6187 h->orig_ptd = *h->ptd; 6188 *h->ptd = vtophys(h->pte) | PG_RW | PG_V; 6189 pmap_invalidate_all_int(kernel_pmap); /* XXX insurance for now */ 6190 return (h); 6191 } 6192 6193 static void 6194 __CONCAT(PMTYPE, bios16_leave)(void *arg) 6195 { 6196 struct bios16_pmap_handle *h; 6197 6198 h = arg; 6199 *h->ptd = h->orig_ptd; /* remove page table */ 6200 /* 6201 * XXX only needs to be invlpg(0) but that doesn't work on the 386 6202 */ 6203 pmap_invalidate_all_int(kernel_pmap); 6204 free(h->pte, M_TEMP); /* ... and free it */ 6205 } 6206 6207 struct pmap_kernel_map_range { 6208 vm_offset_t sva; 6209 pt_entry_t attrs; 6210 int ptes; 6211 int pdes; 6212 int pdpes; 6213 }; 6214 6215 static void 6216 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 6217 vm_offset_t eva) 6218 { 6219 const char *mode; 6220 int i, pat_idx; 6221 6222 if (eva <= range->sva) 6223 return; 6224 6225 pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); 6226 for (i = 0; i < PAT_INDEX_SIZE; i++) 6227 if (pat_index[i] == pat_idx) 6228 break; 6229 6230 switch (i) { 6231 case PAT_WRITE_BACK: 6232 mode = "WB"; 6233 break; 6234 case PAT_WRITE_THROUGH: 6235 mode = "WT"; 6236 break; 6237 case PAT_UNCACHEABLE: 6238 mode = "UC"; 6239 break; 6240 case PAT_UNCACHED: 6241 mode = "U-"; 6242 break; 6243 case PAT_WRITE_PROTECTED: 6244 mode = "WP"; 6245 break; 6246 case PAT_WRITE_COMBINING: 6247 mode = "WC"; 6248 break; 6249 default: 6250 printf("%s: unknown PAT mode %#x for range 0x%08x-0x%08x\n", 6251 __func__, pat_idx, range->sva, eva); 6252 mode = "??"; 6253 break; 6254 } 6255 6256 sbuf_printf(sb, "0x%08x-0x%08x r%c%c%c%c %s %d %d %d\n", 6257 range->sva, eva, 6258 (range->attrs & PG_RW) != 0 ? 'w' : '-', 6259 (range->attrs & pg_nx) != 0 ? '-' : 'x', 6260 (range->attrs & PG_U) != 0 ? 'u' : 's', 6261 (range->attrs & PG_G) != 0 ? 'g' : '-', 6262 mode, range->pdpes, range->pdes, range->ptes); 6263 6264 /* Reset to sentinel value. */ 6265 range->sva = 0xffffffff; 6266 } 6267 6268 /* 6269 * Determine whether the attributes specified by a page table entry match those 6270 * being tracked by the current range. This is not quite as simple as a direct 6271 * flag comparison since some PAT modes have multiple representations. 
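 *
 * Concretely, the check below also treats two encodings as equal when
 * they differ only in the PAT bit yet pmap_pat_index() maps both to
 * the same index (for instance, on configurations where the PAT bit
 * cannot be used and only PCD/PWT distinguish the mode).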
6272 */ 6273 static bool 6274 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 6275 { 6276 pt_entry_t diff, mask; 6277 6278 mask = pg_nx | PG_G | PG_RW | PG_U | PG_PDE_CACHE; 6279 diff = (range->attrs ^ attrs) & mask; 6280 if (diff == 0) 6281 return (true); 6282 if ((diff & ~PG_PDE_PAT) == 0 && 6283 pmap_pat_index(kernel_pmap, range->attrs, true) == 6284 pmap_pat_index(kernel_pmap, attrs, true)) 6285 return (true); 6286 return (false); 6287 } 6288 6289 static void 6290 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 6291 pt_entry_t attrs) 6292 { 6293 6294 memset(range, 0, sizeof(*range)); 6295 range->sva = va; 6296 range->attrs = attrs; 6297 } 6298 6299 /* 6300 * Given a leaf PTE, derive the mapping's attributes. If they do not match 6301 * those of the current run, dump the address range and its attributes, and 6302 * begin a new run. 6303 */ 6304 static void 6305 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 6306 vm_offset_t va, pd_entry_t pde, pt_entry_t pte) 6307 { 6308 pt_entry_t attrs; 6309 6310 attrs = pde & (PG_RW | PG_U | pg_nx); 6311 6312 if ((pde & PG_PS) != 0) { 6313 attrs |= pde & (PG_G | PG_PDE_CACHE); 6314 } else if (pte != 0) { 6315 attrs |= pte & pg_nx; 6316 attrs &= pg_nx | (pte & (PG_RW | PG_U)); 6317 attrs |= pte & (PG_G | PG_PTE_CACHE); 6318 6319 /* Canonicalize by always using the PDE PAT bit. */ 6320 if ((attrs & PG_PTE_PAT) != 0) 6321 attrs ^= PG_PDE_PAT | PG_PTE_PAT; 6322 } 6323 6324 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 6325 sysctl_kmaps_dump(sb, range, va); 6326 sysctl_kmaps_reinit(range, va, attrs); 6327 } 6328 } 6329 6330 static int 6331 __CONCAT(PMTYPE, sysctl_kmaps)(SYSCTL_HANDLER_ARGS) 6332 { 6333 struct pmap_kernel_map_range range; 6334 struct sbuf sbuf, *sb; 6335 pd_entry_t pde; 6336 pt_entry_t *pt, pte; 6337 vm_offset_t sva; 6338 int error; 6339 u_int i, k; 6340 6341 error = sysctl_wire_old_buffer(req, 0); 6342 if (error != 0) 6343 return (error); 6344 sb = &sbuf; 6345 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 6346 6347 /* Sentinel value. */ 6348 range.sva = 0xffffffff; 6349 6350 /* 6351 * Iterate over the kernel page tables without holding the 6352 * kernel pmap lock. Kernel page table pages are never freed, 6353 * so at worst we will observe inconsistencies in the output. 
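 *
 * Rough sketch of the output (one line per contiguous range, produced
 * by sysctl_kmaps_dump() above; the values are made up for
 * illustration):
 *
 *	0xc1000000-0xc1800000 r-xsg WB 0 2 0
 *
 * i.e. the range, its write/execute/user-or-supervisor/global flags,
 * the PAT mode, and the number of PDPEs, PDEs and PTEs backing it.
 * The handler is normally reached through a vm.pmap sysctl whose name
 * is declared elsewhere, not in this excerpt.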
6354 */ 6355 for (sva = 0, i = 0; i < NPTEPG * NPGPTD * NPDEPG ;) { 6356 if (i == 0) 6357 sbuf_printf(sb, "\nLow PDE:\n"); 6358 else if (i == LOWPTDI * NPTEPG) 6359 sbuf_printf(sb, "Low PDE dup:\n"); 6360 else if (i == PTDPTDI * NPTEPG) 6361 sbuf_printf(sb, "Recursive map:\n"); 6362 else if (i == KERNPTDI * NPTEPG) 6363 sbuf_printf(sb, "Kernel base:\n"); 6364 else if (i == TRPTDI * NPTEPG) 6365 sbuf_printf(sb, "Trampoline:\n"); 6366 pde = IdlePTD[sva >> PDRSHIFT]; 6367 if ((pde & PG_V) == 0) { 6368 sva = rounddown2(sva, NBPDR); 6369 sysctl_kmaps_dump(sb, &range, sva); 6370 sva += NBPDR; 6371 i += NPTEPG; 6372 continue; 6373 } 6374 if ((pde & PG_PS) != 0) { 6375 sysctl_kmaps_check(sb, &range, sva, pde, 0); 6376 range.pdes++; 6377 sva += NBPDR; 6378 i += NPTEPG; 6379 continue; 6380 } 6381 for (pt = vtopte(sva), k = 0; k < NPTEPG; i++, k++, pt++, 6382 sva += PAGE_SIZE) { 6383 pte = *pt; 6384 if ((pte & PG_V) == 0) { 6385 sysctl_kmaps_dump(sb, &range, sva); 6386 continue; 6387 } 6388 sysctl_kmaps_check(sb, &range, sva, pde, pte); 6389 range.ptes++; 6390 } 6391 } 6392 6393 error = sbuf_finish(sb); 6394 sbuf_delete(sb); 6395 return (error); 6396 } 6397 6398 #define PMM(a) \ 6399 .pm_##a = __CONCAT(PMTYPE, a), 6400 6401 struct pmap_methods __CONCAT(PMTYPE, methods) = { 6402 PMM(ksetrw) 6403 PMM(remap_lower) 6404 PMM(remap_lowptdi) 6405 PMM(align_superpage) 6406 PMM(quick_enter_page) 6407 PMM(quick_remove_page) 6408 PMM(trm_alloc) 6409 PMM(trm_free) 6410 PMM(get_map_low) 6411 PMM(get_vm_maxuser_address) 6412 PMM(kextract) 6413 PMM(pg_frame) 6414 PMM(sf_buf_map) 6415 PMM(cp_slow0_map) 6416 PMM(get_kcr3) 6417 PMM(get_cr3) 6418 PMM(cmap3) 6419 PMM(basemem_setup) 6420 PMM(set_nx) 6421 PMM(bios16_enter) 6422 PMM(bios16_leave) 6423 PMM(bootstrap) 6424 PMM(is_valid_memattr) 6425 PMM(cache_bits) 6426 PMM(ps_enabled) 6427 PMM(pinit0) 6428 PMM(pinit) 6429 PMM(activate) 6430 PMM(activate_boot) 6431 PMM(advise) 6432 PMM(clear_modify) 6433 PMM(change_attr) 6434 PMM(mincore) 6435 PMM(copy) 6436 PMM(copy_page) 6437 PMM(copy_pages) 6438 PMM(zero_page) 6439 PMM(zero_page_area) 6440 PMM(enter) 6441 PMM(enter_object) 6442 PMM(enter_quick) 6443 PMM(kenter_temporary) 6444 PMM(object_init_pt) 6445 PMM(unwire) 6446 PMM(page_exists_quick) 6447 PMM(page_wired_mappings) 6448 PMM(page_is_mapped) 6449 PMM(remove_pages) 6450 PMM(is_modified) 6451 PMM(is_prefaultable) 6452 PMM(is_referenced) 6453 PMM(remove_write) 6454 PMM(ts_referenced) 6455 PMM(mapdev_attr) 6456 PMM(unmapdev) 6457 PMM(page_set_memattr) 6458 PMM(extract) 6459 PMM(extract_and_hold) 6460 PMM(map) 6461 PMM(qenter) 6462 PMM(qremove) 6463 PMM(release) 6464 PMM(remove) 6465 PMM(protect) 6466 PMM(remove_all) 6467 PMM(init) 6468 PMM(init_pat) 6469 PMM(growkernel) 6470 PMM(invalidate_page) 6471 PMM(invalidate_range) 6472 PMM(invalidate_all) 6473 PMM(invalidate_cache) 6474 PMM(flush_page) 6475 PMM(kenter) 6476 PMM(kremove) 6477 PMM(sysctl_kmaps) 6478 }; 6479
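/*
 * Illustrative note (added commentary): each PMM(name) entry above
 * expands, via the macro defined just before the table, to
 *
 *	.pm_name = __CONCAT(PMTYPE, name),
 *
 * so, assuming the stock build where PMTYPE is pmap_nopae_ or
 * pmap_pae_, PMM(mincore) initializes .pm_mincore to
 * pmap_nopae_mincore or pmap_pae_mincore respectively.  The
 * machine-independent pmap_*() entry points then dispatch through
 * whichever methods table was selected at boot (see the
 * machine/pmap_base.h interface included above).
 */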