1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * the Systems Programming Group of the University of Utah Computer 15 * Science Department and William Jolitz of UUNET Technologies Inc. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. All advertising materials mentioning features or use of this software 26 * must display the following acknowledgement: 27 * This product includes software developed by the University of 28 * California, Berkeley and its contributors. 29 * 4. Neither the name of the University nor the names of its contributors 30 * may be used to endorse or promote products derived from this software 31 * without specific prior written permission. 32 * 33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 43 * SUCH DAMAGE. 44 * 45 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 46 */ 47 /*- 48 * Copyright (c) 2003 Networks Associates Technology, Inc. 49 * All rights reserved. 50 * Copyright (c) 2018 The FreeBSD Foundation 51 * All rights reserved. 52 * 53 * This software was developed for the FreeBSD Project by Jake Burkholder, 54 * Safeport Network Services, and Network Associates Laboratories, the 55 * Security Research Division of Network Associates, Inc. under 56 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 57 * CHATS research program. 58 * 59 * Portions of this software were developed by 60 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from 61 * the FreeBSD Foundation. 62 * 63 * Redistribution and use in source and binary forms, with or without 64 * modification, are permitted provided that the following conditions 65 * are met: 66 * 1. Redistributions of source code must retain the above copyright 67 * notice, this list of conditions and the following disclaimer. 68 * 2. Redistributions in binary form must reproduce the above copyright 69 * notice, this list of conditions and the following disclaimer in the 70 * documentation and/or other materials provided with the distribution. 
71 * 72 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 73 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 74 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 75 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 76 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 77 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 78 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 79 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 80 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 81 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 82 * SUCH DAMAGE. 83 */ 84 85 #include <sys/cdefs.h> 86 __FBSDID("$FreeBSD$"); 87 88 /* 89 * Manages physical address maps. 90 * 91 * Since the information managed by this module is 92 * also stored by the logical address mapping module, 93 * this module may throw away valid virtual-to-physical 94 * mappings at almost any time. However, invalidations 95 * of virtual-to-physical mappings must be done as 96 * requested. 97 * 98 * In order to cope with hardware architectures which 99 * make virtual-to-physical map invalidates expensive, 100 * this module may delay invalidate or reduced protection 101 * operations until such time as they are actually 102 * necessary. This module is given full information as 103 * to which processors are currently using which maps, 104 * and to when physical maps must be made correct. 105 */ 106 107 #include "opt_apic.h" 108 #include "opt_cpu.h" 109 #include "opt_pmap.h" 110 #include "opt_smp.h" 111 #include "opt_vm.h" 112 113 #include <sys/param.h> 114 #include <sys/systm.h> 115 #include <sys/kernel.h> 116 #include <sys/ktr.h> 117 #include <sys/lock.h> 118 #include <sys/malloc.h> 119 #include <sys/mman.h> 120 #include <sys/msgbuf.h> 121 #include <sys/mutex.h> 122 #include <sys/proc.h> 123 #include <sys/rwlock.h> 124 #include <sys/sf_buf.h> 125 #include <sys/sx.h> 126 #include <sys/vmmeter.h> 127 #include <sys/sched.h> 128 #include <sys/sysctl.h> 129 #include <sys/smp.h> 130 #include <sys/vmem.h> 131 132 #include <vm/vm.h> 133 #include <vm/vm_param.h> 134 #include <vm/vm_kern.h> 135 #include <vm/vm_page.h> 136 #include <vm/vm_map.h> 137 #include <vm/vm_object.h> 138 #include <vm/vm_extern.h> 139 #include <vm/vm_pageout.h> 140 #include <vm/vm_pager.h> 141 #include <vm/vm_phys.h> 142 #include <vm/vm_radix.h> 143 #include <vm/vm_reserv.h> 144 #include <vm/uma.h> 145 146 #ifdef DEV_APIC 147 #include <sys/bus.h> 148 #include <machine/intr_machdep.h> 149 #include <x86/apicvar.h> 150 #endif 151 #include <machine/bootinfo.h> 152 #include <machine/cpu.h> 153 #include <machine/cputypes.h> 154 #include <machine/md_var.h> 155 #include <machine/pcb.h> 156 #include <machine/specialreg.h> 157 #ifdef SMP 158 #include <machine/smp.h> 159 #endif 160 161 #ifndef PMAP_SHPGPERPROC 162 #define PMAP_SHPGPERPROC 200 163 #endif 164 165 #if !defined(DIAGNOSTIC) 166 #ifdef __GNUC_GNU_INLINE__ 167 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 168 #else 169 #define PMAP_INLINE extern inline 170 #endif 171 #else 172 #define PMAP_INLINE 173 #endif 174 175 #ifdef PV_STATS 176 #define PV_STAT(x) do { x ; } while (0) 177 #else 178 #define PV_STAT(x) do { } while (0) 179 #endif 180 181 #define pa_index(pa) ((pa) >> PDRSHIFT) 182 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 183 184 /* 185 * Get PDEs and PTEs for 
user/kernel address space 186 */ 187 #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 188 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 189 190 #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 191 #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) 192 #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 193 #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) 194 #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 195 196 #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ 197 atomic_clear_int((u_int *)(pte), PG_W)) 198 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 199 200 struct pmap kernel_pmap_store; 201 202 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 203 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 204 int pgeflag = 0; /* PG_G or-in */ 205 int pseflag = 0; /* PG_PS or-in */ 206 207 static int nkpt = NKPT; 208 vm_offset_t kernel_vm_end = /* 0 + */ NKPT * NBPDR; 209 210 #if defined(PAE) || defined(PAE_TABLES) 211 pt_entry_t pg_nx; 212 static uma_zone_t pdptzone; 213 #endif 214 215 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 216 217 static int pat_works = 1; 218 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, 219 "Is page attribute table fully functional?"); 220 221 static int pg_ps_enabled = 1; 222 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 223 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 224 225 #define PAT_INDEX_SIZE 8 226 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 227 228 /* 229 * pmap_mapdev support pre initialization (i.e. console) 230 */ 231 #define PMAP_PREINIT_MAPPING_COUNT 8 232 static struct pmap_preinit_mapping { 233 vm_paddr_t pa; 234 vm_offset_t va; 235 vm_size_t sz; 236 int mode; 237 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 238 static int pmap_initialized; 239 240 static struct rwlock_padalign pvh_global_lock; 241 242 /* 243 * Data for the pv entry allocation mechanism 244 */ 245 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 246 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 247 static struct md_page *pv_table; 248 static int shpgperproc = PMAP_SHPGPERPROC; 249 250 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 251 int pv_maxchunks; /* How many chunks we have KVA for */ 252 vm_offset_t pv_vafree; /* freelist stored in the PTE */ 253 254 /* 255 * All those kernel PT submaps that BSD is so fond of 256 */ 257 pt_entry_t *CMAP3; 258 static pd_entry_t *KPTD; 259 caddr_t ptvmmap = 0; 260 caddr_t CADDR3; 261 262 /* 263 * Crashdump maps. 
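 * crashdumpmap provides MAXDUMPPGS pages worth of KVA, reserved by the
 * SYSMAP() call in pmap_bootstrap(), that the kernel dump code uses to
 * map arbitrary physical pages while a crash dump is being written.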
264 */ 265 static caddr_t crashdumpmap; 266 267 static pt_entry_t *PMAP1 = NULL, *PMAP2, *PMAP3; 268 static pt_entry_t *PADDR1 = NULL, *PADDR2, *PADDR3; 269 #ifdef SMP 270 static int PMAP1cpu, PMAP3cpu; 271 static int PMAP1changedcpu; 272 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 273 &PMAP1changedcpu, 0, 274 "Number of times pmap_pte_quick changed CPU with same PMAP1"); 275 #endif 276 static int PMAP1changed; 277 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 278 &PMAP1changed, 0, 279 "Number of times pmap_pte_quick changed PMAP1"); 280 static int PMAP1unchanged; 281 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 282 &PMAP1unchanged, 0, 283 "Number of times pmap_pte_quick didn't change PMAP1"); 284 static struct mtx PMAP2mutex; 285 286 int pti; 287 288 static void free_pv_chunk(struct pv_chunk *pc); 289 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 290 static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); 291 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 292 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 293 #if VM_NRESERVLEVEL > 0 294 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 295 #endif 296 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 297 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 298 vm_offset_t va); 299 static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); 300 301 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 302 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, 303 vm_prot_t prot); 304 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 305 vm_page_t m, vm_prot_t prot, vm_page_t mpte); 306 static void pmap_flush_page(vm_page_t m); 307 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 308 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 309 pd_entry_t pde); 310 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 311 static boolean_t pmap_is_modified_pvh(struct md_page *pvh); 312 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); 313 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 314 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); 315 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); 316 #if VM_NRESERVLEVEL > 0 317 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 318 #endif 319 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 320 vm_prot_t prot); 321 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); 322 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 323 struct spglist *free); 324 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 325 struct spglist *free); 326 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 327 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, 328 struct spglist *free); 329 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, 330 vm_offset_t va); 331 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); 332 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 333 vm_page_t m); 334 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 335 pd_entry_t newpde); 336 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); 337 338 static vm_page_t 
pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); 339 340 static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); 341 static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); 342 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 343 static void pmap_pte_release(pt_entry_t *pte); 344 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); 345 #if defined(PAE) || defined(PAE_TABLES) 346 static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, 347 uint8_t *flags, int wait); 348 #endif 349 static void pmap_init_trm(void); 350 351 static __inline void pagezero(void *page); 352 353 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 354 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 355 356 void pmap_cold(void); 357 extern char _end[]; 358 u_long physfree; /* phys addr of next free page */ 359 u_long vm86phystk; /* PA of vm86/bios stack */ 360 u_long vm86paddr; /* address of vm86 region */ 361 int vm86pa; /* phys addr of vm86 region */ 362 u_long KERNend; /* phys addr end of kernel (just after bss) */ 363 pd_entry_t *IdlePTD; /* phys addr of kernel PTD */ 364 #if defined(PAE) || defined(PAE_TABLES) 365 pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */ 366 #endif 367 pt_entry_t *KPTmap; /* address of kernel page tables */ 368 u_long KPTphys; /* phys addr of kernel page tables */ 369 extern u_long tramp_idleptd; 370 371 static u_long 372 allocpages(u_int cnt, u_long *physfree) 373 { 374 u_long res; 375 376 res = *physfree; 377 *physfree += PAGE_SIZE * cnt; 378 bzero((void *)res, PAGE_SIZE * cnt); 379 return (res); 380 } 381 382 static void 383 pmap_cold_map(u_long pa, u_long va, u_long cnt) 384 { 385 pt_entry_t *pt; 386 387 for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0; 388 cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE) 389 *pt = pa | PG_V | PG_RW | PG_A | PG_M; 390 } 391 392 static void 393 pmap_cold_mapident(u_long pa, u_long cnt) 394 { 395 396 pmap_cold_map(pa, pa, cnt); 397 } 398 399 _Static_assert(2 * NBPDR == KERNBASE, "Broken double-map of zero PTD"); 400 401 /* 402 * Called from locore.s before paging is enabled. Sets up the first 403 * kernel page table. Since kernel is mapped with PA == VA, this code 404 * does not require relocations. 405 */ 406 void 407 pmap_cold(void) 408 { 409 pt_entry_t *pt; 410 u_long a; 411 u_int cr3, ncr4; 412 413 physfree = (u_long)&_end; 414 if (bootinfo.bi_esymtab != 0) 415 physfree = bootinfo.bi_esymtab; 416 if (bootinfo.bi_kernend != 0) 417 physfree = bootinfo.bi_kernend; 418 physfree = roundup2(physfree, NBPDR); 419 KERNend = physfree; 420 421 /* Allocate Kernel Page Tables */ 422 KPTphys = allocpages(NKPT, &physfree); 423 KPTmap = (pt_entry_t *)KPTphys; 424 425 /* Allocate Page Table Directory */ 426 #if defined(PAE) || defined(PAE_TABLES) 427 /* XXX only need 32 bytes (easier for now) */ 428 IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree); 429 #endif 430 IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree); 431 432 /* 433 * Allocate KSTACK. Leave a guard page between IdlePTD and 434 * proc0kstack, to control stack overflow for thread0 and 435 * prevent corruption of the page table. We leak the guard 436 * physical memory due to 1:1 mappings. 437 */ 438 allocpages(1, &physfree); 439 proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree); 440 441 /* vm86/bios stack */ 442 vm86phystk = allocpages(1, &physfree); 443 444 /* pgtable + ext + IOPAGES */ 445 vm86paddr = vm86pa = allocpages(3, &physfree); 446 447 /* Install page tables into PTD. 
Page table page 1 is wasted. */ 448 for (a = 0; a < NKPT; a++) 449 IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M; 450 451 #if defined(PAE) || defined(PAE_TABLES) 452 /* PAE install PTD pointers into PDPT */ 453 for (a = 0; a < NPGPTD; a++) 454 IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V; 455 #endif 456 457 /* 458 * Install recursive mapping for kernel page tables into 459 * itself. 460 */ 461 for (a = 0; a < NPGPTD; a++) 462 IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V | 463 PG_RW; 464 465 /* 466 * Initialize page table pages mapping physical address zero 467 * through the (physical) end of the kernel. Many of these 468 * pages must be reserved, and we reserve them all and map 469 * them linearly for convenience. We do this even if we've 470 * enabled PSE above; we'll just switch the corresponding 471 * kernel PDEs before we turn on paging. 472 * 473 * This and all other page table entries allow read and write 474 * access for various reasons. Kernel mappings never have any 475 * access restrictions. 476 */ 477 pmap_cold_mapident(0, atop(NBPDR)); 478 pmap_cold_map(0, NBPDR, atop(NBPDR)); 479 pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE)); 480 481 /* Map page table directory */ 482 #if defined(PAE) || defined(PAE_TABLES) 483 pmap_cold_mapident((u_long)IdlePDPT, 1); 484 #endif 485 pmap_cold_mapident((u_long)IdlePTD, NPGPTD); 486 487 /* Map early KPTmap. It is really pmap_cold_mapident. */ 488 pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT); 489 490 /* Map proc0kstack */ 491 pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES); 492 /* ISA hole already mapped */ 493 494 pmap_cold_mapident(vm86phystk, 1); 495 pmap_cold_mapident(vm86pa, 3); 496 497 /* Map page 0 into the vm86 page table */ 498 *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V; 499 500 /* ...likewise for the ISA hole for vm86 */ 501 for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0; 502 a < atop(ISA_HOLE_LENGTH); a++, pt++) 503 *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A | 504 PG_M | PG_V; 505 506 /* Enable PSE, PGE, VME, and PAE if configured. */ 507 ncr4 = 0; 508 if ((cpu_feature & CPUID_PSE) != 0) { 509 ncr4 |= CR4_PSE; 510 /* 511 * Superpage mapping of the kernel text. Existing 4k 512 * page table pages are wasted. 513 */ 514 for (a = KERNBASE; a < KERNend; a += NBPDR) 515 IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M | 516 PG_RW | PG_V; 517 } 518 if ((cpu_feature & CPUID_PGE) != 0) { 519 ncr4 |= CR4_PGE; 520 pgeflag = PG_G; 521 } 522 ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0; 523 #if defined(PAE) || defined(PAE_TABLES) 524 ncr4 |= CR4_PAE; 525 #endif 526 if (ncr4 != 0) 527 load_cr4(rcr4() | ncr4); 528 529 /* Now enable paging */ 530 #if defined(PAE) || defined(PAE_TABLES) 531 cr3 = (u_int)IdlePDPT; 532 #else 533 cr3 = (u_int)IdlePTD; 534 #endif 535 tramp_idleptd = cr3; 536 load_cr3(cr3); 537 load_cr0(rcr0() | CR0_PG); 538 539 /* 540 * Now running relocated at KERNBASE where the system is 541 * linked to run. 542 */ 543 544 /* 545 * Remove the lowest part of the double mapping of low memory 546 * to get some null pointer checks. 547 */ 548 IdlePTD[0] = 0; 549 load_cr3(cr3); /* invalidate TLB */ 550 } 551 552 /* 553 * Bootstrap the system enough to run with virtual memory. 554 * 555 * On the i386 this is called after mapping has already been enabled 556 * in locore.s with the page table created in pmap_cold(), 557 * and just syncs the pmap module with what has already been done. 
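 * Besides initializing the kernel pmap and the global pv list lock, this
 * reserves the special per-CPU mapping windows (the CMAP entries and the
 * quick-map address), the crashdump and /dev/mem windows, the enlarged
 * KPTmap, and the PMAP1/PMAP2/PMAP3 slots via the SYSMAP() macro, and
 * finally programs the PAT.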
558 */ 559 void 560 pmap_bootstrap(vm_paddr_t firstaddr) 561 { 562 vm_offset_t va; 563 pt_entry_t *pte, *unused; 564 struct pcpu *pc; 565 int i; 566 567 /* 568 * Add a physical memory segment (vm_phys_seg) corresponding to the 569 * preallocated kernel page table pages so that vm_page structures 570 * representing these pages will be created. The vm_page structures 571 * are required for promotion of the corresponding kernel virtual 572 * addresses to superpage mappings. 573 */ 574 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 575 576 /* 577 * Initialize the first available kernel virtual address. However, 578 * using "firstaddr" may waste a few pages of the kernel virtual 579 * address space, because locore may not have mapped every physical 580 * page that it allocated. Preferably, locore would provide a first 581 * unused virtual address in addition to "firstaddr". 582 */ 583 virtual_avail = (vm_offset_t)firstaddr; 584 585 virtual_end = VM_MAX_KERNEL_ADDRESS; 586 587 /* 588 * Initialize the kernel pmap (which is statically allocated). 589 */ 590 PMAP_LOCK_INIT(kernel_pmap); 591 kernel_pmap->pm_pdir = IdlePTD; 592 #if defined(PAE) || defined(PAE_TABLES) 593 kernel_pmap->pm_pdpt = IdlePDPT; 594 #endif 595 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 596 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 597 598 /* 599 * Initialize the global pv list lock. 600 */ 601 rw_init(&pvh_global_lock, "pmap pv global"); 602 603 /* 604 * Reserve some special page table entries/VA space for temporary 605 * mapping of pages. 606 */ 607 #define SYSMAP(c, p, v, n) \ 608 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 609 610 va = virtual_avail; 611 pte = vtopte(va); 612 613 614 /* 615 * Initialize temporary map objects on the current CPU for use 616 * during early boot. 617 * CMAP1/CMAP2 are used for zeroing and copying pages. 618 * CMAP3 is used for the boot-time memory test. 619 */ 620 pc = get_pcpu(); 621 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 622 SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) 623 SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) 624 SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) 625 626 SYSMAP(caddr_t, CMAP3, CADDR3, 1); 627 628 /* 629 * Crashdump maps. 630 */ 631 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 632 633 /* 634 * ptvmmap is used for reading arbitrary physical pages via /dev/mem. 635 */ 636 SYSMAP(caddr_t, unused, ptvmmap, 1) 637 638 /* 639 * msgbufp is used to map the system message buffer. 640 */ 641 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) 642 643 /* 644 * KPTmap is used by pmap_kextract(). 645 * 646 * KPTmap is first initialized by locore. However, that initial 647 * KPTmap can only support NKPT page table pages. Here, a larger 648 * KPTmap is created that can support KVA_PAGES page table pages. 649 */ 650 SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) 651 652 for (i = 0; i < NKPT; i++) 653 KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; 654 655 /* 656 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), 657 * respectively. 658 */ 659 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) 660 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) 661 SYSMAP(pt_entry_t *, PMAP3, PADDR3, 1) 662 663 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 664 665 virtual_avail = va; 666 667 /* 668 * Initialize the PAT MSR if present. 669 * pmap_init_pat() clears and sets CR4_PGE, which, as a 670 * side-effect, invalidates stale PG_G TLB entries that might 671 * have been created in our pre-boot environment. 
We assume 672 * that PAT support implies PGE and in reverse, PGE presence 673 * comes with PAT. Both features were added for Pentium Pro. 674 */ 675 pmap_init_pat(); 676 } 677 678 static void 679 pmap_init_reserved_pages(void) 680 { 681 struct pcpu *pc; 682 vm_offset_t pages; 683 int i; 684 685 CPU_FOREACH(i) { 686 pc = pcpu_find(i); 687 mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF | 688 MTX_NEW); 689 pc->pc_copyout_maddr = kva_alloc(ptoa(2)); 690 if (pc->pc_copyout_maddr == 0) 691 panic("unable to allocate non-sleepable copyout KVA"); 692 sx_init(&pc->pc_copyout_slock, "cpslk"); 693 pc->pc_copyout_saddr = kva_alloc(ptoa(2)); 694 if (pc->pc_copyout_saddr == 0) 695 panic("unable to allocate sleepable copyout KVA"); 696 pc->pc_pmap_eh_va = kva_alloc(ptoa(1)); 697 if (pc->pc_pmap_eh_va == 0) 698 panic("unable to allocate pmap_extract_and_hold KVA"); 699 pc->pc_pmap_eh_ptep = (char *)vtopte(pc->pc_pmap_eh_va); 700 701 /* 702 * Skip if the mappings have already been initialized, 703 * i.e. this is the BSP. 704 */ 705 if (pc->pc_cmap_addr1 != 0) 706 continue; 707 708 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 709 pages = kva_alloc(PAGE_SIZE * 3); 710 if (pages == 0) 711 panic("unable to allocate CMAP KVA"); 712 pc->pc_cmap_pte1 = vtopte(pages); 713 pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); 714 pc->pc_cmap_addr1 = (caddr_t)pages; 715 pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); 716 pc->pc_qmap_addr = pages + atop(2); 717 } 718 } 719 720 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 721 722 /* 723 * Setup the PAT MSR. 724 */ 725 void 726 pmap_init_pat(void) 727 { 728 int pat_table[PAT_INDEX_SIZE]; 729 uint64_t pat_msr; 730 u_long cr0, cr4; 731 int i; 732 733 /* Set default PAT index table. */ 734 for (i = 0; i < PAT_INDEX_SIZE; i++) 735 pat_table[i] = -1; 736 pat_table[PAT_WRITE_BACK] = 0; 737 pat_table[PAT_WRITE_THROUGH] = 1; 738 pat_table[PAT_UNCACHEABLE] = 3; 739 pat_table[PAT_WRITE_COMBINING] = 3; 740 pat_table[PAT_WRITE_PROTECTED] = 3; 741 pat_table[PAT_UNCACHED] = 3; 742 743 /* 744 * Bail if this CPU doesn't implement PAT. 745 * We assume that PAT support implies PGE. 746 */ 747 if ((cpu_feature & CPUID_PAT) == 0) { 748 for (i = 0; i < PAT_INDEX_SIZE; i++) 749 pat_index[i] = pat_table[i]; 750 pat_works = 0; 751 return; 752 } 753 754 /* 755 * Due to some Intel errata, we can only safely use the lower 4 756 * PAT entries. 757 * 758 * Intel Pentium III Processor Specification Update 759 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B 760 * or Mode C Paging) 761 * 762 * Intel Pentium IV Processor Specification Update 763 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) 764 */ 765 if (cpu_vendor_id == CPU_VENDOR_INTEL && 766 !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) 767 pat_works = 0; 768 769 /* Initialize default PAT entries. */ 770 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 771 PAT_VALUE(1, PAT_WRITE_THROUGH) | 772 PAT_VALUE(2, PAT_UNCACHED) | 773 PAT_VALUE(3, PAT_UNCACHEABLE) | 774 PAT_VALUE(4, PAT_WRITE_BACK) | 775 PAT_VALUE(5, PAT_WRITE_THROUGH) | 776 PAT_VALUE(6, PAT_UNCACHED) | 777 PAT_VALUE(7, PAT_UNCACHEABLE); 778 779 if (pat_works) { 780 /* 781 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 782 * Program 5 and 6 as WP and WC. 783 * Leave 4 and 7 as WB and UC. 
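 *
 * The resulting layout is:
 *
 *	PAT index	memory type	selected by pat_index[]
 *	0		WB		PAT_WRITE_BACK
 *	1		WT		PAT_WRITE_THROUGH
 *	2		UC-		PAT_UNCACHED
 *	3		UC		PAT_UNCACHEABLE
 *	4		WB		(unused)
 *	5		WP		PAT_WRITE_PROTECTED
 *	6		WC		PAT_WRITE_COMBINING
 *	7		UC		(unused)
 *
 * pmap_cache_bits() later encodes the chosen 3-bit index into a PTE or
 * PDE; for example, in a PTE, index 6 (WC) becomes PG_PTE_PAT | PG_NC_PCD
 * with PG_NC_PWT clear.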
784 */ 785 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 786 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 787 PAT_VALUE(6, PAT_WRITE_COMBINING); 788 pat_table[PAT_UNCACHED] = 2; 789 pat_table[PAT_WRITE_PROTECTED] = 5; 790 pat_table[PAT_WRITE_COMBINING] = 6; 791 } else { 792 /* 793 * Just replace PAT Index 2 with WC instead of UC-. 794 */ 795 pat_msr &= ~PAT_MASK(2); 796 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 797 pat_table[PAT_WRITE_COMBINING] = 2; 798 } 799 800 /* Disable PGE. */ 801 cr4 = rcr4(); 802 load_cr4(cr4 & ~CR4_PGE); 803 804 /* Disable caches (CD = 1, NW = 0). */ 805 cr0 = rcr0(); 806 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 807 808 /* Flushes caches and TLBs. */ 809 wbinvd(); 810 invltlb(); 811 812 /* Update PAT and index table. */ 813 wrmsr(MSR_PAT, pat_msr); 814 for (i = 0; i < PAT_INDEX_SIZE; i++) 815 pat_index[i] = pat_table[i]; 816 817 /* Flush caches and TLBs again. */ 818 wbinvd(); 819 invltlb(); 820 821 /* Restore caches and PGE. */ 822 load_cr0(cr0); 823 load_cr4(cr4); 824 } 825 826 /* 827 * Initialize a vm_page's machine-dependent fields. 828 */ 829 void 830 pmap_page_init(vm_page_t m) 831 { 832 833 TAILQ_INIT(&m->md.pv_list); 834 m->md.pat_mode = PAT_WRITE_BACK; 835 } 836 837 #if defined(PAE) || defined(PAE_TABLES) 838 static void * 839 pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, 840 int wait) 841 { 842 843 /* Inform UMA that this allocator uses kernel_map/object. */ 844 *flags = UMA_SLAB_KERNEL; 845 return ((void *)kmem_alloc_contig_domain(domain, bytes, wait, 0x0ULL, 846 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); 847 } 848 #endif 849 850 /* 851 * Abuse the pte nodes for unmapped kva to thread a kva freelist through. 852 * Requirements: 853 * - Must deal with pages in order to ensure that none of the PG_* bits 854 * are ever set, PG_V in particular. 855 * - Assumes we can write to ptes without pte_store() atomic ops, even 856 * on PAE systems. This should be ok. 857 * - Assumes nothing will ever test these addresses for 0 to indicate 858 * no mapping instead of correctly checking PG_V. 859 * - Assumes a vm_offset_t will fit in a pte (true for i386). 860 * Because PG_V is never set, there can be no mappings to invalidate. 861 */ 862 static vm_offset_t 863 pmap_ptelist_alloc(vm_offset_t *head) 864 { 865 pt_entry_t *pte; 866 vm_offset_t va; 867 868 va = *head; 869 if (va == 0) 870 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 871 pte = vtopte(va); 872 *head = *pte; 873 if (*head & PG_V) 874 panic("pmap_ptelist_alloc: va with PG_V set!"); 875 *pte = 0; 876 return (va); 877 } 878 879 static void 880 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) 881 { 882 pt_entry_t *pte; 883 884 if (va & PG_V) 885 panic("pmap_ptelist_free: freeing va with PG_V set!"); 886 pte = vtopte(va); 887 *pte = *head; /* virtual! PG_V is 0 though */ 888 *head = va; 889 } 890 891 static void 892 pmap_ptelist_init(vm_offset_t *head, void *base, int npages) 893 { 894 int i; 895 vm_offset_t va; 896 897 *head = 0; 898 for (i = npages - 1; i >= 0; i--) { 899 va = (vm_offset_t)base + i * PAGE_SIZE; 900 pmap_ptelist_free(head, va); 901 } 902 } 903 904 905 /* 906 * Initialize the pmap module. 907 * Called by vm_init, to initialize any structures that the pmap 908 * system needs to map virtual memory. 909 */ 910 void 911 pmap_init(void) 912 { 913 struct pmap_preinit_mapping *ppim; 914 vm_page_t mpte; 915 vm_size_t s; 916 int i, pv_npg; 917 918 /* 919 * Initialize the vm page array entries for the kernel pmap's 920 * page table pages. 
921 */ 922 for (i = 0; i < NKPT; i++) { 923 mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i)); 924 KASSERT(mpte >= vm_page_array && 925 mpte < &vm_page_array[vm_page_array_size], 926 ("pmap_init: page table page is out of range")); 927 mpte->pindex = i + KPTDI; 928 mpte->phys_addr = KPTphys + ptoa(i); 929 } 930 931 /* 932 * Initialize the address space (zone) for the pv entries. Set a 933 * high water mark so that the system can recover from excessive 934 * numbers of pv entries. 935 */ 936 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 937 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 938 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 939 pv_entry_max = roundup(pv_entry_max, _NPCPV); 940 pv_entry_high_water = 9 * (pv_entry_max / 10); 941 942 /* 943 * If the kernel is running on a virtual machine, then it must assume 944 * that MCA is enabled by the hypervisor. Moreover, the kernel must 945 * be prepared for the hypervisor changing the vendor and family that 946 * are reported by CPUID. Consequently, the workaround for AMD Family 947 * 10h Erratum 383 is enabled if the processor's feature set does not 948 * include at least one feature that is only supported by older Intel 949 * or newer AMD processors. 950 */ 951 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 952 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 953 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 954 AMDID2_FMA4)) == 0) 955 workaround_erratum383 = 1; 956 957 /* 958 * Are large page mappings supported and enabled? 959 */ 960 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 961 if (pseflag == 0) 962 pg_ps_enabled = 0; 963 else if (pg_ps_enabled) { 964 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 965 ("pmap_init: can't assign to pagesizes[1]")); 966 pagesizes[1] = NBPDR; 967 } 968 969 /* 970 * Calculate the size of the pv head table for superpages. 971 * Handle the possibility that "vm_phys_segs[...].end" is zero. 972 */ 973 pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - 974 PAGE_SIZE) / NBPDR + 1; 975 976 /* 977 * Allocate memory for the pv head table for superpages. 
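 * One pv list head (struct md_page) is allocated per 2/4MB physical
 * superpage frame; pv_npg was computed above from the end of the last
 * vm_phys segment, and pa_to_pvh() indexes this table by pa >> PDRSHIFT.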
978 */ 979 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 980 s = round_page(s); 981 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, 982 M_WAITOK | M_ZERO); 983 for (i = 0; i < pv_npg; i++) 984 TAILQ_INIT(&pv_table[i].pv_list); 985 986 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 987 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 988 if (pv_chunkbase == NULL) 989 panic("pmap_init: not enough kvm for pv chunks"); 990 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 991 #if defined(PAE) || defined(PAE_TABLES) 992 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, 993 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 994 UMA_ZONE_VM | UMA_ZONE_NOFREE); 995 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); 996 #endif 997 998 pmap_initialized = 1; 999 pmap_init_trm(); 1000 1001 if (!bootverbose) 1002 return; 1003 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 1004 ppim = pmap_preinit_mapping + i; 1005 if (ppim->va == 0) 1006 continue; 1007 printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, 1008 (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); 1009 } 1010 1011 } 1012 1013 1014 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 1015 "Max number of PV entries"); 1016 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 1017 "Page share factor per proc"); 1018 1019 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 1020 "2/4MB page mapping counters"); 1021 1022 static u_long pmap_pde_demotions; 1023 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 1024 &pmap_pde_demotions, 0, "2/4MB page demotions"); 1025 1026 static u_long pmap_pde_mappings; 1027 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 1028 &pmap_pde_mappings, 0, "2/4MB page mappings"); 1029 1030 static u_long pmap_pde_p_failures; 1031 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 1032 &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); 1033 1034 static u_long pmap_pde_promotions; 1035 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 1036 &pmap_pde_promotions, 0, "2/4MB page promotions"); 1037 1038 /*************************************************** 1039 * Low level helper routines..... 1040 ***************************************************/ 1041 1042 /* 1043 * Determine the appropriate bits to set in a PTE or PDE for a specified 1044 * caching mode. 1045 */ 1046 int 1047 pmap_cache_bits(int mode, boolean_t is_pde) 1048 { 1049 int cache_bits, pat_flag, pat_idx; 1050 1051 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) 1052 panic("Unknown caching mode %d\n", mode); 1053 1054 /* The PAT bit is different for PTE's and PDE's. */ 1055 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; 1056 1057 /* Map the caching mode to a PAT index. */ 1058 pat_idx = pat_index[mode]; 1059 1060 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 1061 cache_bits = 0; 1062 if (pat_idx & 0x4) 1063 cache_bits |= pat_flag; 1064 if (pat_idx & 0x2) 1065 cache_bits |= PG_NC_PCD; 1066 if (pat_idx & 0x1) 1067 cache_bits |= PG_NC_PWT; 1068 return (cache_bits); 1069 } 1070 1071 /* 1072 * The caller is responsible for maintaining TLB consistency. 
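 * That is, after a valid PDE is overwritten, the caller must itself
 * invalidate any stale translations for the 2/4MB region that the PDE
 * maps, e.g. with pmap_invalidate_range().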
 */
static void
pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
{
	pd_entry_t *pde;

	pde = pmap_pde(kernel_pmap, va);
	pde_store(pde, newpde);
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else /* if ((newpde & PG_G) == 0) */
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
}

void
invltlb_glob(void)
{

	invltlb();
}

#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed.  (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
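 *
 * Each of the functions below runs with the calling thread pinned
 * (sched_pin()) so that the CPU id and the target CPU mask it computes
 * remain valid until the corresponding IPIs have been sent.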
1132 */ 1133 void 1134 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1135 { 1136 cpuset_t *mask, other_cpus; 1137 u_int cpuid; 1138 1139 sched_pin(); 1140 if (pmap == kernel_pmap) { 1141 invlpg(va); 1142 mask = &all_cpus; 1143 } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { 1144 mask = &all_cpus; 1145 } else { 1146 cpuid = PCPU_GET(cpuid); 1147 other_cpus = all_cpus; 1148 CPU_CLR(cpuid, &other_cpus); 1149 CPU_AND(&other_cpus, &pmap->pm_active); 1150 mask = &other_cpus; 1151 } 1152 smp_masked_invlpg(*mask, va, pmap); 1153 sched_unpin(); 1154 } 1155 1156 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 1157 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 1158 1159 void 1160 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1161 { 1162 cpuset_t *mask, other_cpus; 1163 vm_offset_t addr; 1164 u_int cpuid; 1165 1166 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 1167 pmap_invalidate_all(pmap); 1168 return; 1169 } 1170 1171 sched_pin(); 1172 if (pmap == kernel_pmap) { 1173 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1174 invlpg(addr); 1175 mask = &all_cpus; 1176 } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { 1177 mask = &all_cpus; 1178 } else { 1179 cpuid = PCPU_GET(cpuid); 1180 other_cpus = all_cpus; 1181 CPU_CLR(cpuid, &other_cpus); 1182 CPU_AND(&other_cpus, &pmap->pm_active); 1183 mask = &other_cpus; 1184 } 1185 smp_masked_invlpg_range(*mask, sva, eva, pmap); 1186 sched_unpin(); 1187 } 1188 1189 void 1190 pmap_invalidate_all(pmap_t pmap) 1191 { 1192 cpuset_t *mask, other_cpus; 1193 u_int cpuid; 1194 1195 sched_pin(); 1196 if (pmap == kernel_pmap) { 1197 invltlb(); 1198 mask = &all_cpus; 1199 } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { 1200 mask = &all_cpus; 1201 } else { 1202 cpuid = PCPU_GET(cpuid); 1203 other_cpus = all_cpus; 1204 CPU_CLR(cpuid, &other_cpus); 1205 CPU_AND(&other_cpus, &pmap->pm_active); 1206 mask = &other_cpus; 1207 } 1208 smp_masked_invltlb(*mask, pmap); 1209 sched_unpin(); 1210 } 1211 1212 void 1213 pmap_invalidate_cache(void) 1214 { 1215 1216 sched_pin(); 1217 wbinvd(); 1218 smp_cache_flush(); 1219 sched_unpin(); 1220 } 1221 1222 struct pde_action { 1223 cpuset_t invalidate; /* processors that invalidate their TLB */ 1224 vm_offset_t va; 1225 pd_entry_t *pde; 1226 pd_entry_t newpde; 1227 u_int store; /* processor that updates the PDE */ 1228 }; 1229 1230 static void 1231 pmap_update_pde_kernel(void *arg) 1232 { 1233 struct pde_action *act = arg; 1234 pd_entry_t *pde; 1235 1236 if (act->store == PCPU_GET(cpuid)) { 1237 pde = pmap_pde(kernel_pmap, act->va); 1238 pde_store(pde, act->newpde); 1239 } 1240 } 1241 1242 static void 1243 pmap_update_pde_user(void *arg) 1244 { 1245 struct pde_action *act = arg; 1246 1247 if (act->store == PCPU_GET(cpuid)) 1248 pde_store(act->pde, act->newpde); 1249 } 1250 1251 static void 1252 pmap_update_pde_teardown(void *arg) 1253 { 1254 struct pde_action *act = arg; 1255 1256 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 1257 pmap_update_pde_invalidate(act->va, act->newpde); 1258 } 1259 1260 /* 1261 * Change the page size for the specified virtual address in a way that 1262 * prevents any possibility of the TLB ever having two entries that map the 1263 * same virtual address using different page sizes. This is the recommended 1264 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1265 * machine check exception for a TLB state that is improperly diagnosed as a 1266 * hardware error. 
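 *
 * When any other CPU may be using the mapping, the update is performed
 * via smp_rendezvous_cpus(): a single CPU (the one recorded in
 * act.store) writes the new PDE while every CPU that might hold the old
 * translation invalidates its TLB in the teardown action.  Otherwise the
 * PDE is simply rewritten and only the local TLB is invalidated.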
1267 */ 1268 static void 1269 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1270 { 1271 struct pde_action act; 1272 cpuset_t active, other_cpus; 1273 u_int cpuid; 1274 1275 sched_pin(); 1276 cpuid = PCPU_GET(cpuid); 1277 other_cpus = all_cpus; 1278 CPU_CLR(cpuid, &other_cpus); 1279 if (pmap == kernel_pmap) 1280 active = all_cpus; 1281 else 1282 active = pmap->pm_active; 1283 if (CPU_OVERLAP(&active, &other_cpus)) { 1284 act.store = cpuid; 1285 act.invalidate = active; 1286 act.va = va; 1287 act.pde = pde; 1288 act.newpde = newpde; 1289 CPU_SET(cpuid, &active); 1290 smp_rendezvous_cpus(active, 1291 smp_no_rendezvous_barrier, pmap == kernel_pmap ? 1292 pmap_update_pde_kernel : pmap_update_pde_user, 1293 pmap_update_pde_teardown, &act); 1294 } else { 1295 if (pmap == kernel_pmap) 1296 pmap_kenter_pde(va, newpde); 1297 else 1298 pde_store(pde, newpde); 1299 if (CPU_ISSET(cpuid, &active)) 1300 pmap_update_pde_invalidate(va, newpde); 1301 } 1302 sched_unpin(); 1303 } 1304 #else /* !SMP */ 1305 /* 1306 * Normal, non-SMP, 486+ invalidation functions. 1307 * We inline these within pmap.c for speed. 1308 */ 1309 PMAP_INLINE void 1310 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1311 { 1312 1313 if (pmap == kernel_pmap) 1314 invlpg(va); 1315 } 1316 1317 PMAP_INLINE void 1318 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1319 { 1320 vm_offset_t addr; 1321 1322 if (pmap == kernel_pmap) 1323 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1324 invlpg(addr); 1325 } 1326 1327 PMAP_INLINE void 1328 pmap_invalidate_all(pmap_t pmap) 1329 { 1330 1331 if (pmap == kernel_pmap) 1332 invltlb(); 1333 } 1334 1335 PMAP_INLINE void 1336 pmap_invalidate_cache(void) 1337 { 1338 1339 wbinvd(); 1340 } 1341 1342 static void 1343 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1344 { 1345 1346 if (pmap == kernel_pmap) 1347 pmap_kenter_pde(va, newpde); 1348 else 1349 pde_store(pde, newpde); 1350 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1351 pmap_update_pde_invalidate(va, newpde); 1352 } 1353 #endif /* !SMP */ 1354 1355 static void 1356 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 1357 { 1358 1359 /* 1360 * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was 1361 * created by a promotion that did not invalidate the 512 or 1024 4KB 1362 * page mappings that might exist in the TLB. Consequently, at this 1363 * point, the TLB may hold both 4KB and 2- or 4MB page mappings for 1364 * the address range [va, va + NBPDR). Therefore, the entire range 1365 * must be invalidated here. In contrast, when PG_PROMOTED is clear, 1366 * the TLB will not hold any 4KB page mappings for the address range 1367 * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the 1368 * 2- or 4MB page mapping from the TLB. 
1369 */ 1370 if ((pde & PG_PROMOTED) != 0) 1371 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 1372 else 1373 pmap_invalidate_page(pmap, va); 1374 } 1375 1376 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1377 1378 void 1379 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) 1380 { 1381 1382 if (force) { 1383 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 1384 } else { 1385 KASSERT((sva & PAGE_MASK) == 0, 1386 ("pmap_invalidate_cache_range: sva not page-aligned")); 1387 KASSERT((eva & PAGE_MASK) == 0, 1388 ("pmap_invalidate_cache_range: eva not page-aligned")); 1389 } 1390 1391 if ((cpu_feature & CPUID_SS) != 0 && !force) 1392 ; /* If "Self Snoop" is supported and allowed, do nothing. */ 1393 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && 1394 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1395 #ifdef DEV_APIC 1396 /* 1397 * XXX: Some CPUs fault, hang, or trash the local APIC 1398 * registers if we use CLFLUSH on the local APIC 1399 * range. The local APIC is always uncached, so we 1400 * don't need to flush for that range anyway. 1401 */ 1402 if (pmap_kextract(sva) == lapic_paddr) 1403 return; 1404 #endif 1405 /* 1406 * Otherwise, do per-cache line flush. Use the sfence 1407 * instruction to insure that previous stores are 1408 * included in the write-back. The processor 1409 * propagates flush to other processors in the cache 1410 * coherence domain. 1411 */ 1412 sfence(); 1413 for (; sva < eva; sva += cpu_clflush_line_size) 1414 clflushopt(sva); 1415 sfence(); 1416 } else if ((cpu_feature & CPUID_CLFSH) != 0 && 1417 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1418 #ifdef DEV_APIC 1419 if (pmap_kextract(sva) == lapic_paddr) 1420 return; 1421 #endif 1422 /* 1423 * Writes are ordered by CLFLUSH on Intel CPUs. 1424 */ 1425 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1426 mfence(); 1427 for (; sva < eva; sva += cpu_clflush_line_size) 1428 clflush(sva); 1429 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1430 mfence(); 1431 } else { 1432 1433 /* 1434 * No targeted cache flush methods are supported by CPU, 1435 * or the supplied range is bigger than 2MB. 1436 * Globally invalidate cache. 1437 */ 1438 pmap_invalidate_cache(); 1439 } 1440 } 1441 1442 void 1443 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1444 { 1445 int i; 1446 1447 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1448 (cpu_feature & CPUID_CLFSH) == 0) { 1449 pmap_invalidate_cache(); 1450 } else { 1451 for (i = 0; i < count; i++) 1452 pmap_flush_page(pages[i]); 1453 } 1454 } 1455 1456 /* 1457 * Are we current address space or kernel? 1458 */ 1459 static __inline int 1460 pmap_is_current(pmap_t pmap) 1461 { 1462 1463 return (pmap == kernel_pmap); 1464 } 1465 1466 /* 1467 * If the given pmap is not the current or kernel pmap, the returned pte must 1468 * be released by passing it to pmap_pte_release(). 1469 */ 1470 pt_entry_t * 1471 pmap_pte(pmap_t pmap, vm_offset_t va) 1472 { 1473 pd_entry_t newpf; 1474 pd_entry_t *pde; 1475 1476 pde = pmap_pde(pmap, va); 1477 if (*pde & PG_PS) 1478 return (pde); 1479 if (*pde != 0) { 1480 /* are we current address space or kernel? 
*/ 1481 if (pmap_is_current(pmap)) 1482 return (vtopte(va)); 1483 mtx_lock(&PMAP2mutex); 1484 newpf = *pde & PG_FRAME; 1485 if ((*PMAP2 & PG_FRAME) != newpf) { 1486 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; 1487 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 1488 } 1489 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); 1490 } 1491 return (NULL); 1492 } 1493 1494 /* 1495 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte 1496 * being NULL. 1497 */ 1498 static __inline void 1499 pmap_pte_release(pt_entry_t *pte) 1500 { 1501 1502 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) 1503 mtx_unlock(&PMAP2mutex); 1504 } 1505 1506 /* 1507 * NB: The sequence of updating a page table followed by accesses to the 1508 * corresponding pages is subject to the situation described in the "AMD64 1509 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, 1510 * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG 1511 * right after modifying the PTE bits is crucial. 1512 */ 1513 static __inline void 1514 invlcaddr(void *caddr) 1515 { 1516 1517 invlpg((u_int)caddr); 1518 } 1519 1520 /* 1521 * Super fast pmap_pte routine best used when scanning 1522 * the pv lists. This eliminates many coarse-grained 1523 * invltlb calls. Note that many of the pv list 1524 * scans are across different pmaps. It is very wasteful 1525 * to do an entire invltlb for checking a single mapping. 1526 * 1527 * If the given pmap is not the current pmap, pvh_global_lock 1528 * must be held and curthread pinned to a CPU. 1529 */ 1530 static pt_entry_t * 1531 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 1532 { 1533 pd_entry_t newpf; 1534 pd_entry_t *pde; 1535 1536 pde = pmap_pde(pmap, va); 1537 if (*pde & PG_PS) 1538 return (pde); 1539 if (*pde != 0) { 1540 /* are we current address space or kernel? 
*/ 1541 if (pmap_is_current(pmap)) 1542 return (vtopte(va)); 1543 rw_assert(&pvh_global_lock, RA_WLOCKED); 1544 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 1545 newpf = *pde & PG_FRAME; 1546 if ((*PMAP1 & PG_FRAME) != newpf) { 1547 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; 1548 #ifdef SMP 1549 PMAP1cpu = PCPU_GET(cpuid); 1550 #endif 1551 invlcaddr(PADDR1); 1552 PMAP1changed++; 1553 } else 1554 #ifdef SMP 1555 if (PMAP1cpu != PCPU_GET(cpuid)) { 1556 PMAP1cpu = PCPU_GET(cpuid); 1557 invlcaddr(PADDR1); 1558 PMAP1changedcpu++; 1559 } else 1560 #endif 1561 PMAP1unchanged++; 1562 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); 1563 } 1564 return (0); 1565 } 1566 1567 static pt_entry_t * 1568 pmap_pte_quick3(pmap_t pmap, vm_offset_t va) 1569 { 1570 pd_entry_t newpf; 1571 pd_entry_t *pde; 1572 1573 pde = pmap_pde(pmap, va); 1574 if (*pde & PG_PS) 1575 return (pde); 1576 if (*pde != 0) { 1577 rw_assert(&pvh_global_lock, RA_WLOCKED); 1578 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 1579 newpf = *pde & PG_FRAME; 1580 if ((*PMAP3 & PG_FRAME) != newpf) { 1581 *PMAP3 = newpf | PG_RW | PG_V | PG_A | PG_M; 1582 #ifdef SMP 1583 PMAP3cpu = PCPU_GET(cpuid); 1584 #endif 1585 invlcaddr(PADDR3); 1586 PMAP1changed++; 1587 } else 1588 #ifdef SMP 1589 if (PMAP3cpu != PCPU_GET(cpuid)) { 1590 PMAP3cpu = PCPU_GET(cpuid); 1591 invlcaddr(PADDR3); 1592 PMAP1changedcpu++; 1593 } else 1594 #endif 1595 PMAP1unchanged++; 1596 return (PADDR3 + (i386_btop(va) & (NPTEPG - 1))); 1597 } 1598 return (0); 1599 } 1600 1601 static pt_entry_t 1602 pmap_pte_ufast(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 1603 { 1604 pt_entry_t *eh_ptep, pte, *ptep; 1605 1606 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1607 pde &= PG_FRAME; 1608 critical_enter(); 1609 eh_ptep = (pt_entry_t *)PCPU_GET(pmap_eh_ptep); 1610 if ((*eh_ptep & PG_FRAME) != pde) { 1611 *eh_ptep = pde | PG_RW | PG_V | PG_A | PG_M; 1612 invlcaddr((void *)PCPU_GET(pmap_eh_va)); 1613 } 1614 ptep = (pt_entry_t *)PCPU_GET(pmap_eh_va) + (i386_btop(va) & 1615 (NPTEPG - 1)); 1616 pte = *ptep; 1617 critical_exit(); 1618 return (pte); 1619 } 1620 1621 /* 1622 * Routine: pmap_extract 1623 * Function: 1624 * Extract the physical page address associated 1625 * with the given map/virtual_address pair. 1626 */ 1627 vm_paddr_t 1628 pmap_extract(pmap_t pmap, vm_offset_t va) 1629 { 1630 vm_paddr_t rtval; 1631 pt_entry_t pte; 1632 pd_entry_t pde; 1633 1634 rtval = 0; 1635 PMAP_LOCK(pmap); 1636 pde = pmap->pm_pdir[va >> PDRSHIFT]; 1637 if (pde != 0) { 1638 if ((pde & PG_PS) != 0) 1639 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 1640 else { 1641 pte = pmap_pte_ufast(pmap, va, pde); 1642 rtval = (pte & PG_FRAME) | (va & PAGE_MASK); 1643 } 1644 } 1645 PMAP_UNLOCK(pmap); 1646 return (rtval); 1647 } 1648 1649 /* 1650 * Routine: pmap_extract_and_hold 1651 * Function: 1652 * Atomically extract and hold the physical page 1653 * with the given pmap and virtual address pair 1654 * if that mapping permits the given protection. 
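 *
 * As in pmap_extract(), a PG_PS mapping yields
 * pa = (pde & PG_PS_FRAME) | (va & PDRMASK), while a 4KB mapping takes
 * the frame from the PTE.  The returned page, if any, is held with
 * vm_page_hold() so that it cannot be freed before the caller is done
 * with it.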
1655 */ 1656 vm_page_t 1657 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1658 { 1659 pd_entry_t pde; 1660 pt_entry_t pte; 1661 vm_page_t m; 1662 vm_paddr_t pa; 1663 1664 pa = 0; 1665 m = NULL; 1666 PMAP_LOCK(pmap); 1667 retry: 1668 pde = *pmap_pde(pmap, va); 1669 if (pde != 0) { 1670 if (pde & PG_PS) { 1671 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1672 if (vm_page_pa_tryrelock(pmap, (pde & 1673 PG_PS_FRAME) | (va & PDRMASK), &pa)) 1674 goto retry; 1675 m = PHYS_TO_VM_PAGE(pa); 1676 } 1677 } else { 1678 pte = pmap_pte_ufast(pmap, va, pde); 1679 if (pte != 0 && 1680 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1681 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 1682 &pa)) 1683 goto retry; 1684 m = PHYS_TO_VM_PAGE(pa); 1685 } 1686 } 1687 if (m != NULL) 1688 vm_page_hold(m); 1689 } 1690 PA_UNLOCK_COND(pa); 1691 PMAP_UNLOCK(pmap); 1692 return (m); 1693 } 1694 1695 /*************************************************** 1696 * Low level mapping routines..... 1697 ***************************************************/ 1698 1699 /* 1700 * Add a wired page to the kva. 1701 * Note: not SMP coherent. 1702 * 1703 * This function may be used before pmap_bootstrap() is called. 1704 */ 1705 PMAP_INLINE void 1706 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1707 { 1708 pt_entry_t *pte; 1709 1710 pte = vtopte(va); 1711 pte_store(pte, pa | PG_RW | PG_V); 1712 } 1713 1714 static __inline void 1715 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1716 { 1717 pt_entry_t *pte; 1718 1719 pte = vtopte(va); 1720 pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(mode, 0)); 1721 } 1722 1723 /* 1724 * Remove a page from the kernel pagetables. 1725 * Note: not SMP coherent. 1726 * 1727 * This function may be used before pmap_bootstrap() is called. 1728 */ 1729 PMAP_INLINE void 1730 pmap_kremove(vm_offset_t va) 1731 { 1732 pt_entry_t *pte; 1733 1734 pte = vtopte(va); 1735 pte_clear(pte); 1736 } 1737 1738 /* 1739 * Used to map a range of physical addresses into kernel 1740 * virtual address space. 1741 * 1742 * The value passed in '*virt' is a suggested virtual address for 1743 * the mapping. Architectures which can support a direct-mapped 1744 * physical to virtual region can return the appropriate address 1745 * within that region, leaving '*virt' unchanged. Other 1746 * architectures should map the pages starting at '*virt' and 1747 * update '*virt' with the first usable address after the mapped 1748 * region. 1749 */ 1750 vm_offset_t 1751 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1752 { 1753 vm_offset_t va, sva; 1754 vm_paddr_t superpage_offset; 1755 pd_entry_t newpde; 1756 1757 va = *virt; 1758 /* 1759 * Does the physical address range's size and alignment permit at 1760 * least one superpage mapping to be created? 1761 */ 1762 superpage_offset = start & PDRMASK; 1763 if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { 1764 /* 1765 * Increase the starting virtual address so that its alignment 1766 * does not preclude the use of superpage mappings. 
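 *
 * For example, with 4MB superpages (NBPDR == 4MB), a physical start of
 * 0x00403000 gives superpage_offset == 0x3000; a va of 0xc1001000
 * (va & PDRMASK == 0x1000 < 0x3000) is advanced to 0xc1003000 so that
 * va and start share the same offset within a superpage.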
1767 */ 1768 if ((va & PDRMASK) < superpage_offset) 1769 va = (va & ~PDRMASK) + superpage_offset; 1770 else if ((va & PDRMASK) > superpage_offset) 1771 va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; 1772 } 1773 sva = va; 1774 while (start < end) { 1775 if ((start & PDRMASK) == 0 && end - start >= NBPDR && 1776 pseflag) { 1777 KASSERT((va & PDRMASK) == 0, 1778 ("pmap_map: misaligned va %#x", va)); 1779 newpde = start | PG_PS | PG_RW | PG_V; 1780 pmap_kenter_pde(va, newpde); 1781 va += NBPDR; 1782 start += NBPDR; 1783 } else { 1784 pmap_kenter(va, start); 1785 va += PAGE_SIZE; 1786 start += PAGE_SIZE; 1787 } 1788 } 1789 pmap_invalidate_range(kernel_pmap, sva, va); 1790 *virt = va; 1791 return (sva); 1792 } 1793 1794 1795 /* 1796 * Add a list of wired pages to the kva 1797 * this routine is only used for temporary 1798 * kernel mappings that do not need to have 1799 * page modification or references recorded. 1800 * Note that old mappings are simply written 1801 * over. The page *must* be wired. 1802 * Note: SMP coherent. Uses a ranged shootdown IPI. 1803 */ 1804 void 1805 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1806 { 1807 pt_entry_t *endpte, oldpte, pa, *pte; 1808 vm_page_t m; 1809 1810 oldpte = 0; 1811 pte = vtopte(sva); 1812 endpte = pte + count; 1813 while (pte < endpte) { 1814 m = *ma++; 1815 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 1816 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { 1817 oldpte |= *pte; 1818 #if defined(PAE) || defined(PAE_TABLES) 1819 pte_store(pte, pa | pg_nx | PG_RW | PG_V); 1820 #else 1821 pte_store(pte, pa | PG_RW | PG_V); 1822 #endif 1823 } 1824 pte++; 1825 } 1826 if (__predict_false((oldpte & PG_V) != 0)) 1827 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1828 PAGE_SIZE); 1829 } 1830 1831 /* 1832 * This routine tears out page mappings from the 1833 * kernel -- it is meant only for temporary mappings. 1834 * Note: SMP coherent. Uses a ranged shootdown IPI. 1835 */ 1836 void 1837 pmap_qremove(vm_offset_t sva, int count) 1838 { 1839 vm_offset_t va; 1840 1841 va = sva; 1842 while (count-- > 0) { 1843 pmap_kremove(va); 1844 va += PAGE_SIZE; 1845 } 1846 pmap_invalidate_range(kernel_pmap, sva, va); 1847 } 1848 1849 /*************************************************** 1850 * Page table page management routines..... 1851 ***************************************************/ 1852 /* 1853 * Schedule the specified unused page table page to be freed. Specifically, 1854 * add the page to the specified list of pages that will be released to the 1855 * physical memory manager after the TLB has been updated. 1856 */ 1857 static __inline void 1858 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1859 boolean_t set_PG_ZERO) 1860 { 1861 1862 if (set_PG_ZERO) 1863 m->flags |= PG_ZERO; 1864 else 1865 m->flags &= ~PG_ZERO; 1866 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1867 } 1868 1869 /* 1870 * Inserts the specified page table page into the specified pmap's collection 1871 * of idle page table pages. Each of a pmap's page table pages is responsible 1872 * for mapping a distinct range of virtual addresses. The pmap's collection is 1873 * ordered by this virtual address range. 
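 * The collection is kept in the pmap's pm_root radix tree, keyed by the
 * page table page's pindex, i.e. va >> PDRSHIFT.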
1874 */
1875 static __inline int
1876 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1877 {
1878
1879 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1880 return (vm_radix_insert(&pmap->pm_root, mpte));
1881 }
1882
1883 /*
1884 * Removes the page table page mapping the specified virtual address from the
1885 * specified pmap's collection of idle page table pages, and returns it.
1886 * Otherwise, returns NULL if there is no page table page corresponding to the
1887 * specified virtual address.
1888 */
1889 static __inline vm_page_t
1890 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1891 {
1892
1893 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1894 return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
1895 }
1896
1897 /*
1898 * Decrements a page table page's wire count, which is used to record the
1899 * number of valid page table entries within the page. If the wire count
1900 * drops to zero, then the page table page is unmapped. Returns TRUE if the
1901 * page table page was unmapped and FALSE otherwise.
1902 */
1903 static inline boolean_t
1904 pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1905 {
1906
1907 --m->wire_count;
1908 if (m->wire_count == 0) {
1909 _pmap_unwire_ptp(pmap, m, free);
1910 return (TRUE);
1911 } else
1912 return (FALSE);
1913 }
1914
1915 static void
1916 _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1917 {
1918
1919 /*
1920 * unmap the page table page
1921 */
1922 pmap->pm_pdir[m->pindex] = 0;
1923 --pmap->pm_stats.resident_count;
1924
1925 /*
1926 * There is no need to invalidate the recursive mapping since
1927 * we never instantiate such mapping for the usermode pmaps,
1928 * and never remove page table pages from the kernel pmap.
1929 * Put the page on a list so that it is released after all TLB
1930 * shootdown is done.
1931 */
1932 MPASS(pmap != kernel_pmap);
1933 pmap_add_delayed_free_list(m, free, TRUE);
1934 }
1935
1936 /*
1937 * After removing a page table entry, this routine is used to
1938 * conditionally free the page, and manage the hold/wire counts.
1939 */
1940 static int
1941 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
1942 {
1943 pd_entry_t ptepde;
1944 vm_page_t mpte;
1945
1946 if (pmap == kernel_pmap)
1947 return (0);
1948 ptepde = *pmap_pde(pmap, va);
1949 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1950 return (pmap_unwire_ptp(pmap, mpte, free));
1951 }
1952
1953 /*
1954 * Initialize the pmap for the swapper process.
1955 */
1956 void
1957 pmap_pinit0(pmap_t pmap)
1958 {
1959
1960 PMAP_LOCK_INIT(pmap);
1961 pmap->pm_pdir = IdlePTD;
1962 #if defined(PAE) || defined(PAE_TABLES)
1963 pmap->pm_pdpt = IdlePDPT;
1964 #endif
1965 pmap->pm_root.rt_root = 0;
1966 CPU_ZERO(&pmap->pm_active);
1967 PCPU_SET(curpmap, pmap);
1968 TAILQ_INIT(&pmap->pm_pvchunk);
1969 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1970 }
1971
1972 /*
1973 * Initialize a preallocated and zeroed pmap structure,
1974 * such as one in a vmspace structure.
1975 */
1976 int
1977 pmap_pinit(pmap_t pmap)
1978 {
1979 vm_page_t m;
1980 int i;
1981
1982 /*
1983 * No need to allocate page table space yet but we do need a valid
1984 * page directory table.
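* (NBPTD bytes of KVA, i.e. NPGPTD pages: a single page directory page
* without PAE, or four with PAE, each PAE page directory page mapping 1GB.)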
1985 */ 1986 if (pmap->pm_pdir == NULL) { 1987 pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); 1988 if (pmap->pm_pdir == NULL) 1989 return (0); 1990 #if defined(PAE) || defined(PAE_TABLES) 1991 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 1992 KASSERT(((vm_offset_t)pmap->pm_pdpt & 1993 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 1994 ("pmap_pinit: pdpt misaligned")); 1995 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 1996 ("pmap_pinit: pdpt above 4g")); 1997 #endif 1998 pmap->pm_root.rt_root = 0; 1999 } 2000 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2001 ("pmap_pinit: pmap has reserved page table page(s)")); 2002 2003 /* 2004 * allocate the page directory page(s) 2005 */ 2006 for (i = 0; i < NPGPTD;) { 2007 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2008 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2009 if (m == NULL) { 2010 vm_wait(NULL); 2011 } else { 2012 pmap->pm_ptdpg[i] = m; 2013 #if defined(PAE) || defined(PAE_TABLES) 2014 pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V; 2015 #endif 2016 i++; 2017 } 2018 } 2019 2020 pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD); 2021 2022 for (i = 0; i < NPGPTD; i++) 2023 if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0) 2024 pagezero(pmap->pm_pdir + (i * NPDEPG)); 2025 2026 /* Install the trampoline mapping. */ 2027 pmap->pm_pdir[TRPTDI] = PTD[TRPTDI]; 2028 2029 CPU_ZERO(&pmap->pm_active); 2030 TAILQ_INIT(&pmap->pm_pvchunk); 2031 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2032 2033 return (1); 2034 } 2035 2036 /* 2037 * this routine is called if the page table page is not 2038 * mapped correctly. 2039 */ 2040 static vm_page_t 2041 _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) 2042 { 2043 vm_paddr_t ptepa; 2044 vm_page_t m; 2045 2046 /* 2047 * Allocate a page table page. 2048 */ 2049 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 2050 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2051 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 2052 PMAP_UNLOCK(pmap); 2053 rw_wunlock(&pvh_global_lock); 2054 vm_wait(NULL); 2055 rw_wlock(&pvh_global_lock); 2056 PMAP_LOCK(pmap); 2057 } 2058 2059 /* 2060 * Indicate the need to retry. While waiting, the page table 2061 * page may have been allocated. 2062 */ 2063 return (NULL); 2064 } 2065 if ((m->flags & PG_ZERO) == 0) 2066 pmap_zero_page(m); 2067 2068 /* 2069 * Map the pagetable page into the process address space, if 2070 * it isn't already there. 2071 */ 2072 2073 pmap->pm_stats.resident_count++; 2074 2075 ptepa = VM_PAGE_TO_PHYS(m); 2076 pmap->pm_pdir[ptepindex] = 2077 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 2078 2079 return (m); 2080 } 2081 2082 static vm_page_t 2083 pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) 2084 { 2085 u_int ptepindex; 2086 pd_entry_t ptepa; 2087 vm_page_t m; 2088 2089 /* 2090 * Calculate pagetable page index 2091 */ 2092 ptepindex = va >> PDRSHIFT; 2093 retry: 2094 /* 2095 * Get the page directory entry 2096 */ 2097 ptepa = pmap->pm_pdir[ptepindex]; 2098 2099 /* 2100 * This supports switching from a 4MB page to a 2101 * normal 4K page. 2102 */ 2103 if (ptepa & PG_PS) { 2104 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); 2105 ptepa = pmap->pm_pdir[ptepindex]; 2106 } 2107 2108 /* 2109 * If the page table page is mapped, we just increment the 2110 * hold count, and activate it. 2111 */ 2112 if (ptepa) { 2113 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 2114 m->wire_count++; 2115 } else { 2116 /* 2117 * Here if the pte page isn't mapped, or if it has 2118 * been deallocated. 
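* _pmap_allocpte() may drop the locks and sleep for memory, returning
* NULL to request a retry; the directory entry is then re-read from the
* top, since another thread may have installed the page table page in
* the meantime.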
2119 */ 2120 m = _pmap_allocpte(pmap, ptepindex, flags); 2121 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2122 goto retry; 2123 } 2124 return (m); 2125 } 2126 2127 2128 /*************************************************** 2129 * Pmap allocation/deallocation routines. 2130 ***************************************************/ 2131 2132 /* 2133 * Release any resources held by the given physical map. 2134 * Called when a pmap initialized by pmap_pinit is being released. 2135 * Should only be called if the map contains no valid mappings. 2136 */ 2137 void 2138 pmap_release(pmap_t pmap) 2139 { 2140 vm_page_t m; 2141 int i; 2142 2143 KASSERT(pmap->pm_stats.resident_count == 0, 2144 ("pmap_release: pmap resident count %ld != 0", 2145 pmap->pm_stats.resident_count)); 2146 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2147 ("pmap_release: pmap has reserved page table page(s)")); 2148 KASSERT(CPU_EMPTY(&pmap->pm_active), 2149 ("releasing active pmap %p", pmap)); 2150 2151 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 2152 2153 for (i = 0; i < NPGPTD; i++) { 2154 m = pmap->pm_ptdpg[i]; 2155 #if defined(PAE) || defined(PAE_TABLES) 2156 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 2157 ("pmap_release: got wrong ptd page")); 2158 #endif 2159 vm_page_unwire_noq(m); 2160 vm_page_free(m); 2161 } 2162 } 2163 2164 static int 2165 kvm_size(SYSCTL_HANDLER_ARGS) 2166 { 2167 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 2168 2169 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2170 } 2171 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2172 0, 0, kvm_size, "IU", "Size of KVM"); 2173 2174 static int 2175 kvm_free(SYSCTL_HANDLER_ARGS) 2176 { 2177 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2178 2179 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2180 } 2181 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2182 0, 0, kvm_free, "IU", "Amount of KVM free"); 2183 2184 /* 2185 * grow the number of kernel page table entries, if needed 2186 */ 2187 void 2188 pmap_growkernel(vm_offset_t addr) 2189 { 2190 vm_paddr_t ptppaddr; 2191 vm_page_t nkpg; 2192 pd_entry_t newpdir; 2193 2194 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2195 addr = roundup2(addr, NBPDR); 2196 if (addr - 1 >= kernel_map->max_offset) 2197 addr = kernel_map->max_offset; 2198 while (kernel_vm_end < addr) { 2199 if (pdir_pde(PTD, kernel_vm_end)) { 2200 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2201 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2202 kernel_vm_end = kernel_map->max_offset; 2203 break; 2204 } 2205 continue; 2206 } 2207 2208 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, 2209 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2210 VM_ALLOC_ZERO); 2211 if (nkpg == NULL) 2212 panic("pmap_growkernel: no memory to grow kernel"); 2213 2214 nkpt++; 2215 2216 if ((nkpg->flags & PG_ZERO) == 0) 2217 pmap_zero_page(nkpg); 2218 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2219 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2220 pdir_pde(KPTD, kernel_vm_end) = newpdir; 2221 2222 pmap_kenter_pde(kernel_vm_end, newpdir); 2223 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2224 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2225 kernel_vm_end = kernel_map->max_offset; 2226 break; 2227 } 2228 } 2229 } 2230 2231 2232 /*************************************************** 2233 * page management routines. 
2234 ***************************************************/ 2235 2236 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2237 CTASSERT(_NPCM == 11); 2238 CTASSERT(_NPCPV == 336); 2239 2240 static __inline struct pv_chunk * 2241 pv_to_chunk(pv_entry_t pv) 2242 { 2243 2244 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2245 } 2246 2247 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2248 2249 #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2250 #define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2251 2252 static const uint32_t pc_freemask[_NPCM] = { 2253 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2254 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2255 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2256 PC_FREE0_9, PC_FREE10 2257 }; 2258 2259 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2260 "Current number of pv entries"); 2261 2262 #ifdef PV_STATS 2263 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2264 2265 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2266 "Current number of pv entry chunks"); 2267 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2268 "Current number of pv entry chunks allocated"); 2269 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2270 "Current number of pv entry chunks frees"); 2271 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2272 "Number of times tried to get a chunk page but failed."); 2273 2274 static long pv_entry_frees, pv_entry_allocs; 2275 static int pv_entry_spare; 2276 2277 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2278 "Current number of pv entry frees"); 2279 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2280 "Current number of pv entry allocs"); 2281 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2282 "Current number of spare pv entries"); 2283 #endif 2284 2285 /* 2286 * We are in a serious low memory condition. Resort to 2287 * drastic measures to free some pages so we can allocate 2288 * another pv entry chunk. 2289 */ 2290 static vm_page_t 2291 pmap_pv_reclaim(pmap_t locked_pmap) 2292 { 2293 struct pch newtail; 2294 struct pv_chunk *pc; 2295 struct md_page *pvh; 2296 pd_entry_t *pde; 2297 pmap_t pmap; 2298 pt_entry_t *pte, tpte; 2299 pv_entry_t pv; 2300 vm_offset_t va; 2301 vm_page_t m, m_pc; 2302 struct spglist free; 2303 uint32_t inuse; 2304 int bit, field, freed; 2305 2306 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2307 pmap = NULL; 2308 m_pc = NULL; 2309 SLIST_INIT(&free); 2310 TAILQ_INIT(&newtail); 2311 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2312 SLIST_EMPTY(&free))) { 2313 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2314 if (pmap != pc->pc_pmap) { 2315 if (pmap != NULL) { 2316 pmap_invalidate_all(pmap); 2317 if (pmap != locked_pmap) 2318 PMAP_UNLOCK(pmap); 2319 } 2320 pmap = pc->pc_pmap; 2321 /* Avoid deadlock and lock recursion. */ 2322 if (pmap > locked_pmap) 2323 PMAP_LOCK(pmap); 2324 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2325 pmap = NULL; 2326 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2327 continue; 2328 } 2329 } 2330 2331 /* 2332 * Destroy every non-wired, 4 KB page mapping in the chunk. 
2333 */
2334 freed = 0;
2335 for (field = 0; field < _NPCM; field++) {
2336 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2337 inuse != 0; inuse &= ~(1UL << bit)) {
2338 bit = bsfl(inuse);
2339 pv = &pc->pc_pventry[field * 32 + bit];
2340 va = pv->pv_va;
2341 pde = pmap_pde(pmap, va);
2342 if ((*pde & PG_PS) != 0)
2343 continue;
2344 pte = pmap_pte(pmap, va);
2345 tpte = *pte;
2346 if ((tpte & PG_W) == 0)
2347 tpte = pte_load_clear(pte);
2348 pmap_pte_release(pte);
2349 if ((tpte & PG_W) != 0)
2350 continue;
2351 KASSERT(tpte != 0,
2352 ("pmap_pv_reclaim: pmap %p va %x zero pte",
2353 pmap, va));
2354 if ((tpte & PG_G) != 0)
2355 pmap_invalidate_page(pmap, va);
2356 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2357 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2358 vm_page_dirty(m);
2359 if ((tpte & PG_A) != 0)
2360 vm_page_aflag_set(m, PGA_REFERENCED);
2361 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2362 if (TAILQ_EMPTY(&m->md.pv_list) &&
2363 (m->flags & PG_FICTITIOUS) == 0) {
2364 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2365 if (TAILQ_EMPTY(&pvh->pv_list)) {
2366 vm_page_aflag_clear(m,
2367 PGA_WRITEABLE);
2368 }
2369 }
2370 pc->pc_map[field] |= 1UL << bit;
2371 pmap_unuse_pt(pmap, va, &free);
2372 freed++;
2373 }
2374 }
2375 if (freed == 0) {
2376 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2377 continue;
2378 }
2379 /* Every freed mapping is for a 4 KB page. */
2380 pmap->pm_stats.resident_count -= freed;
2381 PV_STAT(pv_entry_frees += freed);
2382 PV_STAT(pv_entry_spare += freed);
2383 pv_entry_count -= freed;
2384 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2385 for (field = 0; field < _NPCM; field++)
2386 if (pc->pc_map[field] != pc_freemask[field]) {
2387 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2388 pc_list);
2389 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2390
2391 /*
2392 * One freed pv entry in locked_pmap is
2393 * sufficient.
2394 */
2395 if (pmap == locked_pmap)
2396 goto out;
2397 break;
2398 }
2399 if (field == _NPCM) {
2400 PV_STAT(pv_entry_spare -= _NPCPV);
2401 PV_STAT(pc_chunk_count--);
2402 PV_STAT(pc_chunk_frees++);
2403 /* Entire chunk is free; return it. */
2404 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2405 pmap_qremove((vm_offset_t)pc, 1);
2406 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2407 break;
2408 }
2409 }
2410 out:
2411 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2412 if (pmap != NULL) {
2413 pmap_invalidate_all(pmap);
2414 if (pmap != locked_pmap)
2415 PMAP_UNLOCK(pmap);
2416 }
2417 if (m_pc == NULL && pv_vafree != 0 && !SLIST_EMPTY(&free)) {
2418 m_pc = SLIST_FIRST(&free);
2419 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2420 /* Recycle a freed page table page. */
2421 m_pc->wire_count = 1;
2422 }
2423 vm_page_free_pages_toq(&free, true);
2424 return (m_pc);
2425 }
2426
2427 /*
2428 * free the pv_entry back to the free list
2429 */
2430 static void
2431 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2432 {
2433 struct pv_chunk *pc;
2434 int idx, field, bit;
2435
2436 rw_assert(&pvh_global_lock, RA_WLOCKED);
2437 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2438 PV_STAT(pv_entry_frees++);
2439 PV_STAT(pv_entry_spare++);
2440 pv_entry_count--;
2441 pc = pv_to_chunk(pv);
2442 idx = pv - &pc->pc_pventry[0];
2443 field = idx / 32;
2444 bit = idx % 32;
2445 pc->pc_map[field] |= 1ul << bit;
2446 for (idx = 0; idx < _NPCM; idx++)
2447 if (pc->pc_map[idx] != pc_freemask[idx]) {
2448 /*
2449 * 98% of the time, pc is already at the head of the
2450 * list. If it isn't already, move it to the head.
2451 */ 2452 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2453 pc)) { 2454 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2455 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2456 pc_list); 2457 } 2458 return; 2459 } 2460 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2461 free_pv_chunk(pc); 2462 } 2463 2464 static void 2465 free_pv_chunk(struct pv_chunk *pc) 2466 { 2467 vm_page_t m; 2468 2469 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2470 PV_STAT(pv_entry_spare -= _NPCPV); 2471 PV_STAT(pc_chunk_count--); 2472 PV_STAT(pc_chunk_frees++); 2473 /* entire chunk is free, return it */ 2474 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2475 pmap_qremove((vm_offset_t)pc, 1); 2476 vm_page_unwire(m, PQ_NONE); 2477 vm_page_free(m); 2478 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2479 } 2480 2481 /* 2482 * get a new pv_entry, allocating a block from the system 2483 * when needed. 2484 */ 2485 static pv_entry_t 2486 get_pv_entry(pmap_t pmap, boolean_t try) 2487 { 2488 static const struct timeval printinterval = { 60, 0 }; 2489 static struct timeval lastprint; 2490 int bit, field; 2491 pv_entry_t pv; 2492 struct pv_chunk *pc; 2493 vm_page_t m; 2494 2495 rw_assert(&pvh_global_lock, RA_WLOCKED); 2496 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2497 PV_STAT(pv_entry_allocs++); 2498 pv_entry_count++; 2499 if (pv_entry_count > pv_entry_high_water) 2500 if (ratecheck(&lastprint, &printinterval)) 2501 printf("Approaching the limit on PV entries, consider " 2502 "increasing either the vm.pmap.shpgperproc or the " 2503 "vm.pmap.pv_entry_max tunable.\n"); 2504 retry: 2505 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2506 if (pc != NULL) { 2507 for (field = 0; field < _NPCM; field++) { 2508 if (pc->pc_map[field]) { 2509 bit = bsfl(pc->pc_map[field]); 2510 break; 2511 } 2512 } 2513 if (field < _NPCM) { 2514 pv = &pc->pc_pventry[field * 32 + bit]; 2515 pc->pc_map[field] &= ~(1ul << bit); 2516 /* If this was the last item, move it to tail */ 2517 for (field = 0; field < _NPCM; field++) 2518 if (pc->pc_map[field] != 0) { 2519 PV_STAT(pv_entry_spare--); 2520 return (pv); /* not full, return */ 2521 } 2522 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2523 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2524 PV_STAT(pv_entry_spare--); 2525 return (pv); 2526 } 2527 } 2528 /* 2529 * Access to the ptelist "pv_vafree" is synchronized by the pvh 2530 * global lock. If "pv_vafree" is currently non-empty, it will 2531 * remain non-empty until pmap_ptelist_alloc() completes. 
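* Each chunk consumes one page of KVA from that list plus one physical
* page, and holds _NPCPV == 336 pv entries tracked by _NPCM == 11
* 32-bit maps (ten full words plus a 16-bit tail), per the CTASSERTs above.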
2532 */ 2533 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2534 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2535 if (try) { 2536 pv_entry_count--; 2537 PV_STAT(pc_chunk_tryfail++); 2538 return (NULL); 2539 } 2540 m = pmap_pv_reclaim(pmap); 2541 if (m == NULL) 2542 goto retry; 2543 } 2544 PV_STAT(pc_chunk_count++); 2545 PV_STAT(pc_chunk_allocs++); 2546 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2547 pmap_qenter((vm_offset_t)pc, &m, 1); 2548 pc->pc_pmap = pmap; 2549 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2550 for (field = 1; field < _NPCM; field++) 2551 pc->pc_map[field] = pc_freemask[field]; 2552 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2553 pv = &pc->pc_pventry[0]; 2554 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2555 PV_STAT(pv_entry_spare += _NPCPV - 1); 2556 return (pv); 2557 } 2558 2559 static __inline pv_entry_t 2560 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2561 { 2562 pv_entry_t pv; 2563 2564 rw_assert(&pvh_global_lock, RA_WLOCKED); 2565 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2566 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2567 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2568 break; 2569 } 2570 } 2571 return (pv); 2572 } 2573 2574 static void 2575 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2576 { 2577 struct md_page *pvh; 2578 pv_entry_t pv; 2579 vm_offset_t va_last; 2580 vm_page_t m; 2581 2582 rw_assert(&pvh_global_lock, RA_WLOCKED); 2583 KASSERT((pa & PDRMASK) == 0, 2584 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2585 2586 /* 2587 * Transfer the 4mpage's pv entry for this mapping to the first 2588 * page's pv list. 2589 */ 2590 pvh = pa_to_pvh(pa); 2591 va = trunc_4mpage(va); 2592 pv = pmap_pvh_remove(pvh, pmap, va); 2593 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2594 m = PHYS_TO_VM_PAGE(pa); 2595 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2596 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2597 va_last = va + NBPDR - PAGE_SIZE; 2598 do { 2599 m++; 2600 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2601 ("pmap_pv_demote_pde: page %p is not managed", m)); 2602 va += PAGE_SIZE; 2603 pmap_insert_entry(pmap, va, m); 2604 } while (va < va_last); 2605 } 2606 2607 #if VM_NRESERVLEVEL > 0 2608 static void 2609 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2610 { 2611 struct md_page *pvh; 2612 pv_entry_t pv; 2613 vm_offset_t va_last; 2614 vm_page_t m; 2615 2616 rw_assert(&pvh_global_lock, RA_WLOCKED); 2617 KASSERT((pa & PDRMASK) == 0, 2618 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2619 2620 /* 2621 * Transfer the first page's pv entry for this mapping to the 2622 * 4mpage's pv list. Aside from avoiding the cost of a call 2623 * to get_pv_entry(), a transfer avoids the possibility that 2624 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2625 * removes one of the mappings that is being promoted. 2626 */ 2627 m = PHYS_TO_VM_PAGE(pa); 2628 va = trunc_4mpage(va); 2629 pv = pmap_pvh_remove(&m->md, pmap, va); 2630 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2631 pvh = pa_to_pvh(pa); 2632 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2633 /* Free the remaining NPTEPG - 1 pv entries. 
*/ 2634 va_last = va + NBPDR - PAGE_SIZE; 2635 do { 2636 m++; 2637 va += PAGE_SIZE; 2638 pmap_pvh_free(&m->md, pmap, va); 2639 } while (va < va_last); 2640 } 2641 #endif /* VM_NRESERVLEVEL > 0 */ 2642 2643 static void 2644 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2645 { 2646 pv_entry_t pv; 2647 2648 pv = pmap_pvh_remove(pvh, pmap, va); 2649 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2650 free_pv_entry(pmap, pv); 2651 } 2652 2653 static void 2654 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2655 { 2656 struct md_page *pvh; 2657 2658 rw_assert(&pvh_global_lock, RA_WLOCKED); 2659 pmap_pvh_free(&m->md, pmap, va); 2660 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 2661 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2662 if (TAILQ_EMPTY(&pvh->pv_list)) 2663 vm_page_aflag_clear(m, PGA_WRITEABLE); 2664 } 2665 } 2666 2667 /* 2668 * Create a pv entry for page at pa for 2669 * (pmap, va). 2670 */ 2671 static void 2672 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2673 { 2674 pv_entry_t pv; 2675 2676 rw_assert(&pvh_global_lock, RA_WLOCKED); 2677 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2678 pv = get_pv_entry(pmap, FALSE); 2679 pv->pv_va = va; 2680 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2681 } 2682 2683 /* 2684 * Conditionally create a pv entry. 2685 */ 2686 static boolean_t 2687 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2688 { 2689 pv_entry_t pv; 2690 2691 rw_assert(&pvh_global_lock, RA_WLOCKED); 2692 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2693 if (pv_entry_count < pv_entry_high_water && 2694 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2695 pv->pv_va = va; 2696 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2697 return (TRUE); 2698 } else 2699 return (FALSE); 2700 } 2701 2702 /* 2703 * Create the pv entries for each of the pages within a superpage. 2704 */ 2705 static boolean_t 2706 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2707 { 2708 struct md_page *pvh; 2709 pv_entry_t pv; 2710 2711 rw_assert(&pvh_global_lock, RA_WLOCKED); 2712 if (pv_entry_count < pv_entry_high_water && 2713 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2714 pv->pv_va = va; 2715 pvh = pa_to_pvh(pa); 2716 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2717 return (TRUE); 2718 } else 2719 return (FALSE); 2720 } 2721 2722 /* 2723 * Fills a page table page with mappings to consecutive physical pages. 2724 */ 2725 static void 2726 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2727 { 2728 pt_entry_t *pte; 2729 2730 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2731 *pte = newpte; 2732 newpte += PAGE_SIZE; 2733 } 2734 } 2735 2736 /* 2737 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2738 * 2- or 4MB page mapping is invalidated. 
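* A successful demotion replaces the single superpage PDE with a PDE
* referencing a page table page that pmap_fill_ptp() fills with NPTEPG
* 4KB PTEs (512 with PAE, 1024 without) carrying the same access,
* modification, and cache attributes as the old mapping.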
2739 */ 2740 static boolean_t 2741 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2742 { 2743 pd_entry_t newpde, oldpde; 2744 pt_entry_t *firstpte, newpte; 2745 vm_paddr_t mptepa; 2746 vm_page_t mpte; 2747 struct spglist free; 2748 vm_offset_t sva; 2749 2750 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2751 oldpde = *pde; 2752 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2753 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2754 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2755 NULL) { 2756 KASSERT((oldpde & PG_W) == 0, 2757 ("pmap_demote_pde: page table page for a wired mapping" 2758 " is missing")); 2759 2760 /* 2761 * Invalidate the 2- or 4MB page mapping and return 2762 * "failure" if the mapping was never accessed or the 2763 * allocation of the new page table page fails. 2764 */ 2765 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2766 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | 2767 VM_ALLOC_WIRED)) == NULL) { 2768 SLIST_INIT(&free); 2769 sva = trunc_4mpage(va); 2770 pmap_remove_pde(pmap, pde, sva, &free); 2771 if ((oldpde & PG_G) == 0) 2772 pmap_invalidate_pde_page(pmap, sva, oldpde); 2773 vm_page_free_pages_toq(&free, true); 2774 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2775 " in pmap %p", va, pmap); 2776 return (FALSE); 2777 } 2778 if (pmap != kernel_pmap) 2779 pmap->pm_stats.resident_count++; 2780 } 2781 mptepa = VM_PAGE_TO_PHYS(mpte); 2782 2783 /* 2784 * If the page mapping is in the kernel's address space, then the 2785 * KPTmap can provide access to the page table page. Otherwise, 2786 * temporarily map the page table page (mpte) into the kernel's 2787 * address space at either PADDR1 or PADDR2. 2788 */ 2789 if (pmap == kernel_pmap) 2790 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2791 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 2792 if ((*PMAP1 & PG_FRAME) != mptepa) { 2793 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2794 #ifdef SMP 2795 PMAP1cpu = PCPU_GET(cpuid); 2796 #endif 2797 invlcaddr(PADDR1); 2798 PMAP1changed++; 2799 } else 2800 #ifdef SMP 2801 if (PMAP1cpu != PCPU_GET(cpuid)) { 2802 PMAP1cpu = PCPU_GET(cpuid); 2803 invlcaddr(PADDR1); 2804 PMAP1changedcpu++; 2805 } else 2806 #endif 2807 PMAP1unchanged++; 2808 firstpte = PADDR1; 2809 } else { 2810 mtx_lock(&PMAP2mutex); 2811 if ((*PMAP2 & PG_FRAME) != mptepa) { 2812 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2813 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 2814 } 2815 firstpte = PADDR2; 2816 } 2817 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2818 KASSERT((oldpde & PG_A) != 0, 2819 ("pmap_demote_pde: oldpde is missing PG_A")); 2820 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2821 ("pmap_demote_pde: oldpde is missing PG_M")); 2822 newpte = oldpde & ~PG_PS; 2823 if ((newpte & PG_PDE_PAT) != 0) 2824 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2825 2826 /* 2827 * If the page table page is new, initialize it. 2828 */ 2829 if (mpte->wire_count == 1) { 2830 mpte->wire_count = NPTEPG; 2831 pmap_fill_ptp(firstpte, newpte); 2832 } 2833 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2834 ("pmap_demote_pde: firstpte and newpte map different physical" 2835 " addresses")); 2836 2837 /* 2838 * If the mapping has changed attributes, update the page table 2839 * entries. 2840 */ 2841 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2842 pmap_fill_ptp(firstpte, newpte); 2843 2844 /* 2845 * Demote the mapping. This pmap is locked. The old PDE has 2846 * PG_A set. 
If the old PDE has PG_RW set, it also has PG_M 2847 * set. Thus, there is no danger of a race with another 2848 * processor changing the setting of PG_A and/or PG_M between 2849 * the read above and the store below. 2850 */ 2851 if (workaround_erratum383) 2852 pmap_update_pde(pmap, va, pde, newpde); 2853 else if (pmap == kernel_pmap) 2854 pmap_kenter_pde(va, newpde); 2855 else 2856 pde_store(pde, newpde); 2857 if (firstpte == PADDR2) 2858 mtx_unlock(&PMAP2mutex); 2859 2860 /* 2861 * Invalidate the recursive mapping of the page table page. 2862 */ 2863 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2864 2865 /* 2866 * Demote the pv entry. This depends on the earlier demotion 2867 * of the mapping. Specifically, the (re)creation of a per- 2868 * page pv entry might trigger the execution of pmap_collect(), 2869 * which might reclaim a newly (re)created per-page pv entry 2870 * and destroy the associated mapping. In order to destroy 2871 * the mapping, the PDE must have already changed from mapping 2872 * the 2mpage to referencing the page table page. 2873 */ 2874 if ((oldpde & PG_MANAGED) != 0) 2875 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2876 2877 pmap_pde_demotions++; 2878 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" 2879 " in pmap %p", va, pmap); 2880 return (TRUE); 2881 } 2882 2883 /* 2884 * Removes a 2- or 4MB page mapping from the kernel pmap. 2885 */ 2886 static void 2887 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2888 { 2889 pd_entry_t newpde; 2890 vm_paddr_t mptepa; 2891 vm_page_t mpte; 2892 2893 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2894 mpte = pmap_remove_pt_page(pmap, va); 2895 if (mpte == NULL) 2896 panic("pmap_remove_kernel_pde: Missing pt page."); 2897 2898 mptepa = VM_PAGE_TO_PHYS(mpte); 2899 newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; 2900 2901 /* 2902 * Initialize the page table page. 2903 */ 2904 pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); 2905 2906 /* 2907 * Remove the mapping. 2908 */ 2909 if (workaround_erratum383) 2910 pmap_update_pde(pmap, va, pde, newpde); 2911 else 2912 pmap_kenter_pde(va, newpde); 2913 2914 /* 2915 * Invalidate the recursive mapping of the page table page. 2916 */ 2917 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2918 } 2919 2920 /* 2921 * pmap_remove_pde: do the things to unmap a superpage in a process 2922 */ 2923 static void 2924 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2925 struct spglist *free) 2926 { 2927 struct md_page *pvh; 2928 pd_entry_t oldpde; 2929 vm_offset_t eva, va; 2930 vm_page_t m, mpte; 2931 2932 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2933 KASSERT((sva & PDRMASK) == 0, 2934 ("pmap_remove_pde: sva is not 4mpage aligned")); 2935 oldpde = pte_load_clear(pdq); 2936 if (oldpde & PG_W) 2937 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2938 2939 /* 2940 * Machines that don't support invlpg, also don't support 2941 * PG_G. 
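* A PG_G (global) mapping is not flushed by a CR3 reload, so its TLB
* entries are invalidated here immediately; non-global mappings are
* left to the deferred invalidation performed by our callers.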
2942 */ 2943 if ((oldpde & PG_G) != 0) 2944 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 2945 2946 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2947 if (oldpde & PG_MANAGED) { 2948 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2949 pmap_pvh_free(pvh, pmap, sva); 2950 eva = sva + NBPDR; 2951 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2952 va < eva; va += PAGE_SIZE, m++) { 2953 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2954 vm_page_dirty(m); 2955 if (oldpde & PG_A) 2956 vm_page_aflag_set(m, PGA_REFERENCED); 2957 if (TAILQ_EMPTY(&m->md.pv_list) && 2958 TAILQ_EMPTY(&pvh->pv_list)) 2959 vm_page_aflag_clear(m, PGA_WRITEABLE); 2960 } 2961 } 2962 if (pmap == kernel_pmap) { 2963 pmap_remove_kernel_pde(pmap, pdq, sva); 2964 } else { 2965 mpte = pmap_remove_pt_page(pmap, sva); 2966 if (mpte != NULL) { 2967 pmap->pm_stats.resident_count--; 2968 KASSERT(mpte->wire_count == NPTEPG, 2969 ("pmap_remove_pde: pte page wire count error")); 2970 mpte->wire_count = 0; 2971 pmap_add_delayed_free_list(mpte, free, FALSE); 2972 } 2973 } 2974 } 2975 2976 /* 2977 * pmap_remove_pte: do the things to unmap a page in a process 2978 */ 2979 static int 2980 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 2981 struct spglist *free) 2982 { 2983 pt_entry_t oldpte; 2984 vm_page_t m; 2985 2986 rw_assert(&pvh_global_lock, RA_WLOCKED); 2987 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2988 oldpte = pte_load_clear(ptq); 2989 KASSERT(oldpte != 0, 2990 ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); 2991 if (oldpte & PG_W) 2992 pmap->pm_stats.wired_count -= 1; 2993 /* 2994 * Machines that don't support invlpg, also don't support 2995 * PG_G. 2996 */ 2997 if (oldpte & PG_G) 2998 pmap_invalidate_page(kernel_pmap, va); 2999 pmap->pm_stats.resident_count -= 1; 3000 if (oldpte & PG_MANAGED) { 3001 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 3002 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3003 vm_page_dirty(m); 3004 if (oldpte & PG_A) 3005 vm_page_aflag_set(m, PGA_REFERENCED); 3006 pmap_remove_entry(pmap, m, va); 3007 } 3008 return (pmap_unuse_pt(pmap, va, free)); 3009 } 3010 3011 /* 3012 * Remove a single page from a process address space 3013 */ 3014 static void 3015 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 3016 { 3017 pt_entry_t *pte; 3018 3019 rw_assert(&pvh_global_lock, RA_WLOCKED); 3020 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 3021 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3022 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 3023 return; 3024 pmap_remove_pte(pmap, pte, va, free); 3025 pmap_invalidate_page(pmap, va); 3026 } 3027 3028 /* 3029 * Remove the given range of addresses from the specified map. 3030 * 3031 * It is assumed that the start and end are properly 3032 * rounded to the page size. 3033 */ 3034 void 3035 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3036 { 3037 vm_offset_t pdnxt; 3038 pd_entry_t ptpaddr; 3039 pt_entry_t *pte; 3040 struct spglist free; 3041 int anyvalid; 3042 3043 /* 3044 * Perform an unsynchronized read. This is, however, safe. 3045 */ 3046 if (pmap->pm_stats.resident_count == 0) 3047 return; 3048 3049 anyvalid = 0; 3050 SLIST_INIT(&free); 3051 3052 rw_wlock(&pvh_global_lock); 3053 sched_pin(); 3054 PMAP_LOCK(pmap); 3055 3056 /* 3057 * special handling of removing one page. a very 3058 * common operation and easy to short circuit some 3059 * code. 
3060 */ 3061 if ((sva + PAGE_SIZE == eva) && 3062 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 3063 pmap_remove_page(pmap, sva, &free); 3064 goto out; 3065 } 3066 3067 for (; sva < eva; sva = pdnxt) { 3068 u_int pdirindex; 3069 3070 /* 3071 * Calculate index for next page table. 3072 */ 3073 pdnxt = (sva + NBPDR) & ~PDRMASK; 3074 if (pdnxt < sva) 3075 pdnxt = eva; 3076 if (pmap->pm_stats.resident_count == 0) 3077 break; 3078 3079 pdirindex = sva >> PDRSHIFT; 3080 ptpaddr = pmap->pm_pdir[pdirindex]; 3081 3082 /* 3083 * Weed out invalid mappings. Note: we assume that the page 3084 * directory table is always allocated, and in kernel virtual. 3085 */ 3086 if (ptpaddr == 0) 3087 continue; 3088 3089 /* 3090 * Check for large page. 3091 */ 3092 if ((ptpaddr & PG_PS) != 0) { 3093 /* 3094 * Are we removing the entire large page? If not, 3095 * demote the mapping and fall through. 3096 */ 3097 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3098 /* 3099 * The TLB entry for a PG_G mapping is 3100 * invalidated by pmap_remove_pde(). 3101 */ 3102 if ((ptpaddr & PG_G) == 0) 3103 anyvalid = 1; 3104 pmap_remove_pde(pmap, 3105 &pmap->pm_pdir[pdirindex], sva, &free); 3106 continue; 3107 } else if (!pmap_demote_pde(pmap, 3108 &pmap->pm_pdir[pdirindex], sva)) { 3109 /* The large page mapping was destroyed. */ 3110 continue; 3111 } 3112 } 3113 3114 /* 3115 * Limit our scan to either the end of the va represented 3116 * by the current page table page, or to the end of the 3117 * range being removed. 3118 */ 3119 if (pdnxt > eva) 3120 pdnxt = eva; 3121 3122 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3123 sva += PAGE_SIZE) { 3124 if (*pte == 0) 3125 continue; 3126 3127 /* 3128 * The TLB entry for a PG_G mapping is invalidated 3129 * by pmap_remove_pte(). 3130 */ 3131 if ((*pte & PG_G) == 0) 3132 anyvalid = 1; 3133 if (pmap_remove_pte(pmap, pte, sva, &free)) 3134 break; 3135 } 3136 } 3137 out: 3138 sched_unpin(); 3139 if (anyvalid) 3140 pmap_invalidate_all(pmap); 3141 rw_wunlock(&pvh_global_lock); 3142 PMAP_UNLOCK(pmap); 3143 vm_page_free_pages_toq(&free, true); 3144 } 3145 3146 /* 3147 * Routine: pmap_remove_all 3148 * Function: 3149 * Removes this physical page from 3150 * all physical maps in which it resides. 3151 * Reflects back modify bits to the pager. 3152 * 3153 * Notes: 3154 * Original versions of this routine were very 3155 * inefficient because they iteratively called 3156 * pmap_remove (slow...) 
3157 */ 3158 3159 void 3160 pmap_remove_all(vm_page_t m) 3161 { 3162 struct md_page *pvh; 3163 pv_entry_t pv; 3164 pmap_t pmap; 3165 pt_entry_t *pte, tpte; 3166 pd_entry_t *pde; 3167 vm_offset_t va; 3168 struct spglist free; 3169 3170 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3171 ("pmap_remove_all: page %p is not managed", m)); 3172 SLIST_INIT(&free); 3173 rw_wlock(&pvh_global_lock); 3174 sched_pin(); 3175 if ((m->flags & PG_FICTITIOUS) != 0) 3176 goto small_mappings; 3177 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3178 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3179 va = pv->pv_va; 3180 pmap = PV_PMAP(pv); 3181 PMAP_LOCK(pmap); 3182 pde = pmap_pde(pmap, va); 3183 (void)pmap_demote_pde(pmap, pde, va); 3184 PMAP_UNLOCK(pmap); 3185 } 3186 small_mappings: 3187 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3188 pmap = PV_PMAP(pv); 3189 PMAP_LOCK(pmap); 3190 pmap->pm_stats.resident_count--; 3191 pde = pmap_pde(pmap, pv->pv_va); 3192 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3193 " a 4mpage in page %p's pv list", m)); 3194 pte = pmap_pte_quick(pmap, pv->pv_va); 3195 tpte = pte_load_clear(pte); 3196 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", 3197 pmap, pv->pv_va)); 3198 if (tpte & PG_W) 3199 pmap->pm_stats.wired_count--; 3200 if (tpte & PG_A) 3201 vm_page_aflag_set(m, PGA_REFERENCED); 3202 3203 /* 3204 * Update the vm_page_t clean and reference bits. 3205 */ 3206 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3207 vm_page_dirty(m); 3208 pmap_unuse_pt(pmap, pv->pv_va, &free); 3209 pmap_invalidate_page(pmap, pv->pv_va); 3210 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3211 free_pv_entry(pmap, pv); 3212 PMAP_UNLOCK(pmap); 3213 } 3214 vm_page_aflag_clear(m, PGA_WRITEABLE); 3215 sched_unpin(); 3216 rw_wunlock(&pvh_global_lock); 3217 vm_page_free_pages_toq(&free, true); 3218 } 3219 3220 /* 3221 * pmap_protect_pde: do the things to protect a 4mpage in a process 3222 */ 3223 static boolean_t 3224 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3225 { 3226 pd_entry_t newpde, oldpde; 3227 vm_offset_t eva, va; 3228 vm_page_t m; 3229 boolean_t anychanged; 3230 3231 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3232 KASSERT((sva & PDRMASK) == 0, 3233 ("pmap_protect_pde: sva is not 4mpage aligned")); 3234 anychanged = FALSE; 3235 retry: 3236 oldpde = newpde = *pde; 3237 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 3238 (PG_MANAGED | PG_M | PG_RW)) { 3239 eva = sva + NBPDR; 3240 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3241 va < eva; va += PAGE_SIZE, m++) 3242 vm_page_dirty(m); 3243 } 3244 if ((prot & VM_PROT_WRITE) == 0) 3245 newpde &= ~(PG_RW | PG_M); 3246 #if defined(PAE) || defined(PAE_TABLES) 3247 if ((prot & VM_PROT_EXECUTE) == 0) 3248 newpde |= pg_nx; 3249 #endif 3250 if (newpde != oldpde) { 3251 /* 3252 * As an optimization to future operations on this PDE, clear 3253 * PG_PROMOTED. The impending invalidation will remove any 3254 * lingering 4KB page mappings from the TLB. 3255 */ 3256 if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) 3257 goto retry; 3258 if ((oldpde & PG_G) != 0) 3259 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 3260 else 3261 anychanged = TRUE; 3262 } 3263 return (anychanged); 3264 } 3265 3266 /* 3267 * Set the physical protection on the 3268 * specified range of this map as requested. 
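* Write access is revoked by clearing PG_RW (and PG_M); with PAE,
* execute access is revoked by setting the pg_nx bit in the entry.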
3269 */ 3270 void 3271 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3272 { 3273 vm_offset_t pdnxt; 3274 pd_entry_t ptpaddr; 3275 pt_entry_t *pte; 3276 boolean_t anychanged, pv_lists_locked; 3277 3278 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3279 if (prot == VM_PROT_NONE) { 3280 pmap_remove(pmap, sva, eva); 3281 return; 3282 } 3283 3284 #if defined(PAE) || defined(PAE_TABLES) 3285 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3286 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3287 return; 3288 #else 3289 if (prot & VM_PROT_WRITE) 3290 return; 3291 #endif 3292 3293 if (pmap_is_current(pmap)) 3294 pv_lists_locked = FALSE; 3295 else { 3296 pv_lists_locked = TRUE; 3297 resume: 3298 rw_wlock(&pvh_global_lock); 3299 sched_pin(); 3300 } 3301 anychanged = FALSE; 3302 3303 PMAP_LOCK(pmap); 3304 for (; sva < eva; sva = pdnxt) { 3305 pt_entry_t obits, pbits; 3306 u_int pdirindex; 3307 3308 pdnxt = (sva + NBPDR) & ~PDRMASK; 3309 if (pdnxt < sva) 3310 pdnxt = eva; 3311 3312 pdirindex = sva >> PDRSHIFT; 3313 ptpaddr = pmap->pm_pdir[pdirindex]; 3314 3315 /* 3316 * Weed out invalid mappings. Note: we assume that the page 3317 * directory table is always allocated, and in kernel virtual. 3318 */ 3319 if (ptpaddr == 0) 3320 continue; 3321 3322 /* 3323 * Check for large page. 3324 */ 3325 if ((ptpaddr & PG_PS) != 0) { 3326 /* 3327 * Are we protecting the entire large page? If not, 3328 * demote the mapping and fall through. 3329 */ 3330 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3331 /* 3332 * The TLB entry for a PG_G mapping is 3333 * invalidated by pmap_protect_pde(). 3334 */ 3335 if (pmap_protect_pde(pmap, 3336 &pmap->pm_pdir[pdirindex], sva, prot)) 3337 anychanged = TRUE; 3338 continue; 3339 } else { 3340 if (!pv_lists_locked) { 3341 pv_lists_locked = TRUE; 3342 if (!rw_try_wlock(&pvh_global_lock)) { 3343 if (anychanged) 3344 pmap_invalidate_all( 3345 pmap); 3346 PMAP_UNLOCK(pmap); 3347 goto resume; 3348 } 3349 sched_pin(); 3350 } 3351 if (!pmap_demote_pde(pmap, 3352 &pmap->pm_pdir[pdirindex], sva)) { 3353 /* 3354 * The large page mapping was 3355 * destroyed. 3356 */ 3357 continue; 3358 } 3359 } 3360 } 3361 3362 if (pdnxt > eva) 3363 pdnxt = eva; 3364 3365 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3366 sva += PAGE_SIZE) { 3367 vm_page_t m; 3368 3369 retry: 3370 /* 3371 * Regardless of whether a pte is 32 or 64 bits in 3372 * size, PG_RW, PG_A, and PG_M are among the least 3373 * significant 32 bits. 
3374 */ 3375 obits = pbits = *pte; 3376 if ((pbits & PG_V) == 0) 3377 continue; 3378 3379 if ((prot & VM_PROT_WRITE) == 0) { 3380 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3381 (PG_MANAGED | PG_M | PG_RW)) { 3382 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3383 vm_page_dirty(m); 3384 } 3385 pbits &= ~(PG_RW | PG_M); 3386 } 3387 #if defined(PAE) || defined(PAE_TABLES) 3388 if ((prot & VM_PROT_EXECUTE) == 0) 3389 pbits |= pg_nx; 3390 #endif 3391 3392 if (pbits != obits) { 3393 #if defined(PAE) || defined(PAE_TABLES) 3394 if (!atomic_cmpset_64(pte, obits, pbits)) 3395 goto retry; 3396 #else 3397 if (!atomic_cmpset_int((u_int *)pte, obits, 3398 pbits)) 3399 goto retry; 3400 #endif 3401 if (obits & PG_G) 3402 pmap_invalidate_page(pmap, sva); 3403 else 3404 anychanged = TRUE; 3405 } 3406 } 3407 } 3408 if (anychanged) 3409 pmap_invalidate_all(pmap); 3410 if (pv_lists_locked) { 3411 sched_unpin(); 3412 rw_wunlock(&pvh_global_lock); 3413 } 3414 PMAP_UNLOCK(pmap); 3415 } 3416 3417 #if VM_NRESERVLEVEL > 0 3418 /* 3419 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3420 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 3421 * For promotion to occur, two conditions must be met: (1) the 4KB page 3422 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3423 * mappings must have identical characteristics. 3424 * 3425 * Managed (PG_MANAGED) mappings within the kernel address space are not 3426 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3427 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3428 * pmap. 3429 */ 3430 static void 3431 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3432 { 3433 pd_entry_t newpde; 3434 pt_entry_t *firstpte, oldpte, pa, *pte; 3435 vm_offset_t oldpteva; 3436 vm_page_t mpte; 3437 3438 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3439 3440 /* 3441 * Examine the first PTE in the specified PTP. Abort if this PTE is 3442 * either invalid, unused, or does not map the first 4KB physical page 3443 * within a 2- or 4MB page. 3444 */ 3445 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3446 setpde: 3447 newpde = *firstpte; 3448 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3449 pmap_pde_p_failures++; 3450 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3451 " in pmap %p", va, pmap); 3452 return; 3453 } 3454 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3455 pmap_pde_p_failures++; 3456 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3457 " in pmap %p", va, pmap); 3458 return; 3459 } 3460 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3461 /* 3462 * When PG_M is already clear, PG_RW can be cleared without 3463 * a TLB invalidation. 3464 */ 3465 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3466 ~PG_RW)) 3467 goto setpde; 3468 newpde &= ~PG_RW; 3469 } 3470 3471 /* 3472 * Examine each of the other PTEs in the specified PTP. Abort if this 3473 * PTE maps an unexpected 4KB physical page or does not have identical 3474 * characteristics to the first PTE. 
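* (The scan below runs from the last PTE in the page table page back
* toward firstpte, with "pa" precomputed to the expected final physical
* address and stepped down by PAGE_SIZE on each iteration.)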
3475 */ 3476 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3477 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3478 setpte: 3479 oldpte = *pte; 3480 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3481 pmap_pde_p_failures++; 3482 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3483 " in pmap %p", va, pmap); 3484 return; 3485 } 3486 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3487 /* 3488 * When PG_M is already clear, PG_RW can be cleared 3489 * without a TLB invalidation. 3490 */ 3491 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3492 oldpte & ~PG_RW)) 3493 goto setpte; 3494 oldpte &= ~PG_RW; 3495 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3496 (va & ~PDRMASK); 3497 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3498 " in pmap %p", oldpteva, pmap); 3499 } 3500 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3501 pmap_pde_p_failures++; 3502 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3503 " in pmap %p", va, pmap); 3504 return; 3505 } 3506 pa -= PAGE_SIZE; 3507 } 3508 3509 /* 3510 * Save the page table page in its current state until the PDE 3511 * mapping the superpage is demoted by pmap_demote_pde() or 3512 * destroyed by pmap_remove_pde(). 3513 */ 3514 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3515 KASSERT(mpte >= vm_page_array && 3516 mpte < &vm_page_array[vm_page_array_size], 3517 ("pmap_promote_pde: page table page is out of range")); 3518 KASSERT(mpte->pindex == va >> PDRSHIFT, 3519 ("pmap_promote_pde: page table page's pindex is wrong")); 3520 if (pmap_insert_pt_page(pmap, mpte)) { 3521 pmap_pde_p_failures++; 3522 CTR2(KTR_PMAP, 3523 "pmap_promote_pde: failure for va %#x in pmap %p", va, 3524 pmap); 3525 return; 3526 } 3527 3528 /* 3529 * Promote the pv entries. 3530 */ 3531 if ((newpde & PG_MANAGED) != 0) 3532 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3533 3534 /* 3535 * Propagate the PAT index to its proper position. 3536 */ 3537 if ((newpde & PG_PTE_PAT) != 0) 3538 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3539 3540 /* 3541 * Map the superpage. 3542 */ 3543 if (workaround_erratum383) 3544 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3545 else if (pmap == kernel_pmap) 3546 pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); 3547 else 3548 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 3549 3550 pmap_pde_promotions++; 3551 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3552 " in pmap %p", va, pmap); 3553 } 3554 #endif /* VM_NRESERVLEVEL > 0 */ 3555 3556 /* 3557 * Insert the given physical page (p) at 3558 * the specified virtual address (v) in the 3559 * target physical map with the protection requested. 3560 * 3561 * If specified, the page will be wired down, meaning 3562 * that the related pte can not be reclaimed. 3563 * 3564 * NB: This is the only routine which MAY NOT lazy-evaluate 3565 * or lose information. That is, this routine must actually 3566 * insert this page into the given map NOW. 
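* An illustrative call, for a wired read/write fault on a 4KB page:
*
*	(void)pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
*	    VM_PROT_WRITE | PMAP_ENTER_WIRED, 0);
*
* Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when a page table
* page cannot be allocated and PMAP_ENTER_NOSLEEP was requested.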
3567 */ 3568 int 3569 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3570 u_int flags, int8_t psind) 3571 { 3572 pd_entry_t *pde; 3573 pt_entry_t *pte; 3574 pt_entry_t newpte, origpte; 3575 pv_entry_t pv; 3576 vm_paddr_t opa, pa; 3577 vm_page_t mpte, om; 3578 int rv; 3579 3580 va = trunc_page(va); 3581 KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) || 3582 (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS), 3583 ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va)); 3584 KASSERT(va < PMAP_TRM_MIN_ADDRESS, 3585 ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)", 3586 va)); 3587 KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 || 3588 va < kmi.clean_sva || va >= kmi.clean_eva, 3589 ("pmap_enter: managed mapping within the clean submap")); 3590 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 3591 VM_OBJECT_ASSERT_LOCKED(m->object); 3592 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 3593 ("pmap_enter: flags %u has reserved bits set", flags)); 3594 pa = VM_PAGE_TO_PHYS(m); 3595 newpte = (pt_entry_t)(pa | PG_A | PG_V); 3596 if ((flags & VM_PROT_WRITE) != 0) 3597 newpte |= PG_M; 3598 if ((prot & VM_PROT_WRITE) != 0) 3599 newpte |= PG_RW; 3600 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 3601 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 3602 #if defined(PAE) || defined(PAE_TABLES) 3603 if ((prot & VM_PROT_EXECUTE) == 0) 3604 newpte |= pg_nx; 3605 #endif 3606 if ((flags & PMAP_ENTER_WIRED) != 0) 3607 newpte |= PG_W; 3608 if (pmap != kernel_pmap) 3609 newpte |= PG_U; 3610 newpte |= pmap_cache_bits(m->md.pat_mode, psind > 0); 3611 if ((m->oflags & VPO_UNMANAGED) == 0) 3612 newpte |= PG_MANAGED; 3613 3614 rw_wlock(&pvh_global_lock); 3615 PMAP_LOCK(pmap); 3616 sched_pin(); 3617 3618 pde = pmap_pde(pmap, va); 3619 if (pmap != kernel_pmap) { 3620 /* 3621 * va is for UVA. 3622 * In the case that a page table page is not resident, 3623 * we are creating it here. pmap_allocpte() handles 3624 * demotion. 3625 */ 3626 mpte = pmap_allocpte(pmap, va, flags); 3627 if (mpte == NULL) { 3628 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3629 ("pmap_allocpte failed with sleep allowed")); 3630 rv = KERN_RESOURCE_SHORTAGE; 3631 goto out; 3632 } 3633 } else { 3634 /* 3635 * va is for KVA, so pmap_demote_pde() will never fail 3636 * to install a page table page. PG_V is also 3637 * asserted by pmap_demote_pde(). 3638 */ 3639 mpte = NULL; 3640 KASSERT(pde != NULL && (*pde & PG_V) != 0, 3641 ("KVA %#x invalid pde pdir %#jx", va, 3642 (uintmax_t)pmap->pm_pdir[PTDPTDI])); 3643 if ((*pde & PG_PS) != 0) 3644 pmap_demote_pde(pmap, pde, va); 3645 } 3646 pte = pmap_pte_quick(pmap, va); 3647 3648 /* 3649 * Page Directory table entry is not valid, which should not 3650 * happen. We should have either allocated the page table 3651 * page or demoted the existing mapping above. 3652 */ 3653 if (pte == NULL) { 3654 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3655 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3656 } 3657 3658 origpte = *pte; 3659 pv = NULL; 3660 3661 /* 3662 * Is the specified virtual address already mapped? 3663 */ 3664 if ((origpte & PG_V) != 0) { 3665 /* 3666 * Wiring change, just update stats. We don't worry about 3667 * wiring PT pages as they remain resident as long as there 3668 * are valid mappings in them. Hence, if a user page is wired, 3669 * the PT page will be also. 
3670 */ 3671 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 3672 pmap->pm_stats.wired_count++; 3673 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 3674 pmap->pm_stats.wired_count--; 3675 3676 /* 3677 * Remove the extra PT page reference. 3678 */ 3679 if (mpte != NULL) { 3680 mpte->wire_count--; 3681 KASSERT(mpte->wire_count > 0, 3682 ("pmap_enter: missing reference to page table page," 3683 " va: 0x%x", va)); 3684 } 3685 3686 /* 3687 * Has the physical page changed? 3688 */ 3689 opa = origpte & PG_FRAME; 3690 if (opa == pa) { 3691 /* 3692 * No, might be a protection or wiring change. 3693 */ 3694 if ((origpte & PG_MANAGED) != 0 && 3695 (newpte & PG_RW) != 0) 3696 vm_page_aflag_set(m, PGA_WRITEABLE); 3697 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 3698 goto unchanged; 3699 goto validate; 3700 } 3701 3702 /* 3703 * The physical page has changed. Temporarily invalidate 3704 * the mapping. This ensures that all threads sharing the 3705 * pmap keep a consistent view of the mapping, which is 3706 * necessary for the correct handling of COW faults. It 3707 * also permits reuse of the old mapping's PV entry, 3708 * avoiding an allocation. 3709 * 3710 * For consistency, handle unmanaged mappings the same way. 3711 */ 3712 origpte = pte_load_clear(pte); 3713 KASSERT((origpte & PG_FRAME) == opa, 3714 ("pmap_enter: unexpected pa update for %#x", va)); 3715 if ((origpte & PG_MANAGED) != 0) { 3716 om = PHYS_TO_VM_PAGE(opa); 3717 3718 /* 3719 * The pmap lock is sufficient to synchronize with 3720 * concurrent calls to pmap_page_test_mappings() and 3721 * pmap_ts_referenced(). 3722 */ 3723 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3724 vm_page_dirty(om); 3725 if ((origpte & PG_A) != 0) 3726 vm_page_aflag_set(om, PGA_REFERENCED); 3727 pv = pmap_pvh_remove(&om->md, pmap, va); 3728 if ((newpte & PG_MANAGED) == 0) 3729 free_pv_entry(pmap, pv); 3730 if ((om->aflags & PGA_WRITEABLE) != 0 && 3731 TAILQ_EMPTY(&om->md.pv_list) && 3732 ((om->flags & PG_FICTITIOUS) != 0 || 3733 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3734 vm_page_aflag_clear(om, PGA_WRITEABLE); 3735 } 3736 if ((origpte & PG_A) != 0) 3737 pmap_invalidate_page(pmap, va); 3738 origpte = 0; 3739 } else { 3740 /* 3741 * Increment the counters. 3742 */ 3743 if ((newpte & PG_W) != 0) 3744 pmap->pm_stats.wired_count++; 3745 pmap->pm_stats.resident_count++; 3746 } 3747 3748 /* 3749 * Enter on the PV list if part of our managed memory. 3750 */ 3751 if ((newpte & PG_MANAGED) != 0) { 3752 if (pv == NULL) { 3753 pv = get_pv_entry(pmap, FALSE); 3754 pv->pv_va = va; 3755 } 3756 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3757 if ((newpte & PG_RW) != 0) 3758 vm_page_aflag_set(m, PGA_WRITEABLE); 3759 } 3760 3761 /* 3762 * Update the PTE. 3763 */ 3764 if ((origpte & PG_V) != 0) { 3765 validate: 3766 origpte = pte_load_store(pte, newpte); 3767 KASSERT((origpte & PG_FRAME) == pa, 3768 ("pmap_enter: unexpected pa update for %#x", va)); 3769 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 3770 (PG_M | PG_RW)) { 3771 if ((origpte & PG_MANAGED) != 0) 3772 vm_page_dirty(m); 3773 3774 /* 3775 * Although the PTE may still have PG_RW set, TLB 3776 * invalidation may nonetheless be required because 3777 * the PTE no longer has PG_M set. 3778 */ 3779 } 3780 #if defined(PAE) || defined(PAE_TABLES) 3781 else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 3782 /* 3783 * This PTE change does not require TLB invalidation. 
3784 */ 3785 goto unchanged; 3786 } 3787 #endif 3788 if ((origpte & PG_A) != 0) 3789 pmap_invalidate_page(pmap, va); 3790 } else 3791 pte_store(pte, newpte); 3792 3793 unchanged: 3794 3795 #if VM_NRESERVLEVEL > 0 3796 /* 3797 * If both the page table page and the reservation are fully 3798 * populated, then attempt promotion. 3799 */ 3800 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3801 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && 3802 vm_reserv_level_iffullpop(m) == 0) 3803 pmap_promote_pde(pmap, pde, va); 3804 #endif 3805 3806 rv = KERN_SUCCESS; 3807 out: 3808 sched_unpin(); 3809 rw_wunlock(&pvh_global_lock); 3810 PMAP_UNLOCK(pmap); 3811 return (rv); 3812 } 3813 3814 /* 3815 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and 3816 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 3817 * blocking, (2) a mapping already exists at the specified virtual address, or 3818 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 3819 */ 3820 static boolean_t 3821 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3822 { 3823 pd_entry_t *pde, newpde; 3824 3825 rw_assert(&pvh_global_lock, RA_WLOCKED); 3826 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3827 pde = pmap_pde(pmap, va); 3828 if (*pde != 0) { 3829 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3830 " in pmap %p", va, pmap); 3831 return (FALSE); 3832 } 3833 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3834 PG_PS | PG_V; 3835 if ((m->oflags & VPO_UNMANAGED) == 0) { 3836 newpde |= PG_MANAGED; 3837 3838 /* 3839 * Abort this mapping if its PV entry could not be created. 3840 */ 3841 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3842 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3843 " in pmap %p", va, pmap); 3844 return (FALSE); 3845 } 3846 } 3847 #if defined(PAE) || defined(PAE_TABLES) 3848 if ((prot & VM_PROT_EXECUTE) == 0) 3849 newpde |= pg_nx; 3850 #endif 3851 if (va < VM_MAXUSER_ADDRESS) 3852 newpde |= PG_U; 3853 3854 /* 3855 * Increment counters. 3856 */ 3857 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3858 3859 /* 3860 * Map the superpage. (This is not a promoted mapping; there will not 3861 * be any lingering 4KB page mappings in the TLB.) 3862 */ 3863 pde_store(pde, newpde); 3864 3865 pmap_pde_mappings++; 3866 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3867 " in pmap %p", va, pmap); 3868 return (TRUE); 3869 } 3870 3871 /* 3872 * Maps a sequence of resident pages belonging to the same object. 3873 * The sequence begins with the given page m_start. This page is 3874 * mapped at the given virtual address start. Each subsequent page is 3875 * mapped at a virtual address that is offset from start by the same 3876 * amount as the page is offset from m_start within the object. The 3877 * last page in the sequence is the page with the largest offset from 3878 * m_start that can be mapped at a virtual address less than the given 3879 * virtual address end. Not every virtual page between start and end 3880 * is mapped; only those for which a resident page exists with the 3881 * corresponding offset from m_start are mapped. 
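* When a 2/4MB-aligned run is backed by a fully populated superpage
* reservation (m->psind == 1) and pg_ps_enabled is set, the whole run
* is entered with a single pmap_enter_pde() call; otherwise each page
* is entered individually with pmap_enter_quick_locked().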
3882 */ 3883 void 3884 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3885 vm_page_t m_start, vm_prot_t prot) 3886 { 3887 vm_offset_t va; 3888 vm_page_t m, mpte; 3889 vm_pindex_t diff, psize; 3890 3891 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3892 3893 psize = atop(end - start); 3894 mpte = NULL; 3895 m = m_start; 3896 rw_wlock(&pvh_global_lock); 3897 PMAP_LOCK(pmap); 3898 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3899 va = start + ptoa(diff); 3900 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3901 m->psind == 1 && pg_ps_enabled && 3902 pmap_enter_pde(pmap, va, m, prot)) 3903 m = &m[NBPDR / PAGE_SIZE - 1]; 3904 else 3905 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3906 mpte); 3907 m = TAILQ_NEXT(m, listq); 3908 } 3909 rw_wunlock(&pvh_global_lock); 3910 PMAP_UNLOCK(pmap); 3911 } 3912 3913 /* 3914 * this code makes some *MAJOR* assumptions: 3915 * 1. Current pmap & pmap exists. 3916 * 2. Not wired. 3917 * 3. Read access. 3918 * 4. No page table pages. 3919 * but is *MUCH* faster than pmap_enter... 3920 */ 3921 3922 void 3923 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3924 { 3925 3926 rw_wlock(&pvh_global_lock); 3927 PMAP_LOCK(pmap); 3928 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3929 rw_wunlock(&pvh_global_lock); 3930 PMAP_UNLOCK(pmap); 3931 } 3932 3933 static vm_page_t 3934 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3935 vm_prot_t prot, vm_page_t mpte) 3936 { 3937 pt_entry_t *pte; 3938 vm_paddr_t pa; 3939 struct spglist free; 3940 3941 KASSERT(pmap != kernel_pmap || va < kmi.clean_sva || 3942 va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, 3943 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3944 rw_assert(&pvh_global_lock, RA_WLOCKED); 3945 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3946 3947 /* 3948 * In the case that a page table page is not 3949 * resident, we are creating it here. 3950 */ 3951 if (pmap != kernel_pmap) { 3952 u_int ptepindex; 3953 pd_entry_t ptepa; 3954 3955 /* 3956 * Calculate pagetable page index 3957 */ 3958 ptepindex = va >> PDRSHIFT; 3959 if (mpte && (mpte->pindex == ptepindex)) { 3960 mpte->wire_count++; 3961 } else { 3962 /* 3963 * Get the page directory entry 3964 */ 3965 ptepa = pmap->pm_pdir[ptepindex]; 3966 3967 /* 3968 * If the page table page is mapped, we just increment 3969 * the hold count, and activate it. 3970 */ 3971 if (ptepa) { 3972 if (ptepa & PG_PS) 3973 return (NULL); 3974 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 3975 mpte->wire_count++; 3976 } else { 3977 mpte = _pmap_allocpte(pmap, ptepindex, 3978 PMAP_ENTER_NOSLEEP); 3979 if (mpte == NULL) 3980 return (mpte); 3981 } 3982 } 3983 } else { 3984 mpte = NULL; 3985 } 3986 3987 sched_pin(); 3988 pte = pmap_pte_quick(pmap, va); 3989 if (*pte) { 3990 if (mpte != NULL) { 3991 mpte->wire_count--; 3992 mpte = NULL; 3993 } 3994 sched_unpin(); 3995 return (mpte); 3996 } 3997 3998 /* 3999 * Enter on the PV list if part of our managed memory. 
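 * If the pv entry cannot be allocated, the mapping is abandoned: the
 * reference taken above on the page table page is dropped and, if it
 * was the last one, the page table page is freed.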
4000 */ 4001 if ((m->oflags & VPO_UNMANAGED) == 0 && 4002 !pmap_try_insert_pv_entry(pmap, va, m)) { 4003 if (mpte != NULL) { 4004 SLIST_INIT(&free); 4005 if (pmap_unwire_ptp(pmap, mpte, &free)) { 4006 pmap_invalidate_page(pmap, va); 4007 vm_page_free_pages_toq(&free, true); 4008 } 4009 4010 mpte = NULL; 4011 } 4012 sched_unpin(); 4013 return (mpte); 4014 } 4015 4016 /* 4017 * Increment counters 4018 */ 4019 pmap->pm_stats.resident_count++; 4020 4021 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 4022 #if defined(PAE) || defined(PAE_TABLES) 4023 if ((prot & VM_PROT_EXECUTE) == 0) 4024 pa |= pg_nx; 4025 #endif 4026 4027 /* 4028 * Now validate mapping with RO protection 4029 */ 4030 if ((m->oflags & VPO_UNMANAGED) != 0) 4031 pte_store(pte, pa | PG_V | PG_U); 4032 else 4033 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 4034 sched_unpin(); 4035 return (mpte); 4036 } 4037 4038 /* 4039 * Make a temporary mapping for a physical address. This is only intended 4040 * to be used for panic dumps. 4041 */ 4042 void * 4043 pmap_kenter_temporary(vm_paddr_t pa, int i) 4044 { 4045 vm_offset_t va; 4046 4047 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 4048 pmap_kenter(va, pa); 4049 invlpg(va); 4050 return ((void *)crashdumpmap); 4051 } 4052 4053 /* 4054 * This code maps large physical mmap regions into the 4055 * processor address space. Note that some shortcuts 4056 * are taken, but the code works. 4057 */ 4058 void 4059 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4060 vm_pindex_t pindex, vm_size_t size) 4061 { 4062 pd_entry_t *pde; 4063 vm_paddr_t pa, ptepa; 4064 vm_page_t p; 4065 int pat_mode; 4066 4067 VM_OBJECT_ASSERT_WLOCKED(object); 4068 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4069 ("pmap_object_init_pt: non-device object")); 4070 if (pseflag && 4071 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 4072 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4073 return; 4074 p = vm_page_lookup(object, pindex); 4075 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4076 ("pmap_object_init_pt: invalid page %p", p)); 4077 pat_mode = p->md.pat_mode; 4078 4079 /* 4080 * Abort the mapping if the first page is not physically 4081 * aligned to a 2/4MB page boundary. 4082 */ 4083 ptepa = VM_PAGE_TO_PHYS(p); 4084 if (ptepa & (NBPDR - 1)) 4085 return; 4086 4087 /* 4088 * Skip the first page. Abort the mapping if the rest of 4089 * the pages are not physically contiguous or have differing 4090 * memory attributes. 4091 */ 4092 p = TAILQ_NEXT(p, listq); 4093 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4094 pa += PAGE_SIZE) { 4095 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4096 ("pmap_object_init_pt: invalid page %p", p)); 4097 if (pa != VM_PAGE_TO_PHYS(p) || 4098 pat_mode != p->md.pat_mode) 4099 return; 4100 p = TAILQ_NEXT(p, listq); 4101 } 4102 4103 /* 4104 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 4105 * "size" is a multiple of 2/4M, adding the PAT setting to 4106 * "pa" will not affect the termination of this loop. 4107 */ 4108 PMAP_LOCK(pmap); 4109 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 4110 size; pa += NBPDR) { 4111 pde = pmap_pde(pmap, addr); 4112 if (*pde == 0) { 4113 pde_store(pde, pa | PG_PS | PG_M | PG_A | 4114 PG_U | PG_RW | PG_V); 4115 pmap->pm_stats.resident_count += NBPDR / 4116 PAGE_SIZE; 4117 pmap_pde_mappings++; 4118 } 4119 /* Else continue on if the PDE is already valid. 
*/ 4120 addr += NBPDR; 4121 } 4122 PMAP_UNLOCK(pmap); 4123 } 4124 } 4125 4126 /* 4127 * Clear the wired attribute from the mappings for the specified range of 4128 * addresses in the given pmap. Every valid mapping within that range 4129 * must have the wired attribute set. In contrast, invalid mappings 4130 * cannot have the wired attribute set, so they are ignored. 4131 * 4132 * The wired attribute of the page table entry is not a hardware feature, 4133 * so there is no need to invalidate any TLB entries. 4134 */ 4135 void 4136 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4137 { 4138 vm_offset_t pdnxt; 4139 pd_entry_t *pde; 4140 pt_entry_t *pte; 4141 boolean_t pv_lists_locked; 4142 4143 if (pmap_is_current(pmap)) 4144 pv_lists_locked = FALSE; 4145 else { 4146 pv_lists_locked = TRUE; 4147 resume: 4148 rw_wlock(&pvh_global_lock); 4149 sched_pin(); 4150 } 4151 PMAP_LOCK(pmap); 4152 for (; sva < eva; sva = pdnxt) { 4153 pdnxt = (sva + NBPDR) & ~PDRMASK; 4154 if (pdnxt < sva) 4155 pdnxt = eva; 4156 pde = pmap_pde(pmap, sva); 4157 if ((*pde & PG_V) == 0) 4158 continue; 4159 if ((*pde & PG_PS) != 0) { 4160 if ((*pde & PG_W) == 0) 4161 panic("pmap_unwire: pde %#jx is missing PG_W", 4162 (uintmax_t)*pde); 4163 4164 /* 4165 * Are we unwiring the entire large page? If not, 4166 * demote the mapping and fall through. 4167 */ 4168 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 4169 /* 4170 * Regardless of whether a pde (or pte) is 32 4171 * or 64 bits in size, PG_W is among the least 4172 * significant 32 bits. 4173 */ 4174 atomic_clear_int((u_int *)pde, PG_W); 4175 pmap->pm_stats.wired_count -= NBPDR / 4176 PAGE_SIZE; 4177 continue; 4178 } else { 4179 if (!pv_lists_locked) { 4180 pv_lists_locked = TRUE; 4181 if (!rw_try_wlock(&pvh_global_lock)) { 4182 PMAP_UNLOCK(pmap); 4183 /* Repeat sva. */ 4184 goto resume; 4185 } 4186 sched_pin(); 4187 } 4188 if (!pmap_demote_pde(pmap, pde, sva)) 4189 panic("pmap_unwire: demotion failed"); 4190 } 4191 } 4192 if (pdnxt > eva) 4193 pdnxt = eva; 4194 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 4195 sva += PAGE_SIZE) { 4196 if ((*pte & PG_V) == 0) 4197 continue; 4198 if ((*pte & PG_W) == 0) 4199 panic("pmap_unwire: pte %#jx is missing PG_W", 4200 (uintmax_t)*pte); 4201 4202 /* 4203 * PG_W must be cleared atomically. Although the pmap 4204 * lock synchronizes access to PG_W, another processor 4205 * could be setting PG_M and/or PG_A concurrently. 4206 * 4207 * PG_W is among the least significant 32 bits. 4208 */ 4209 atomic_clear_int((u_int *)pte, PG_W); 4210 pmap->pm_stats.wired_count--; 4211 } 4212 } 4213 if (pv_lists_locked) { 4214 sched_unpin(); 4215 rw_wunlock(&pvh_global_lock); 4216 } 4217 PMAP_UNLOCK(pmap); 4218 } 4219 4220 4221 /* 4222 * Copy the range specified by src_addr/len 4223 * from the source map to the range dst_addr/len 4224 * in the destination map. 4225 * 4226 * This routine is only advisory and need not do anything. Since 4227 * current pmap is always the kernel pmap when executing in 4228 * kernel, and we do not copy from the kernel pmap to a user 4229 * pmap, this optimization is not usable in 4/4G full split i386 4230 * world. 
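 * Accordingly, the function below returns immediately unless
 * "dst_addr" equals "src_addr".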
4231 */ 4232 4233 void 4234 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4235 vm_offset_t src_addr) 4236 { 4237 struct spglist free; 4238 pt_entry_t *src_pte, *dst_pte, ptetemp; 4239 pd_entry_t srcptepaddr; 4240 vm_page_t dstmpte, srcmpte; 4241 vm_offset_t addr, end_addr, pdnxt; 4242 u_int ptepindex; 4243 4244 if (dst_addr != src_addr) 4245 return; 4246 4247 end_addr = src_addr + len; 4248 4249 rw_wlock(&pvh_global_lock); 4250 if (dst_pmap < src_pmap) { 4251 PMAP_LOCK(dst_pmap); 4252 PMAP_LOCK(src_pmap); 4253 } else { 4254 PMAP_LOCK(src_pmap); 4255 PMAP_LOCK(dst_pmap); 4256 } 4257 sched_pin(); 4258 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 4259 KASSERT(addr < PMAP_TRM_MIN_ADDRESS, 4260 ("pmap_copy: invalid to pmap_copy the trampoline")); 4261 4262 pdnxt = (addr + NBPDR) & ~PDRMASK; 4263 if (pdnxt < addr) 4264 pdnxt = end_addr; 4265 ptepindex = addr >> PDRSHIFT; 4266 4267 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 4268 if (srcptepaddr == 0) 4269 continue; 4270 4271 if (srcptepaddr & PG_PS) { 4272 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4273 continue; 4274 if (dst_pmap->pm_pdir[ptepindex] == 0 && 4275 ((srcptepaddr & PG_MANAGED) == 0 || 4276 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4277 PG_PS_FRAME))) { 4278 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 4279 ~PG_W; 4280 dst_pmap->pm_stats.resident_count += 4281 NBPDR / PAGE_SIZE; 4282 pmap_pde_mappings++; 4283 } 4284 continue; 4285 } 4286 4287 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 4288 KASSERT(srcmpte->wire_count > 0, 4289 ("pmap_copy: source page table page is unused")); 4290 4291 if (pdnxt > end_addr) 4292 pdnxt = end_addr; 4293 4294 src_pte = pmap_pte_quick3(src_pmap, addr); 4295 while (addr < pdnxt) { 4296 ptetemp = *src_pte; 4297 /* 4298 * we only virtual copy managed pages 4299 */ 4300 if ((ptetemp & PG_MANAGED) != 0) { 4301 dstmpte = pmap_allocpte(dst_pmap, addr, 4302 PMAP_ENTER_NOSLEEP); 4303 if (dstmpte == NULL) 4304 goto out; 4305 dst_pte = pmap_pte_quick(dst_pmap, addr); 4306 if (*dst_pte == 0 && 4307 pmap_try_insert_pv_entry(dst_pmap, addr, 4308 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 4309 /* 4310 * Clear the wired, modified, and 4311 * accessed (referenced) bits 4312 * during the copy. 4313 */ 4314 *dst_pte = ptetemp & ~(PG_W | PG_M | 4315 PG_A); 4316 dst_pmap->pm_stats.resident_count++; 4317 } else { 4318 SLIST_INIT(&free); 4319 if (pmap_unwire_ptp(dst_pmap, dstmpte, 4320 &free)) { 4321 pmap_invalidate_page(dst_pmap, 4322 addr); 4323 vm_page_free_pages_toq(&free, 4324 true); 4325 } 4326 goto out; 4327 } 4328 if (dstmpte->wire_count >= srcmpte->wire_count) 4329 break; 4330 } 4331 addr += PAGE_SIZE; 4332 src_pte++; 4333 } 4334 } 4335 out: 4336 sched_unpin(); 4337 rw_wunlock(&pvh_global_lock); 4338 PMAP_UNLOCK(src_pmap); 4339 PMAP_UNLOCK(dst_pmap); 4340 } 4341 4342 /* 4343 * Zero 1 page of virtual memory mapped from a hardware page by the caller. 4344 */ 4345 static __inline void 4346 pagezero(void *page) 4347 { 4348 #if defined(I686_CPU) 4349 if (cpu_class == CPUCLASS_686) { 4350 if (cpu_feature & CPUID_SSE2) 4351 sse2_pagezero(page); 4352 else 4353 i686_pagezero(page); 4354 } else 4355 #endif 4356 bzero(page, PAGE_SIZE); 4357 } 4358 4359 /* 4360 * Zero the specified hardware page. 
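 *
 * The page is zeroed through the per-CPU CMAP2 transient mapping.
 * The thread pins itself to its CPU first so that it cannot migrate
 * to another CPU, whose CMAP2 slot may map a different page.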
4361 */ 4362 void 4363 pmap_zero_page(vm_page_t m) 4364 { 4365 pt_entry_t *cmap_pte2; 4366 struct pcpu *pc; 4367 4368 sched_pin(); 4369 pc = get_pcpu(); 4370 cmap_pte2 = pc->pc_cmap_pte2; 4371 mtx_lock(&pc->pc_cmap_lock); 4372 if (*cmap_pte2) 4373 panic("pmap_zero_page: CMAP2 busy"); 4374 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4375 pmap_cache_bits(m->md.pat_mode, 0); 4376 invlcaddr(pc->pc_cmap_addr2); 4377 pagezero(pc->pc_cmap_addr2); 4378 *cmap_pte2 = 0; 4379 4380 /* 4381 * Unpin the thread before releasing the lock. Otherwise the thread 4382 * could be rescheduled while still bound to the current CPU, only 4383 * to unpin itself immediately upon resuming execution. 4384 */ 4385 sched_unpin(); 4386 mtx_unlock(&pc->pc_cmap_lock); 4387 } 4388 4389 /* 4390 * Zero an area within a single hardware page. off and size must not 4391 * cover an area beyond a single hardware page. 4392 */ 4393 void 4394 pmap_zero_page_area(vm_page_t m, int off, int size) 4395 { 4396 pt_entry_t *cmap_pte2; 4397 struct pcpu *pc; 4398 4399 sched_pin(); 4400 pc = get_pcpu(); 4401 cmap_pte2 = pc->pc_cmap_pte2; 4402 mtx_lock(&pc->pc_cmap_lock); 4403 if (*cmap_pte2) 4404 panic("pmap_zero_page_area: CMAP2 busy"); 4405 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4406 pmap_cache_bits(m->md.pat_mode, 0); 4407 invlcaddr(pc->pc_cmap_addr2); 4408 if (off == 0 && size == PAGE_SIZE) 4409 pagezero(pc->pc_cmap_addr2); 4410 else 4411 bzero(pc->pc_cmap_addr2 + off, size); 4412 *cmap_pte2 = 0; 4413 sched_unpin(); 4414 mtx_unlock(&pc->pc_cmap_lock); 4415 } 4416 4417 /* 4418 * Copy 1 specified hardware page to another. 4419 */ 4420 void 4421 pmap_copy_page(vm_page_t src, vm_page_t dst) 4422 { 4423 pt_entry_t *cmap_pte1, *cmap_pte2; 4424 struct pcpu *pc; 4425 4426 sched_pin(); 4427 pc = get_pcpu(); 4428 cmap_pte1 = pc->pc_cmap_pte1; 4429 cmap_pte2 = pc->pc_cmap_pte2; 4430 mtx_lock(&pc->pc_cmap_lock); 4431 if (*cmap_pte1) 4432 panic("pmap_copy_page: CMAP1 busy"); 4433 if (*cmap_pte2) 4434 panic("pmap_copy_page: CMAP2 busy"); 4435 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4436 pmap_cache_bits(src->md.pat_mode, 0); 4437 invlcaddr(pc->pc_cmap_addr1); 4438 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4439 pmap_cache_bits(dst->md.pat_mode, 0); 4440 invlcaddr(pc->pc_cmap_addr2); 4441 bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); 4442 *cmap_pte1 = 0; 4443 *cmap_pte2 = 0; 4444 sched_unpin(); 4445 mtx_unlock(&pc->pc_cmap_lock); 4446 } 4447 4448 int unmapped_buf_allowed = 1; 4449 4450 void 4451 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4452 vm_offset_t b_offset, int xfersize) 4453 { 4454 vm_page_t a_pg, b_pg; 4455 char *a_cp, *b_cp; 4456 vm_offset_t a_pg_offset, b_pg_offset; 4457 pt_entry_t *cmap_pte1, *cmap_pte2; 4458 struct pcpu *pc; 4459 int cnt; 4460 4461 sched_pin(); 4462 pc = get_pcpu(); 4463 cmap_pte1 = pc->pc_cmap_pte1; 4464 cmap_pte2 = pc->pc_cmap_pte2; 4465 mtx_lock(&pc->pc_cmap_lock); 4466 if (*cmap_pte1 != 0) 4467 panic("pmap_copy_pages: CMAP1 busy"); 4468 if (*cmap_pte2 != 0) 4469 panic("pmap_copy_pages: CMAP2 busy"); 4470 while (xfersize > 0) { 4471 a_pg = ma[a_offset >> PAGE_SHIFT]; 4472 a_pg_offset = a_offset & PAGE_MASK; 4473 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4474 b_pg = mb[b_offset >> PAGE_SHIFT]; 4475 b_pg_offset = b_offset & PAGE_MASK; 4476 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4477 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | 4478 pmap_cache_bits(a_pg->md.pat_mode, 0); 4479
invlcaddr(pc->pc_cmap_addr1); 4480 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | 4481 PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); 4482 invlcaddr(pc->pc_cmap_addr2); 4483 a_cp = pc->pc_cmap_addr1 + a_pg_offset; 4484 b_cp = pc->pc_cmap_addr2 + b_pg_offset; 4485 bcopy(a_cp, b_cp, cnt); 4486 a_offset += cnt; 4487 b_offset += cnt; 4488 xfersize -= cnt; 4489 } 4490 *cmap_pte1 = 0; 4491 *cmap_pte2 = 0; 4492 sched_unpin(); 4493 mtx_unlock(&pc->pc_cmap_lock); 4494 } 4495 4496 /* 4497 * Returns true if the pmap's pv is one of the first 4498 * 16 pvs linked to from this page. This count may 4499 * be changed upwards or downwards in the future; it 4500 * is only necessary that true be returned for a small 4501 * subset of pmaps for proper page aging. 4502 */ 4503 boolean_t 4504 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4505 { 4506 struct md_page *pvh; 4507 pv_entry_t pv; 4508 int loops = 0; 4509 boolean_t rv; 4510 4511 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4512 ("pmap_page_exists_quick: page %p is not managed", m)); 4513 rv = FALSE; 4514 rw_wlock(&pvh_global_lock); 4515 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4516 if (PV_PMAP(pv) == pmap) { 4517 rv = TRUE; 4518 break; 4519 } 4520 loops++; 4521 if (loops >= 16) 4522 break; 4523 } 4524 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4525 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4526 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4527 if (PV_PMAP(pv) == pmap) { 4528 rv = TRUE; 4529 break; 4530 } 4531 loops++; 4532 if (loops >= 16) 4533 break; 4534 } 4535 } 4536 rw_wunlock(&pvh_global_lock); 4537 return (rv); 4538 } 4539 4540 /* 4541 * pmap_page_wired_mappings: 4542 * 4543 * Return the number of managed mappings to the given physical page 4544 * that are wired. 4545 */ 4546 int 4547 pmap_page_wired_mappings(vm_page_t m) 4548 { 4549 int count; 4550 4551 count = 0; 4552 if ((m->oflags & VPO_UNMANAGED) != 0) 4553 return (count); 4554 rw_wlock(&pvh_global_lock); 4555 count = pmap_pvh_wired_mappings(&m->md, count); 4556 if ((m->flags & PG_FICTITIOUS) == 0) { 4557 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 4558 count); 4559 } 4560 rw_wunlock(&pvh_global_lock); 4561 return (count); 4562 } 4563 4564 /* 4565 * pmap_pvh_wired_mappings: 4566 * 4567 * Return the updated number "count" of managed mappings that are wired. 4568 */ 4569 static int 4570 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4571 { 4572 pmap_t pmap; 4573 pt_entry_t *pte; 4574 pv_entry_t pv; 4575 4576 rw_assert(&pvh_global_lock, RA_WLOCKED); 4577 sched_pin(); 4578 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4579 pmap = PV_PMAP(pv); 4580 PMAP_LOCK(pmap); 4581 pte = pmap_pte_quick(pmap, pv->pv_va); 4582 if ((*pte & PG_W) != 0) 4583 count++; 4584 PMAP_UNLOCK(pmap); 4585 } 4586 sched_unpin(); 4587 return (count); 4588 } 4589 4590 /* 4591 * Returns TRUE if the given page is mapped individually or as part of 4592 * a 4mpage. Otherwise, returns FALSE. 4593 */ 4594 boolean_t 4595 pmap_page_is_mapped(vm_page_t m) 4596 { 4597 boolean_t rv; 4598 4599 if ((m->oflags & VPO_UNMANAGED) != 0) 4600 return (FALSE); 4601 rw_wlock(&pvh_global_lock); 4602 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4603 ((m->flags & PG_FICTITIOUS) == 0 && 4604 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4605 rw_wunlock(&pvh_global_lock); 4606 return (rv); 4607 } 4608 4609 /* 4610 * Remove all pages from specified address space 4611 * this aids process exit speeds. 
Also, this code 4612 * is special cased for current process only, but 4613 * can have the more generic (and slightly slower) 4614 * mode enabled. This is much faster than pmap_remove 4615 * in the case of running down an entire address space. 4616 */ 4617 void 4618 pmap_remove_pages(pmap_t pmap) 4619 { 4620 pt_entry_t *pte, tpte; 4621 vm_page_t m, mpte, mt; 4622 pv_entry_t pv; 4623 struct md_page *pvh; 4624 struct pv_chunk *pc, *npc; 4625 struct spglist free; 4626 int field, idx; 4627 int32_t bit; 4628 uint32_t inuse, bitmask; 4629 int allfree; 4630 4631 if (pmap != PCPU_GET(curpmap)) { 4632 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4633 return; 4634 } 4635 SLIST_INIT(&free); 4636 rw_wlock(&pvh_global_lock); 4637 PMAP_LOCK(pmap); 4638 sched_pin(); 4639 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4640 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, 4641 pc->pc_pmap)); 4642 allfree = 1; 4643 for (field = 0; field < _NPCM; field++) { 4644 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4645 while (inuse != 0) { 4646 bit = bsfl(inuse); 4647 bitmask = 1UL << bit; 4648 idx = field * 32 + bit; 4649 pv = &pc->pc_pventry[idx]; 4650 inuse &= ~bitmask; 4651 4652 pte = pmap_pde(pmap, pv->pv_va); 4653 tpte = *pte; 4654 if ((tpte & PG_PS) == 0) { 4655 pte = pmap_pte_quick(pmap, pv->pv_va); 4656 tpte = *pte & ~PG_PTE_PAT; 4657 } 4658 4659 if (tpte == 0) { 4660 printf( 4661 "TPTE at %p IS ZERO @ VA %08x\n", 4662 pte, pv->pv_va); 4663 panic("bad pte"); 4664 } 4665 4666 /* 4667 * We cannot remove wired pages from a process' mapping at this time 4668 */ 4669 if (tpte & PG_W) { 4670 allfree = 0; 4671 continue; 4672 } 4673 4674 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4675 KASSERT(m->phys_addr == (tpte & PG_FRAME), 4676 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4677 m, (uintmax_t)m->phys_addr, 4678 (uintmax_t)tpte)); 4679 4680 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4681 m < &vm_page_array[vm_page_array_size], 4682 ("pmap_remove_pages: bad tpte %#jx", 4683 (uintmax_t)tpte)); 4684 4685 pte_clear(pte); 4686 4687 /* 4688 * Update the vm_page_t clean/reference bits. 
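 * A modified 2/4MB mapping dirties every 4KB page that it covers, so
 * the superpage case below calls vm_page_dirty() on all NBPDR /
 * PAGE_SIZE constituent pages.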
4689 */ 4690 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4691 if ((tpte & PG_PS) != 0) { 4692 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4693 vm_page_dirty(mt); 4694 } else 4695 vm_page_dirty(m); 4696 } 4697 4698 /* Mark free */ 4699 PV_STAT(pv_entry_frees++); 4700 PV_STAT(pv_entry_spare++); 4701 pv_entry_count--; 4702 pc->pc_map[field] |= bitmask; 4703 if ((tpte & PG_PS) != 0) { 4704 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 4705 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4706 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4707 if (TAILQ_EMPTY(&pvh->pv_list)) { 4708 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4709 if (TAILQ_EMPTY(&mt->md.pv_list)) 4710 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4711 } 4712 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 4713 if (mpte != NULL) { 4714 pmap->pm_stats.resident_count--; 4715 KASSERT(mpte->wire_count == NPTEPG, 4716 ("pmap_remove_pages: pte page wire count error")); 4717 mpte->wire_count = 0; 4718 pmap_add_delayed_free_list(mpte, &free, FALSE); 4719 } 4720 } else { 4721 pmap->pm_stats.resident_count--; 4722 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4723 if (TAILQ_EMPTY(&m->md.pv_list) && 4724 (m->flags & PG_FICTITIOUS) == 0) { 4725 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4726 if (TAILQ_EMPTY(&pvh->pv_list)) 4727 vm_page_aflag_clear(m, PGA_WRITEABLE); 4728 } 4729 pmap_unuse_pt(pmap, pv->pv_va, &free); 4730 } 4731 } 4732 } 4733 if (allfree) { 4734 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4735 free_pv_chunk(pc); 4736 } 4737 } 4738 sched_unpin(); 4739 pmap_invalidate_all(pmap); 4740 rw_wunlock(&pvh_global_lock); 4741 PMAP_UNLOCK(pmap); 4742 vm_page_free_pages_toq(&free, true); 4743 } 4744 4745 /* 4746 * pmap_is_modified: 4747 * 4748 * Return whether or not the specified physical page was modified 4749 * in any physical maps. 4750 */ 4751 boolean_t 4752 pmap_is_modified(vm_page_t m) 4753 { 4754 boolean_t rv; 4755 4756 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4757 ("pmap_is_modified: page %p is not managed", m)); 4758 4759 /* 4760 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 4761 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 4762 * is clear, no PTEs can have PG_M set. 4763 */ 4764 VM_OBJECT_ASSERT_WLOCKED(m->object); 4765 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 4766 return (FALSE); 4767 rw_wlock(&pvh_global_lock); 4768 rv = pmap_is_modified_pvh(&m->md) || 4769 ((m->flags & PG_FICTITIOUS) == 0 && 4770 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4771 rw_wunlock(&pvh_global_lock); 4772 return (rv); 4773 } 4774 4775 /* 4776 * Returns TRUE if any of the given mappings were used to modify 4777 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4778 * mappings are supported. 4779 */ 4780 static boolean_t 4781 pmap_is_modified_pvh(struct md_page *pvh) 4782 { 4783 pv_entry_t pv; 4784 pt_entry_t *pte; 4785 pmap_t pmap; 4786 boolean_t rv; 4787 4788 rw_assert(&pvh_global_lock, RA_WLOCKED); 4789 rv = FALSE; 4790 sched_pin(); 4791 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4792 pmap = PV_PMAP(pv); 4793 PMAP_LOCK(pmap); 4794 pte = pmap_pte_quick(pmap, pv->pv_va); 4795 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); 4796 PMAP_UNLOCK(pmap); 4797 if (rv) 4798 break; 4799 } 4800 sched_unpin(); 4801 return (rv); 4802 } 4803 4804 /* 4805 * pmap_is_prefaultable: 4806 * 4807 * Return whether or not the specified virtual address is elgible 4808 * for prefault. 
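 *
 * The address is considered prefaultable only when its page directory
 * entry is already valid, is not a 2/4MB mapping, and the
 * corresponding 4KB page table entry is still empty.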
4809 */ 4810 boolean_t 4811 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4812 { 4813 pd_entry_t pde; 4814 boolean_t rv; 4815 4816 rv = FALSE; 4817 PMAP_LOCK(pmap); 4818 pde = *pmap_pde(pmap, addr); 4819 if (pde != 0 && (pde & PG_PS) == 0) 4820 rv = pmap_pte_ufast(pmap, addr, pde) == 0; 4821 PMAP_UNLOCK(pmap); 4822 return (rv); 4823 } 4824 4825 /* 4826 * pmap_is_referenced: 4827 * 4828 * Return whether or not the specified physical page was referenced 4829 * in any physical maps. 4830 */ 4831 boolean_t 4832 pmap_is_referenced(vm_page_t m) 4833 { 4834 boolean_t rv; 4835 4836 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4837 ("pmap_is_referenced: page %p is not managed", m)); 4838 rw_wlock(&pvh_global_lock); 4839 rv = pmap_is_referenced_pvh(&m->md) || 4840 ((m->flags & PG_FICTITIOUS) == 0 && 4841 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4842 rw_wunlock(&pvh_global_lock); 4843 return (rv); 4844 } 4845 4846 /* 4847 * Returns TRUE if any of the given mappings were referenced and FALSE 4848 * otherwise. Both page and 4mpage mappings are supported. 4849 */ 4850 static boolean_t 4851 pmap_is_referenced_pvh(struct md_page *pvh) 4852 { 4853 pv_entry_t pv; 4854 pt_entry_t *pte; 4855 pmap_t pmap; 4856 boolean_t rv; 4857 4858 rw_assert(&pvh_global_lock, RA_WLOCKED); 4859 rv = FALSE; 4860 sched_pin(); 4861 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4862 pmap = PV_PMAP(pv); 4863 PMAP_LOCK(pmap); 4864 pte = pmap_pte_quick(pmap, pv->pv_va); 4865 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); 4866 PMAP_UNLOCK(pmap); 4867 if (rv) 4868 break; 4869 } 4870 sched_unpin(); 4871 return (rv); 4872 } 4873 4874 /* 4875 * Clear the write and modified bits in each of the given page's mappings. 4876 */ 4877 void 4878 pmap_remove_write(vm_page_t m) 4879 { 4880 struct md_page *pvh; 4881 pv_entry_t next_pv, pv; 4882 pmap_t pmap; 4883 pd_entry_t *pde; 4884 pt_entry_t oldpte, *pte; 4885 vm_offset_t va; 4886 4887 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4888 ("pmap_remove_write: page %p is not managed", m)); 4889 4890 /* 4891 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 4892 * set by another thread while the object is locked. Thus, 4893 * if PGA_WRITEABLE is clear, no page table entries need updating. 4894 */ 4895 VM_OBJECT_ASSERT_WLOCKED(m->object); 4896 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 4897 return; 4898 rw_wlock(&pvh_global_lock); 4899 sched_pin(); 4900 if ((m->flags & PG_FICTITIOUS) != 0) 4901 goto small_mappings; 4902 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4903 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4904 va = pv->pv_va; 4905 pmap = PV_PMAP(pv); 4906 PMAP_LOCK(pmap); 4907 pde = pmap_pde(pmap, va); 4908 if ((*pde & PG_RW) != 0) 4909 (void)pmap_demote_pde(pmap, pde, va); 4910 PMAP_UNLOCK(pmap); 4911 } 4912 small_mappings: 4913 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4914 pmap = PV_PMAP(pv); 4915 PMAP_LOCK(pmap); 4916 pde = pmap_pde(pmap, pv->pv_va); 4917 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" 4918 " a 4mpage in page %p's pv list", m)); 4919 pte = pmap_pte_quick(pmap, pv->pv_va); 4920 retry: 4921 oldpte = *pte; 4922 if ((oldpte & PG_RW) != 0) { 4923 /* 4924 * Regardless of whether a pte is 32 or 64 bits 4925 * in size, PG_RW and PG_M are among the least 4926 * significant 32 bits. 
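 *
 * The compare-and-set below is retried because another processor may
 * set PG_M and/or PG_A in the PTE concurrently; capturing the old
 * value ensures that a just-set modified bit is noticed and handed to
 * vm_page_dirty() rather than lost.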
4927 */ 4928 if (!atomic_cmpset_int((u_int *)pte, oldpte, 4929 oldpte & ~(PG_RW | PG_M))) 4930 goto retry; 4931 if ((oldpte & PG_M) != 0) 4932 vm_page_dirty(m); 4933 pmap_invalidate_page(pmap, pv->pv_va); 4934 } 4935 PMAP_UNLOCK(pmap); 4936 } 4937 vm_page_aflag_clear(m, PGA_WRITEABLE); 4938 sched_unpin(); 4939 rw_wunlock(&pvh_global_lock); 4940 } 4941 4942 /* 4943 * pmap_ts_referenced: 4944 * 4945 * Return a count of reference bits for a page, clearing those bits. 4946 * It is not necessary for every reference bit to be cleared, but it 4947 * is necessary that 0 only be returned when there are truly no 4948 * reference bits set. 4949 * 4950 * As an optimization, update the page's dirty field if a modified bit is 4951 * found while counting reference bits. This opportunistic update can be 4952 * performed at low cost and can eliminate the need for some future calls 4953 * to pmap_is_modified(). However, since this function stops after 4954 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4955 * dirty pages. Those dirty pages will only be detected by a future call 4956 * to pmap_is_modified(). 4957 */ 4958 int 4959 pmap_ts_referenced(vm_page_t m) 4960 { 4961 struct md_page *pvh; 4962 pv_entry_t pv, pvf; 4963 pmap_t pmap; 4964 pd_entry_t *pde; 4965 pt_entry_t *pte; 4966 vm_paddr_t pa; 4967 int rtval = 0; 4968 4969 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4970 ("pmap_ts_referenced: page %p is not managed", m)); 4971 pa = VM_PAGE_TO_PHYS(m); 4972 pvh = pa_to_pvh(pa); 4973 rw_wlock(&pvh_global_lock); 4974 sched_pin(); 4975 if ((m->flags & PG_FICTITIOUS) != 0 || 4976 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4977 goto small_mappings; 4978 pv = pvf; 4979 do { 4980 pmap = PV_PMAP(pv); 4981 PMAP_LOCK(pmap); 4982 pde = pmap_pde(pmap, pv->pv_va); 4983 if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4984 /* 4985 * Although "*pde" is mapping a 2/4MB page, because 4986 * this function is called at a 4KB page granularity, 4987 * we only update the 4KB page under test. 4988 */ 4989 vm_page_dirty(m); 4990 } 4991 if ((*pde & PG_A) != 0) { 4992 /* 4993 * Since this reference bit is shared by either 1024 4994 * or 512 4KB pages, it should not be cleared every 4995 * time it is tested. Apply a simple "hash" function 4996 * on the physical page number, the virtual superpage 4997 * number, and the pmap address to select one 4KB page 4998 * out of the 1024 or 512 on which testing the 4999 * reference bit will result in clearing that bit. 5000 * This function is designed to avoid the selection of 5001 * the same 4KB page for every 2- or 4MB page mapping. 5002 * 5003 * On demotion, a mapping that hasn't been referenced 5004 * is simply destroyed. To avoid the possibility of a 5005 * subsequent page fault on a demoted wired mapping, 5006 * always leave its reference bit set. Moreover, 5007 * since the superpage is wired, the current state of 5008 * its reference bit won't affect page replacement. 5009 */ 5010 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 5011 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 5012 (*pde & PG_W) == 0) { 5013 atomic_clear_int((u_int *)pde, PG_A); 5014 pmap_invalidate_page(pmap, pv->pv_va); 5015 } 5016 rtval++; 5017 } 5018 PMAP_UNLOCK(pmap); 5019 /* Rotate the PV list if it has more than one entry. 
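   Moving the entry just examined to the tail means that, if the scan
   stops early after PMAP_TS_REFERENCED_MAX references, a later call
   starts with mappings that were not just sampled.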
*/ 5020 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5021 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5022 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5023 } 5024 if (rtval >= PMAP_TS_REFERENCED_MAX) 5025 goto out; 5026 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5027 small_mappings: 5028 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5029 goto out; 5030 pv = pvf; 5031 do { 5032 pmap = PV_PMAP(pv); 5033 PMAP_LOCK(pmap); 5034 pde = pmap_pde(pmap, pv->pv_va); 5035 KASSERT((*pde & PG_PS) == 0, 5036 ("pmap_ts_referenced: found a 4mpage in page %p's pv list", 5037 m)); 5038 pte = pmap_pte_quick(pmap, pv->pv_va); 5039 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5040 vm_page_dirty(m); 5041 if ((*pte & PG_A) != 0) { 5042 atomic_clear_int((u_int *)pte, PG_A); 5043 pmap_invalidate_page(pmap, pv->pv_va); 5044 rtval++; 5045 } 5046 PMAP_UNLOCK(pmap); 5047 /* Rotate the PV list if it has more than one entry. */ 5048 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5049 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5050 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5051 } 5052 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5053 PMAP_TS_REFERENCED_MAX); 5054 out: 5055 sched_unpin(); 5056 rw_wunlock(&pvh_global_lock); 5057 return (rtval); 5058 } 5059 5060 /* 5061 * Apply the given advice to the specified range of addresses within the 5062 * given pmap. Depending on the advice, clear the referenced and/or 5063 * modified flags in each mapping and set the mapped page's dirty field. 5064 */ 5065 void 5066 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5067 { 5068 pd_entry_t oldpde, *pde; 5069 pt_entry_t *pte; 5070 vm_offset_t va, pdnxt; 5071 vm_page_t m; 5072 boolean_t anychanged, pv_lists_locked; 5073 5074 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5075 return; 5076 if (pmap_is_current(pmap)) 5077 pv_lists_locked = FALSE; 5078 else { 5079 pv_lists_locked = TRUE; 5080 resume: 5081 rw_wlock(&pvh_global_lock); 5082 sched_pin(); 5083 } 5084 anychanged = FALSE; 5085 PMAP_LOCK(pmap); 5086 for (; sva < eva; sva = pdnxt) { 5087 pdnxt = (sva + NBPDR) & ~PDRMASK; 5088 if (pdnxt < sva) 5089 pdnxt = eva; 5090 pde = pmap_pde(pmap, sva); 5091 oldpde = *pde; 5092 if ((oldpde & PG_V) == 0) 5093 continue; 5094 else if ((oldpde & PG_PS) != 0) { 5095 if ((oldpde & PG_MANAGED) == 0) 5096 continue; 5097 if (!pv_lists_locked) { 5098 pv_lists_locked = TRUE; 5099 if (!rw_try_wlock(&pvh_global_lock)) { 5100 if (anychanged) 5101 pmap_invalidate_all(pmap); 5102 PMAP_UNLOCK(pmap); 5103 goto resume; 5104 } 5105 sched_pin(); 5106 } 5107 if (!pmap_demote_pde(pmap, pde, sva)) { 5108 /* 5109 * The large page mapping was destroyed. 5110 */ 5111 continue; 5112 } 5113 5114 /* 5115 * Unless the page mappings are wired, remove the 5116 * mapping to a single page so that a subsequent 5117 * access may repromote. Since the underlying page 5118 * table page is fully populated, this removal never 5119 * frees a page table page. 
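 * (Accordingly, pmap_remove_pte() is called below with a NULL free
 * page list.)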
5120 */ 5121 if ((oldpde & PG_W) == 0) { 5122 pte = pmap_pte_quick(pmap, sva); 5123 KASSERT((*pte & PG_V) != 0, 5124 ("pmap_advise: invalid PTE")); 5125 pmap_remove_pte(pmap, pte, sva, NULL); 5126 anychanged = TRUE; 5127 } 5128 } 5129 if (pdnxt > eva) 5130 pdnxt = eva; 5131 va = pdnxt; 5132 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 5133 sva += PAGE_SIZE) { 5134 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 5135 goto maybe_invlrng; 5136 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5137 if (advice == MADV_DONTNEED) { 5138 /* 5139 * Future calls to pmap_is_modified() 5140 * can be avoided by making the page 5141 * dirty now. 5142 */ 5143 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 5144 vm_page_dirty(m); 5145 } 5146 atomic_clear_int((u_int *)pte, PG_M | PG_A); 5147 } else if ((*pte & PG_A) != 0) 5148 atomic_clear_int((u_int *)pte, PG_A); 5149 else 5150 goto maybe_invlrng; 5151 if ((*pte & PG_G) != 0) { 5152 if (va == pdnxt) 5153 va = sva; 5154 } else 5155 anychanged = TRUE; 5156 continue; 5157 maybe_invlrng: 5158 if (va != pdnxt) { 5159 pmap_invalidate_range(pmap, va, sva); 5160 va = pdnxt; 5161 } 5162 } 5163 if (va != pdnxt) 5164 pmap_invalidate_range(pmap, va, sva); 5165 } 5166 if (anychanged) 5167 pmap_invalidate_all(pmap); 5168 if (pv_lists_locked) { 5169 sched_unpin(); 5170 rw_wunlock(&pvh_global_lock); 5171 } 5172 PMAP_UNLOCK(pmap); 5173 } 5174 5175 /* 5176 * Clear the modify bits on the specified physical page. 5177 */ 5178 void 5179 pmap_clear_modify(vm_page_t m) 5180 { 5181 struct md_page *pvh; 5182 pv_entry_t next_pv, pv; 5183 pmap_t pmap; 5184 pd_entry_t oldpde, *pde; 5185 pt_entry_t oldpte, *pte; 5186 vm_offset_t va; 5187 5188 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5189 ("pmap_clear_modify: page %p is not managed", m)); 5190 VM_OBJECT_ASSERT_WLOCKED(m->object); 5191 KASSERT(!vm_page_xbusied(m), 5192 ("pmap_clear_modify: page %p is exclusive busied", m)); 5193 5194 /* 5195 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 5196 * If the object containing the page is locked and the page is not 5197 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 5198 */ 5199 if ((m->aflags & PGA_WRITEABLE) == 0) 5200 return; 5201 rw_wlock(&pvh_global_lock); 5202 sched_pin(); 5203 if ((m->flags & PG_FICTITIOUS) != 0) 5204 goto small_mappings; 5205 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5206 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5207 va = pv->pv_va; 5208 pmap = PV_PMAP(pv); 5209 PMAP_LOCK(pmap); 5210 pde = pmap_pde(pmap, va); 5211 oldpde = *pde; 5212 if ((oldpde & PG_RW) != 0) { 5213 if (pmap_demote_pde(pmap, pde, va)) { 5214 if ((oldpde & PG_W) == 0) { 5215 /* 5216 * Write protect the mapping to a 5217 * single page so that a subsequent 5218 * write access may repromote. 5219 */ 5220 va += VM_PAGE_TO_PHYS(m) - (oldpde & 5221 PG_PS_FRAME); 5222 pte = pmap_pte_quick(pmap, va); 5223 oldpte = *pte; 5224 if ((oldpte & PG_V) != 0) { 5225 /* 5226 * Regardless of whether a pte is 32 or 64 bits 5227 * in size, PG_RW and PG_M are among the least 5228 * significant 32 bits. 
5229 */ 5230 while (!atomic_cmpset_int((u_int *)pte, 5231 oldpte, 5232 oldpte & ~(PG_M | PG_RW))) 5233 oldpte = *pte; 5234 vm_page_dirty(m); 5235 pmap_invalidate_page(pmap, va); 5236 } 5237 } 5238 } 5239 } 5240 PMAP_UNLOCK(pmap); 5241 } 5242 small_mappings: 5243 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5244 pmap = PV_PMAP(pv); 5245 PMAP_LOCK(pmap); 5246 pde = pmap_pde(pmap, pv->pv_va); 5247 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 5248 " a 4mpage in page %p's pv list", m)); 5249 pte = pmap_pte_quick(pmap, pv->pv_va); 5250 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5251 /* 5252 * Regardless of whether a pte is 32 or 64 bits 5253 * in size, PG_M is among the least significant 5254 * 32 bits. 5255 */ 5256 atomic_clear_int((u_int *)pte, PG_M); 5257 pmap_invalidate_page(pmap, pv->pv_va); 5258 } 5259 PMAP_UNLOCK(pmap); 5260 } 5261 sched_unpin(); 5262 rw_wunlock(&pvh_global_lock); 5263 } 5264 5265 /* 5266 * Miscellaneous support routines follow 5267 */ 5268 5269 /* Adjust the cache mode for a 4KB page mapped via a PTE. */ 5270 static __inline void 5271 pmap_pte_attr(pt_entry_t *pte, int cache_bits) 5272 { 5273 u_int opte, npte; 5274 5275 /* 5276 * The cache mode bits are all in the low 32-bits of the 5277 * PTE, so we can just spin on updating the low 32-bits. 5278 */ 5279 do { 5280 opte = *(u_int *)pte; 5281 npte = opte & ~PG_PTE_CACHE; 5282 npte |= cache_bits; 5283 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 5284 } 5285 5286 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 5287 static __inline void 5288 pmap_pde_attr(pd_entry_t *pde, int cache_bits) 5289 { 5290 u_int opde, npde; 5291 5292 /* 5293 * The cache mode bits are all in the low 32-bits of the 5294 * PDE, so we can just spin on updating the low 32-bits. 5295 */ 5296 do { 5297 opde = *(u_int *)pde; 5298 npde = opde & ~PG_PDE_CACHE; 5299 npde |= cache_bits; 5300 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 5301 } 5302 5303 /* 5304 * Map a set of physical memory pages into the kernel virtual 5305 * address space. Return a pointer to where it is mapped. This 5306 * routine is intended to be used for mapping device memory, 5307 * NOT real memory. 5308 */ 5309 void * 5310 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5311 { 5312 struct pmap_preinit_mapping *ppim; 5313 vm_offset_t va, offset; 5314 vm_size_t tmpsize; 5315 int i; 5316 5317 offset = pa & PAGE_MASK; 5318 size = round_page(offset + size); 5319 pa = pa & PG_FRAME; 5320 5321 if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) 5322 va = pa + PMAP_MAP_LOW; 5323 else if (!pmap_initialized) { 5324 va = 0; 5325 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5326 ppim = pmap_preinit_mapping + i; 5327 if (ppim->va == 0) { 5328 ppim->pa = pa; 5329 ppim->sz = size; 5330 ppim->mode = mode; 5331 ppim->va = virtual_avail; 5332 virtual_avail += size; 5333 va = ppim->va; 5334 break; 5335 } 5336 } 5337 if (va == 0) 5338 panic("%s: too many preinit mappings", __func__); 5339 } else { 5340 /* 5341 * If we have a preinit mapping, re-use it. 
5342 */ 5343 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5344 ppim = pmap_preinit_mapping + i; 5345 if (ppim->pa == pa && ppim->sz == size && 5346 ppim->mode == mode) 5347 return ((void *)(ppim->va + offset)); 5348 } 5349 va = kva_alloc(size); 5350 if (va == 0) 5351 panic("%s: Couldn't allocate KVA", __func__); 5352 } 5353 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 5354 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 5355 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 5356 pmap_invalidate_cache_range(va, va + size, FALSE); 5357 return ((void *)(va + offset)); 5358 } 5359 5360 void * 5361 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5362 { 5363 5364 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5365 } 5366 5367 void * 5368 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5369 { 5370 5371 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5372 } 5373 5374 void 5375 pmap_unmapdev(vm_offset_t va, vm_size_t size) 5376 { 5377 struct pmap_preinit_mapping *ppim; 5378 vm_offset_t offset; 5379 int i; 5380 5381 if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE) 5382 return; 5383 offset = va & PAGE_MASK; 5384 size = round_page(offset + size); 5385 va = trunc_page(va); 5386 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5387 ppim = pmap_preinit_mapping + i; 5388 if (ppim->va == va && ppim->sz == size) { 5389 if (pmap_initialized) 5390 return; 5391 ppim->pa = 0; 5392 ppim->va = 0; 5393 ppim->sz = 0; 5394 ppim->mode = 0; 5395 if (va + size == virtual_avail) 5396 virtual_avail = va; 5397 return; 5398 } 5399 } 5400 if (pmap_initialized) 5401 kva_free(va, size); 5402 } 5403 5404 /* 5405 * Sets the memory attribute for the specified page. 5406 */ 5407 void 5408 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5409 { 5410 5411 m->md.pat_mode = ma; 5412 if ((m->flags & PG_FICTITIOUS) != 0) 5413 return; 5414 5415 /* 5416 * If "m" is a normal page, flush it from the cache. 5417 * See pmap_invalidate_cache_range(). 5418 * 5419 * First, try to find an existing mapping of the page by sf 5420 * buffer. sf_buf_invalidate_cache() modifies mapping and 5421 * flushes the cache. 5422 */ 5423 if (sf_buf_invalidate_cache(m)) 5424 return; 5425 5426 /* 5427 * If page is not mapped by sf buffer, but CPU does not 5428 * support self snoop, map the page transient and do 5429 * invalidation. In the worst case, whole cache is flushed by 5430 * pmap_invalidate_cache_range(). 5431 */ 5432 if ((cpu_feature & CPUID_SS) == 0) 5433 pmap_flush_page(m); 5434 } 5435 5436 static void 5437 pmap_flush_page(vm_page_t m) 5438 { 5439 pt_entry_t *cmap_pte2; 5440 struct pcpu *pc; 5441 vm_offset_t sva, eva; 5442 bool useclflushopt; 5443 5444 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 5445 if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { 5446 sched_pin(); 5447 pc = get_pcpu(); 5448 cmap_pte2 = pc->pc_cmap_pte2; 5449 mtx_lock(&pc->pc_cmap_lock); 5450 if (*cmap_pte2) 5451 panic("pmap_flush_page: CMAP2 busy"); 5452 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | 5453 PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); 5454 invlcaddr(pc->pc_cmap_addr2); 5455 sva = (vm_offset_t)pc->pc_cmap_addr2; 5456 eva = sva + PAGE_SIZE; 5457 5458 /* 5459 * Use mfence or sfence despite the ordering implied by 5460 * mtx_{un,}lock() because clflush on non-Intel CPUs 5461 * and clflushopt are not guaranteed to be ordered by 5462 * any other instruction. 
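 * The flush loop below is therefore bracketed by sfence when
 * clflushopt is used and by mfence when clflush is used on a
 * non-Intel CPU; no extra fence is issued for clflush on Intel CPUs.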
5463 */ 5464 if (useclflushopt) 5465 sfence(); 5466 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5467 mfence(); 5468 for (; sva < eva; sva += cpu_clflush_line_size) { 5469 if (useclflushopt) 5470 clflushopt(sva); 5471 else 5472 clflush(sva); 5473 } 5474 if (useclflushopt) 5475 sfence(); 5476 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5477 mfence(); 5478 *cmap_pte2 = 0; 5479 sched_unpin(); 5480 mtx_unlock(&pc->pc_cmap_lock); 5481 } else 5482 pmap_invalidate_cache(); 5483 } 5484 5485 /* 5486 * Changes the specified virtual address range's memory type to that given by 5487 * the parameter "mode". The specified virtual address range must be 5488 * completely contained within either the kernel map. 5489 * 5490 * Returns zero if the change completed successfully, and either EINVAL or 5491 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5492 * of the virtual address range was not mapped, and ENOMEM is returned if 5493 * there was insufficient memory available to complete the change. 5494 */ 5495 int 5496 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5497 { 5498 vm_offset_t base, offset, tmpva; 5499 pd_entry_t *pde; 5500 pt_entry_t *pte; 5501 int cache_bits_pte, cache_bits_pde; 5502 boolean_t changed; 5503 5504 base = trunc_page(va); 5505 offset = va & PAGE_MASK; 5506 size = round_page(offset + size); 5507 5508 /* 5509 * Only supported on kernel virtual addresses above the recursive map. 5510 */ 5511 if (base < VM_MIN_KERNEL_ADDRESS) 5512 return (EINVAL); 5513 5514 cache_bits_pde = pmap_cache_bits(mode, 1); 5515 cache_bits_pte = pmap_cache_bits(mode, 0); 5516 changed = FALSE; 5517 5518 /* 5519 * Pages that aren't mapped aren't supported. Also break down 5520 * 2/4MB pages into 4KB pages if required. 5521 */ 5522 PMAP_LOCK(kernel_pmap); 5523 for (tmpva = base; tmpva < base + size; ) { 5524 pde = pmap_pde(kernel_pmap, tmpva); 5525 if (*pde == 0) { 5526 PMAP_UNLOCK(kernel_pmap); 5527 return (EINVAL); 5528 } 5529 if (*pde & PG_PS) { 5530 /* 5531 * If the current 2/4MB page already has 5532 * the required memory type, then we need not 5533 * demote this page. Just increment tmpva to 5534 * the next 2/4MB page frame. 5535 */ 5536 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 5537 tmpva = trunc_4mpage(tmpva) + NBPDR; 5538 continue; 5539 } 5540 5541 /* 5542 * If the current offset aligns with a 2/4MB 5543 * page frame and there is at least 2/4MB left 5544 * within the range, then we need not break 5545 * down this page into 4KB pages. 5546 */ 5547 if ((tmpva & PDRMASK) == 0 && 5548 tmpva + PDRMASK < base + size) { 5549 tmpva += NBPDR; 5550 continue; 5551 } 5552 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 5553 PMAP_UNLOCK(kernel_pmap); 5554 return (ENOMEM); 5555 } 5556 } 5557 pte = vtopte(tmpva); 5558 if (*pte == 0) { 5559 PMAP_UNLOCK(kernel_pmap); 5560 return (EINVAL); 5561 } 5562 tmpva += PAGE_SIZE; 5563 } 5564 PMAP_UNLOCK(kernel_pmap); 5565 5566 /* 5567 * Ok, all the pages exist, so run through them updating their 5568 * cache mode if required. 
5569 */ 5570 for (tmpva = base; tmpva < base + size; ) { 5571 pde = pmap_pde(kernel_pmap, tmpva); 5572 if (*pde & PG_PS) { 5573 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5574 pmap_pde_attr(pde, cache_bits_pde); 5575 changed = TRUE; 5576 } 5577 tmpva = trunc_4mpage(tmpva) + NBPDR; 5578 } else { 5579 pte = vtopte(tmpva); 5580 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5581 pmap_pte_attr(pte, cache_bits_pte); 5582 changed = TRUE; 5583 } 5584 tmpva += PAGE_SIZE; 5585 } 5586 } 5587 5588 /* 5589 * Flush CPU caches to make sure any data isn't cached that 5590 * shouldn't be, etc. 5591 */ 5592 if (changed) { 5593 pmap_invalidate_range(kernel_pmap, base, tmpva); 5594 pmap_invalidate_cache_range(base, tmpva, FALSE); 5595 } 5596 return (0); 5597 } 5598 5599 /* 5600 * perform the pmap work for mincore 5601 */ 5602 int 5603 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5604 { 5605 pd_entry_t pde; 5606 pt_entry_t pte; 5607 vm_paddr_t pa; 5608 int val; 5609 5610 PMAP_LOCK(pmap); 5611 retry: 5612 pde = *pmap_pde(pmap, addr); 5613 if (pde != 0) { 5614 if ((pde & PG_PS) != 0) { 5615 pte = pde; 5616 /* Compute the physical address of the 4KB page. */ 5617 pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) & 5618 PG_FRAME; 5619 val = MINCORE_SUPER; 5620 } else { 5621 pte = pmap_pte_ufast(pmap, addr, pde); 5622 pa = pte & PG_FRAME; 5623 val = 0; 5624 } 5625 } else { 5626 pte = 0; 5627 pa = 0; 5628 val = 0; 5629 } 5630 if ((pte & PG_V) != 0) { 5631 val |= MINCORE_INCORE; 5632 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5633 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5634 if ((pte & PG_A) != 0) 5635 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5636 } 5637 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5638 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5639 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5640 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 5641 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5642 goto retry; 5643 } else 5644 PA_UNLOCK_COND(*locked_pa); 5645 PMAP_UNLOCK(pmap); 5646 return (val); 5647 } 5648 5649 void 5650 pmap_activate(struct thread *td) 5651 { 5652 pmap_t pmap, oldpmap; 5653 u_int cpuid; 5654 u_int32_t cr3; 5655 5656 critical_enter(); 5657 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5658 oldpmap = PCPU_GET(curpmap); 5659 cpuid = PCPU_GET(cpuid); 5660 #if defined(SMP) 5661 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5662 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5663 #else 5664 CPU_CLR(cpuid, &oldpmap->pm_active); 5665 CPU_SET(cpuid, &pmap->pm_active); 5666 #endif 5667 #if defined(PAE) || defined(PAE_TABLES) 5668 cr3 = vtophys(pmap->pm_pdpt); 5669 #else 5670 cr3 = vtophys(pmap->pm_pdir); 5671 #endif 5672 /* 5673 * pmap_activate is for the current thread on the current cpu 5674 */ 5675 td->td_pcb->pcb_cr3 = cr3; 5676 PCPU_SET(curpmap, pmap); 5677 critical_exit(); 5678 } 5679 5680 void 5681 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5682 { 5683 } 5684 5685 /* 5686 * Increase the starting virtual address of the given mapping if a 5687 * different alignment might result in more superpage mappings. 
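 *
 * For example, without PAE (NBPDR == 4MB), a 16MB mapping of an
 * object beginning at object offset 1MB (ignoring any OBJ_COLORED
 * adjustment) that was tentatively placed at the 4MB-aligned address
 * 0x20000000 is moved to 0x20100000, so that virtual addresses and
 * object offsets become congruent modulo 4MB and the interior of the
 * range can be mapped with 4MB pages.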
5688 */ 5689 void 5690 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5691 vm_offset_t *addr, vm_size_t size) 5692 { 5693 vm_offset_t superpage_offset; 5694 5695 if (size < NBPDR) 5696 return; 5697 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5698 offset += ptoa(object->pg_color); 5699 superpage_offset = offset & PDRMASK; 5700 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5701 (*addr & PDRMASK) == superpage_offset) 5702 return; 5703 if ((*addr & PDRMASK) < superpage_offset) 5704 *addr = (*addr & ~PDRMASK) + superpage_offset; 5705 else 5706 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5707 } 5708 5709 vm_offset_t 5710 pmap_quick_enter_page(vm_page_t m) 5711 { 5712 vm_offset_t qaddr; 5713 pt_entry_t *pte; 5714 5715 critical_enter(); 5716 qaddr = PCPU_GET(qmap_addr); 5717 pte = vtopte(qaddr); 5718 5719 KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); 5720 *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 5721 pmap_cache_bits(pmap_page_get_memattr(m), 0); 5722 invlpg(qaddr); 5723 5724 return (qaddr); 5725 } 5726 5727 void 5728 pmap_quick_remove_page(vm_offset_t addr) 5729 { 5730 vm_offset_t qaddr; 5731 pt_entry_t *pte; 5732 5733 qaddr = PCPU_GET(qmap_addr); 5734 pte = vtopte(qaddr); 5735 5736 KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); 5737 KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); 5738 5739 *pte = 0; 5740 critical_exit(); 5741 } 5742 5743 static vmem_t *pmap_trm_arena; 5744 static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS; 5745 static int trm_guard = PAGE_SIZE; 5746 5747 static int 5748 pmap_trm_import(void *unused __unused, vmem_size_t size, int flags, 5749 vmem_addr_t *addrp) 5750 { 5751 vm_page_t m; 5752 vmem_addr_t af, addr, prev_addr; 5753 pt_entry_t *trm_pte; 5754 5755 prev_addr = atomic_load_long(&pmap_trm_arena_last); 5756 size = round_page(size) + trm_guard; 5757 for (;;) { 5758 if (prev_addr + size < prev_addr || prev_addr + size < size || 5759 prev_addr + size > PMAP_TRM_MAX_ADDRESS) 5760 return (ENOMEM); 5761 addr = prev_addr + size; 5762 if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr)) 5763 break; 5764 } 5765 prev_addr += trm_guard; 5766 trm_pte = PTmap + atop(prev_addr); 5767 for (af = prev_addr; af < addr; af += PAGE_SIZE) { 5768 m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | 5769 VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 5770 pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) | 5771 PG_M | PG_A | PG_RW | PG_V | pgeflag | 5772 pmap_cache_bits(VM_MEMATTR_DEFAULT, FALSE)); 5773 } 5774 *addrp = prev_addr; 5775 return (0); 5776 } 5777 5778 static 5779 void pmap_init_trm(void) 5780 { 5781 vm_page_t pd_m; 5782 5783 TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard); 5784 if ((trm_guard & PAGE_MASK) != 0) 5785 trm_guard = 0; 5786 pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK); 5787 vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE); 5788 pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | 5789 VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO); 5790 if ((pd_m->flags & PG_ZERO) == 0) 5791 pmap_zero_page(pd_m); 5792 PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V | 5793 pmap_cache_bits(VM_MEMATTR_DEFAULT, TRUE); 5794 } 5795 5796 void * 5797 pmap_trm_alloc(size_t size, int flags) 5798 { 5799 vmem_addr_t res; 5800 int error; 5801 5802 MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0); 5803 error = 
vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int), 5804 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res); 5805 if (error != 0) 5806 return (NULL); 5807 if ((flags & M_ZERO) != 0) 5808 bzero((void *)res, size); 5809 return ((void *)res); 5810 } 5811 5812 void 5813 pmap_trm_free(void *addr, size_t size) 5814 { 5815 5816 vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4)); 5817 } 5818 5819 #if defined(PMAP_DEBUG) 5820 pmap_pid_dump(int pid) 5821 { 5822 pmap_t pmap; 5823 struct proc *p; 5824 int npte = 0; 5825 int index; 5826 5827 sx_slock(&allproc_lock); 5828 FOREACH_PROC_IN_SYSTEM(p) { 5829 if (p->p_pid != pid) 5830 continue; 5831 5832 if (p->p_vmspace) { 5833 int i,j; 5834 index = 0; 5835 pmap = vmspace_pmap(p->p_vmspace); 5836 for (i = 0; i < NPDEPTD; i++) { 5837 pd_entry_t *pde; 5838 pt_entry_t *pte; 5839 vm_offset_t base = i << PDRSHIFT; 5840 5841 pde = &pmap->pm_pdir[i]; 5842 if (pde && pmap_pde_v(pde)) { 5843 for (j = 0; j < NPTEPG; j++) { 5844 vm_offset_t va = base + (j << PAGE_SHIFT); 5845 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 5846 if (index) { 5847 index = 0; 5848 printf("\n"); 5849 } 5850 sx_sunlock(&allproc_lock); 5851 return (npte); 5852 } 5853 pte = pmap_pte(pmap, va); 5854 if (pte && pmap_pte_v(pte)) { 5855 pt_entry_t pa; 5856 vm_page_t m; 5857 pa = *pte; 5858 m = PHYS_TO_VM_PAGE(pa & PG_FRAME); 5859 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 5860 va, pa, m->hold_count, m->wire_count, m->flags); 5861 npte++; 5862 index++; 5863 if (index >= 2) { 5864 index = 0; 5865 printf("\n"); 5866 } else { 5867 printf(" "); 5868 } 5869 } 5870 } 5871 } 5872 } 5873 } 5874 } 5875 sx_sunlock(&allproc_lock); 5876 return (npte); 5877 } 5878 #endif 5879