/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	In addition to hardware address maps, this
 *	module is called upon to provide software-use-only
 *	maps which may or may not be stored in the same
 *	form as hardware maps.  These pseudo-maps are
 *	used to store intermediate results from copy
 *	operations to and from address spaces.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#ifdef SMP
#include <sys/smp.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifndef PMAP_SHPGPERPROC
#define	PMAP_SHPGPERPROC 200
#endif

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define	PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define	PMAP_INLINE	extern inline
#endif
#else
#define	PMAP_INLINE
#endif

#define	PV_STATS
#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

static int ndmpdp;
static vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */

static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static struct md_page *pv_table;
static int shpgperproc = PMAP_SHPGPERPROC;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);

static int	pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    vm_page_t *free);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
    vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    vm_page_t *free);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
    vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);

static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_page_t *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.
 * This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return (newaddr);
}

/********************/
/* Inline functions */
/********************/

/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return (va >> PDRSHIFT);
}


/* Return various clipped indexes for a given VA */
static __inline vm_pindex_t
pmap_pte_index(vm_offset_t va)
{

	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pde_index(vm_offset_t va)
{

	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{

	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{

	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;

	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return (NULL);
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return (NULL);
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return (NULL);
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}
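
/*
 * Note on the index helpers above: an amd64 virtual address is decomposed
 * into four 9-bit page table indices plus a 12-bit page offset.  Bits 47-39
 * select the PML4 entry, bits 38-30 the PDP entry, bits 29-21 the PD entry,
 * and bits 20-12 the PT entry, so each level's index ranges over the 512
 * entries of a single page table page.
 */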

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count -= count;
}

/* Return the address of the PTE for "va" within the recursive PTE map. */
PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

/* Return the address of the PDE for "va" within the recursive PDE map. */
static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

/*
 * Allocate "n" zeroed, physically contiguous pages from the bootstrap
 * allocator and advance *firstaddr past them.
 */
static u_int64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	u_int64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

CTASSERT(powerof2(NDMPML4E));

static void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i, j, ndm1g;

	/* Allocate pages */
	KPTphys = allocpages(firstaddr, NKPT);
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
	KPDphys = allocpages(firstaddr, NKPDPE);

	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	DMPDPphys = allocpages(firstaddr, NDMPML4E);
	ndm1g = 0;
	if ((amd_feature & AMDID_PAGE1GB) != 0)
		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
	if (ndm1g < ndmpdp)
		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Fill in the underlying page table pages */
	/* Read-only from zero to physfree */
	/* XXX not fully used, underneath 2M pages */
	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
	}

	/* Now map the page tables at their location within PTmap */
	for (i = 0; i < NKPT; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
	}

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the KPTphys entries above */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
	}

	/* And connect up the PD to the PDP */
	for (i = 0; i < NKPDPE; i++) {
		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
		    (i << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
	}

	/*
	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
	 * the end of physical memory is not aligned to a 1GB page boundary,
	 * then the residual physical memory is mapped with 2MB pages.  Later,
	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
	 * that are partially used.
	 */
	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
		((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
		    PG_M | PG_A;
	}
	for (i = 0; i < ndm1g; i++) {
		((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
		/* Preset PG_M and PG_A because demotion expects it. */
		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
		    PG_M | PG_A;
	}
	for (j = 0; i < ndmpdp; i++, j++) {
		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;

	/* Connect the Direct Map slot(s) up to the PML4. */
	for (i = 0; i < NDMPML4E; i++) {
		((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
		    (i << PAGE_SHIFT);
		((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
	}

	/* Connect the KVA slot up to the PML4 */
	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
}

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On amd64 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte, *unused;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
	virtual_avail = pmap_kmem_choose(virtual_avail);

	virtual_end = VM_MAX_KERNEL_ADDRESS;


	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	load_cr3(KPML4phys);

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
	kernel_pmap->pm_root = NULL;
	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * CMAP1 is only used for the memory test.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

	virtual_avail = va;

	/* Initialize the PAT MSR. */
	pmap_init_pat();
}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0)
		panic("no PAT??");

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}

/*
 *	Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_page_t mpte;
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < NKPT; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * If the kernel is running in a virtual machine on an AMD Family 10h
	 * processor, then it must assume that MCA is enabled by the virtual
	 * machine monitor.
	 */
	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x10)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	for (i = 0; phys_avail[i + 1]; i += 2);
	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
}

static int
pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
	if (error == 0 && req->newptr) {
		shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
		pv_entry_high_water = 9 * (pv_entry_max / 10);
	}
	return (error);
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW,
    &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");

static int
pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
	if (error == 0 && req->newptr) {
		pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
		pv_entry_high_water = 9 * (pv_entry_max / 10);
	}
	return (error);
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW,
    &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");

SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2MB page promotions");

SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
    "1GB page mapping counters");

static u_long pmap_pdpe_demotions;
SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pdpe_demotions, 0, "1GB page demotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
static int
pmap_cache_bits(int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	/* The PAT bit is different for PTE's and PDE's. */
	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

	/* Map the caching mode to a PAT index. */
	pat_idx = pat_index[mode];

	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
	cache_bits = 0;
	if (pat_idx & 0x4)
		cache_bits |= pat_flag;
	if (pat_idx & 0x2)
		cache_bits |= PG_NC_PCD;
	if (pat_idx & 0x1)
		cache_bits |= PG_NC_PWT;
	return (cache_bits);
}
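
/*
 * Worked example: with a fully functional PAT, pmap_init_pat() assigns
 * PAT_WRITE_COMBINING to PAT index 6 (binary 110), so pmap_cache_bits()
 * returns the PAT flag (PG_PDE_PAT or PG_PTE_PAT) together with PG_NC_PCD,
 * while PG_NC_PWT remains clear.
 */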

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
	u_long cr4;

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);
		/*
		 * Although preemption at this point could be detrimental to
		 * performance, it would not lead to an error.  PG_G is simply
		 * ignored if CR4.PGE is clear.  Moreover, in case this block
		 * is re-entered, the load_cr4() either above or below will
		 * modify CR4.PGE flushing the TLB.
		 */
		load_cr4(cr4 | CR4_PGE);
	}
}
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpumask_t cpumask, other_cpus;

	sched_pin();
	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
		invlpg(va);
		smp_invlpg(va);
	} else {
		cpumask = PCPU_GET(cpumask);
		other_cpus = PCPU_GET(other_cpus);
		if (pmap->pm_active & cpumask)
			invlpg(va);
		if (pmap->pm_active & other_cpus)
			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
	}
	sched_unpin();
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpumask_t cpumask, other_cpus;
	vm_offset_t addr;

	sched_pin();
	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		smp_invlpg_range(sva, eva);
	} else {
		cpumask = PCPU_GET(cpumask);
		other_cpus = PCPU_GET(other_cpus);
		if (pmap->pm_active & cpumask)
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		if (pmap->pm_active & other_cpus)
			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
			    sva, eva);
	}
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpumask_t cpumask, other_cpus;

	sched_pin();
	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
		invltlb();
		smp_invltlb();
	} else {
		cpumask = PCPU_GET(cpumask);
		other_cpus = PCPU_GET(other_cpus);
		if (pmap->pm_active & cpumask)
			invltlb();
		if (pmap->pm_active & other_cpus)
			smp_masked_invltlb(pmap->pm_active & other_cpus);
	}
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}

struct pde_action {
	cpumask_t store;	/* processor that updates the PDE */
	cpumask_t invalidate;	/* processors that invalidate their TLB */
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
};

static void
pmap_update_pde_action(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpumask))
		pde_store(act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if ((act->invalidate & PCPU_GET(cpumask)) != 0)
		pmap_update_pde_invalidate(act->va, act->newpde);
}
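
/*
 * The action and teardown callbacks above implement the two phases of the
 * rendezvous used by pmap_update_pde() below: only the initiating CPU (the
 * "store" CPU) rewrites the PDE, and every CPU on which the pmap may be
 * active then invalidates its TLB via pmap_update_pde_invalidate().
 */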

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpumask_t active, cpumask;

	sched_pin();
	cpumask = PCPU_GET(cpumask);
	if (pmap == kernel_pmap)
		active = all_cpus;
	else
		active = pmap->pm_active;
	if ((active & PCPU_GET(other_cpus)) != 0) {
		act.store = cpumask;
		act.invalidate = active;
		act.va = va;
		act.pde = pde;
		act.newpde = newpde;
		smp_rendezvous_cpus(cpumask | active,
		    smp_no_rendevous_barrier, pmap_update_pde_action,
		    pmap_update_pde_teardown, &act);
	} else {
		pde_store(pde, newpde);
		if ((active & cpumask) != 0)
			pmap_update_pde_invalidate(va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || pmap->pm_active)
		invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap || pmap->pm_active)
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	if (pmap == kernel_pmap || pmap->pm_active)
		invltlb();
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

	wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

	pde_store(pde, newpde);
	if (pmap == kernel_pmap || pmap->pm_active)
		pmap_update_pde_invalidate(va, newpde);
}
#endif /* !SMP */

static void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{

	KASSERT((sva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: sva not page-aligned"));
	KASSERT((eva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: eva not page-aligned"));

	if (cpu_feature & CPUID_SS)
		; /* If "Self Snoop" is supported, do nothing. */
	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
	    eva - sva < 2 * 1024 * 1024) {

		/*
		 * Otherwise, do per-cache line flush.  Use the mfence
		 * instruction to ensure that previous stores are
		 * included in the write-back.  The processor
		 * propagates flush to other processors in the cache
		 * coherence domain.
		 */
		mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		mfence();
	} else {

		/*
		 * No targeted cache flush methods are supported by the CPU,
		 * or the supplied range is bigger than 2MB.
		 * Globally invalidate the cache.
		 */
		pmap_invalidate_cache();
	}
}

/*
 * Are we current address space or kernel?
 */
static __inline int
pmap_is_current(pmap_t pmap)
{
	return (pmap == kernel_pmap ||
	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_paddr_t pa;

	pa = 0;
	PMAP_LOCK(pmap);
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
		if ((*pdpe & PG_PS) != 0)
			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
		else {
			pde = pmap_pdpe_to_pde(pdpe, va);
			if ((*pde & PG_V) != 0) {
				if ((*pde & PG_PS) != 0) {
					pa = (*pde & PG_PS_FRAME) |
					    (va & PDRMASK);
				} else {
					pte = pmap_pde_to_pte(pde, va);
					pa = (*pte & PG_FRAME) |
					    (va & PAGE_MASK);
				}
			}
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pd_entry_t pde, *pdep;
	pt_entry_t pte;
	vm_paddr_t pa;
	vm_page_t m;

	pa = 0;
	m = NULL;
	PMAP_LOCK(pmap);
retry:
	pdep = pmap_pde(pmap, va);
	if (pdep != NULL && (pde = *pdep)) {
		if (pde & PG_PS) {
			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
				if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) |
				    (va & PDRMASK), &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
				    (va & PDRMASK));
				vm_page_hold(m);
			}
		} else {
			pte = *pmap_pde_to_pte(pdep, va);
			if ((pte & PG_V) &&
			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
				vm_page_hold(m);
			}
		}
	}
	PA_UNLOCK_COND(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t pde;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		pde = *vtopde(va);
		if (pde & PG_PS) {
			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  For example, vtopte() must not
			 * be used to access the PTE because it would use the
			 * new PDE.  It is, however, safe to use the old PDE
			 * because the page table page is preserved by the
			 * promotion.
			 */
			pa = *pmap_pde_to_pte(&pde, va);
			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
		}
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | PG_G);
}

static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_clear(pte);
}

/*
 *	Used to map a range of physical addresses into kernel
 *	virtual address space.
 *
 *	The value passed in '*virt' is a suggested virtual address for
 *	the mapping. Architectures which can support a direct-mapped
 *	physical to virtual region can return the appropriate address
 *	within that region, leaving '*virt' unchanged. Other
 *	architectures should map the pages starting at '*virt' and
 *	update '*virt' with the first usable address after the mapped
 *	region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	return PHYS_TO_DMAP(start);
}


/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *endpte, oldpte, pa, *pte;
	vm_page_t m;

	oldpte = 0;
	pte = vtopte(sva);
	endpte = pte + count;
	while (pte < endpte) {
		m = *ma++;
		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
			oldpte |= *pte;
			pte_store(pte, pa | PG_G | PG_RW | PG_V);
		}
		pte++;
	}
	if (__predict_false((oldpte & PG_V) != 0))
		pmap_invalidate_range(kernel_pmap, sva, sva + count *
		    PAGE_SIZE);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
static __inline void
pmap_free_zero_pages(vm_page_t free)
{
	vm_page_t m;

	while (free != NULL) {
		m = free;
		free = m->right;
		/* Preserve the page's PG_ZERO setting. */
		vm_page_free_toq(m);
	}
}

/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	m->right = *free;
	*free = m;
}
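
/*
 * The routines that follow maintain each pmap's collection of idle page
 * table pages as a splay tree rooted at pm_root and keyed by the pages'
 * pindex values, which encode the virtual address range that each page
 * table page maps.
 */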

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 */
static void
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{
	vm_page_t root;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	root = pmap->pm_root;
	if (root == NULL) {
		mpte->left = NULL;
		mpte->right = NULL;
	} else {
		root = vm_page_splay(mpte->pindex, root);
		if (mpte->pindex < root->pindex) {
			mpte->left = root->left;
			mpte->right = root;
			root->left = NULL;
		} else if (mpte->pindex == root->pindex)
			panic("pmap_insert_pt_page: pindex already inserted");
		else {
			mpte->right = root->right;
			mpte->left = root;
			root->right = NULL;
		}
	}
	pmap->pm_root = mpte;
}

/*
 * Looks for a page table page mapping the specified virtual address in the
 * specified pmap's collection of idle page table pages.  Returns NULL if there
 * is no page table page corresponding to the specified virtual address.
 */
static vm_page_t
pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
{
	vm_page_t mpte;
	vm_pindex_t pindex = pmap_pde_pindex(va);

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
		mpte = vm_page_splay(pindex, mpte);
		if ((pmap->pm_root = mpte)->pindex != pindex)
			mpte = NULL;
	}
	return (mpte);
}

/*
 * Removes the specified page table page from the specified pmap's collection
 * of idle page table pages.  The specified page table page must be a member of
 * the pmap's collection.
 */
static void
pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
{
	vm_page_t root;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (mpte != pmap->pm_root) {
		root = vm_page_splay(mpte->pindex, pmap->pm_root);
		KASSERT(mpte == root,
		    ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
		    mpte, pmap));
	}
	if (mpte->left == NULL)
		root = mpte->right;
	else {
		root = vm_page_splay(mpte->pindex, mpte->left);
		root->right = mpte->right;
	}
	pmap->pm_root = root;
}

/*
 * This routine unholds page table pages, and if the hold count
 * drops to zero, then it decrements the wire count.
 */
static __inline int
pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
{

	--m->wire_count;
	if (m->wire_count == 0)
		return (_pmap_unwire_pte_hold(pmap, va, m, free));
	else
		return (0);
}

static int
_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_page_t *free)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * unmap the page table page
	 */
	if (m->pindex >= (NUPDE + NUPDPE)) {
		/* PDP page */
		pml4_entry_t *pml4;
		pml4 = pmap_pml4e(pmap, va);
		*pml4 = 0;
	} else if (m->pindex >= NUPDE) {
		/* PD page */
		pdp_entry_t *pdp;
		pdp = pmap_pdpe(pmap, va);
		*pdp = 0;
	} else {
		/* PTE page */
		pd_entry_t *pd;
		pd = pmap_pde(pmap, va);
		*pd = 0;
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUPDE) {
		/* We just released a PT, unhold the matching PD */
		vm_page_t pdpg;

		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
		pmap_unwire_pte_hold(pmap, va, pdpg, free);
	}
	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
		/* We just released a PD, unhold the matching PDP */
		vm_page_t pdppg;

		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
		pmap_unwire_pte_hold(pmap, va, pdppg, free);
	}

	/*
	 * This is a release store so that the ordinary store unmapping
	 * the page table page is globally performed before TLB shoot-
	 * down is begun.
	 */
	atomic_subtract_rel_int(&cnt.v_wire_count, 1);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);

	return (1);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
{
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
	return (pmap_unwire_pte_hold(pmap, va, mpte, free));
}

void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
	pmap->pm_root = NULL;
	pmap->pm_active = 0;
	PCPU_SET(curpmap, pmap);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
int
pmap_pinit(pmap_t pmap)
{
	vm_page_t pml4pg;
	static vm_pindex_t color;
	int i;

	PMAP_LOCK_INIT(pmap);

	/*
	 * allocate the page directory page
	 */
	while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
		VM_WAIT;

	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));

	if ((pml4pg->flags & PG_ZERO) == 0)
		pagezero(pmap->pm_pml4);

	/* Wire in kernel global address entries. */
	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
	for (i = 0; i < NDMPML4E; i++) {
		pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
		    PG_RW | PG_V | PG_U;
	}

	/* install self-referential address mapping entry(s) */
	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;

	pmap->pm_root = NULL;
	pmap->pm_active = 0;
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);

	return (1);
}

/*
 * this routine is called if the page table page is not
 * mapped correctly.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
{
	vm_page_t m, pdppg, pdpg;

	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (flags & M_WAITOK) {
			PMAP_UNLOCK(pmap);
			vm_page_unlock_queues();
			VM_WAIT;
			vm_page_lock_queues();
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	if (ptepindex >= (NUPDE + NUPDPE)) {
		pml4_entry_t *pml4;
		vm_pindex_t pml4index;

		/* Wire up a new PDPE page */
		pml4index = ptepindex - (NUPDE + NUPDPE);
		pml4 = &pmap->pm_pml4[pml4index];
		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

	} else if (ptepindex >= NUPDE) {
		vm_pindex_t pml4index;
		vm_pindex_t pdpindex;
		pml4_entry_t *pml4;
		pdp_entry_t *pdp;

		/* Wire up a new PDE page */
		pdpindex = ptepindex - NUPDE;
		pml4index = pdpindex >> NPML4EPGSHIFT;

		pml4 = &pmap->pm_pml4[pml4index];
		if ((*pml4 & PG_V) == 0) {
			/* Have to allocate a new pdp, recurse */
			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
			    flags) == NULL) {
				--m->wire_count;
				atomic_subtract_int(&cnt.v_wire_count, 1);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			/* Add reference to pdp page */
			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
			pdppg->wire_count++;
		}
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);

		/* Now find the pdp page */
		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

	} else {
		vm_pindex_t pml4index;
		vm_pindex_t pdpindex;
		pml4_entry_t *pml4;
		pdp_entry_t *pdp;
		pd_entry_t *pd;

		/* Wire up a new PTE page */
		pdpindex = ptepindex >> NPDPEPGSHIFT;
		pml4index = pdpindex >> NPML4EPGSHIFT;

		/* First, find the pdp and check that it's valid. */
		pml4 = &pmap->pm_pml4[pml4index];
		if ((*pml4 & PG_V) == 0) {
			/* Have to allocate a new pd, recurse */
			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
			    flags) == NULL) {
				--m->wire_count;
				atomic_subtract_int(&cnt.v_wire_count, 1);
				vm_page_free_zero(m);
				return (NULL);
			}
			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
		} else {
			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
			if ((*pdp & PG_V) == 0) {
				/* Have to allocate a new pd, recurse */
				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
				    flags) == NULL) {
					--m->wire_count;
					atomic_subtract_int(&cnt.v_wire_count,
					    1);
					vm_page_free_zero(m);
					return (NULL);
				}
			} else {
				/* Add reference to the pd page */
				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
				pdpg->wire_count++;
			}
		}
		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);

		/* Now we know where the page directory page is */
		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}

static vm_page_t
pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
{
	vm_pindex_t pdpindex, ptepindex;
	pdp_entry_t *pdpe;
	vm_page_t pdpg;

	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
retry:
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
		/* Add a reference to the pd page. */
		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
		pdpg->wire_count++;
	} else {
		/* Allocate a pd page. */
		ptepindex = pmap_pde_pindex(va);
		pdpindex = ptepindex >> NPDPEPGSHIFT;
		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
		if (pdpg == NULL && (flags & M_WAITOK))
			goto retry;
	}
	return (pdpg);
}

static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
{
	vm_pindex_t ptepindex;
	pd_entry_t *pd;
	vm_page_t m;

	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_pde_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	pd = pmap_pde(pmap, va);

	/*
	 * This supports switching from a 2MB page to a
	 * normal 4K page.
	 */
	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
		if (!pmap_demote_pde(pmap, pd, va)) {
			/*
			 * Invalidation of the 2MB page mapping may have caused
			 * the deallocation of the underlying PD page.
			 */
			pd = NULL;
		}
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (pd != NULL && (*pd & PG_V) != 0) {
		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
		m->wire_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has been
		 * deallocated.
		 */
		m = _pmap_allocpte(pmap, ptepindex, flags);
		if (m == NULL && (flags & M_WAITOK))
			goto retry;
	}
	return (m);
}


/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m;
	int i;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(pmap->pm_root == NULL,
	    ("pmap_release: pmap has reserved page table page(s)"));

	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);

	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
	for (i = 0; i < NDMPML4E; i++)	/* Direct Map */
		pmap->pm_pml4[DMPML4I + i] = 0;
	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */

	m->wire_count--;
	atomic_subtract_int(&cnt.v_wire_count, 1);
	vm_page_free_zero(m);
	PMAP_LOCK_DESTROY(pmap);
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_size, "LU", "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_free, "LU", "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *pde, newpdir;
	pdp_entry_t *pdpe;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	/*
	 * Return if "addr" is within the range of kernel page table pages
	 * that were preallocated during pmap bootstrap.  Moreover, leave
	 * "kernel_vm_end" and the kernel page table as they were.
	 *
	 * The correctness of this action is based on the following
	 * argument: vm_map_findspace() allocates contiguous ranges of the
	 * kernel virtual address space.  It calls this function if a range
	 * ends after "kernel_vm_end".  If the kernel is mapped between
	 * "kernel_vm_end" and "addr", then the range cannot begin at
	 * "kernel_vm_end".  In fact, its beginning address cannot be less
	 * than the kernel.  Thus, there is no immediate need to allocate
	 * any new kernel page table pages between "kernel_vm_end" and
	 * "KERNBASE".
1934 */ 1935 if (KERNBASE < addr && addr <= KERNBASE + NKPT * NBPDR) 1936 return; 1937 1938 addr = roundup2(addr, NBPDR); 1939 if (addr - 1 >= kernel_map->max_offset) 1940 addr = kernel_map->max_offset; 1941 while (kernel_vm_end < addr) { 1942 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 1943 if ((*pdpe & PG_V) == 0) { 1944 /* We need a new PDP entry */ 1945 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, 1946 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 1947 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1948 if (nkpg == NULL) 1949 panic("pmap_growkernel: no memory to grow kernel"); 1950 if ((nkpg->flags & PG_ZERO) == 0) 1951 pmap_zero_page(nkpg); 1952 paddr = VM_PAGE_TO_PHYS(nkpg); 1953 *pdpe = (pdp_entry_t) 1954 (paddr | PG_V | PG_RW | PG_A | PG_M); 1955 continue; /* try again */ 1956 } 1957 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 1958 if ((*pde & PG_V) != 0) { 1959 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 1960 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1961 kernel_vm_end = kernel_map->max_offset; 1962 break; 1963 } 1964 continue; 1965 } 1966 1967 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 1968 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 1969 VM_ALLOC_ZERO); 1970 if (nkpg == NULL) 1971 panic("pmap_growkernel: no memory to grow kernel"); 1972 if ((nkpg->flags & PG_ZERO) == 0) 1973 pmap_zero_page(nkpg); 1974 paddr = VM_PAGE_TO_PHYS(nkpg); 1975 newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); 1976 pde_store(pde, newpdir); 1977 1978 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 1979 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1980 kernel_vm_end = kernel_map->max_offset; 1981 break; 1982 } 1983 } 1984 } 1985 1986 1987 /*************************************************** 1988 * page management routines. 
1989 ***************************************************/ 1990 1991 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 1992 CTASSERT(_NPCM == 3); 1993 CTASSERT(_NPCPV == 168); 1994 1995 static __inline struct pv_chunk * 1996 pv_to_chunk(pv_entry_t pv) 1997 { 1998 1999 return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); 2000 } 2001 2002 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2003 2004 #define PC_FREE0 0xfffffffffffffffful 2005 #define PC_FREE1 0xfffffffffffffffful 2006 #define PC_FREE2 0x000000fffffffffful 2007 2008 static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 2009 2010 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2011 "Current number of pv entries"); 2012 2013 #ifdef PV_STATS 2014 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2015 2016 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2017 "Current number of pv entry chunks"); 2018 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2019 "Current number of pv entry chunks allocated"); 2020 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2021 "Current number of pv entry chunks frees"); 2022 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2023 "Number of times tried to get a chunk page but failed."); 2024 2025 static long pv_entry_frees, pv_entry_allocs; 2026 static int pv_entry_spare; 2027 2028 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2029 "Current number of pv entry frees"); 2030 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2031 "Current number of pv entry allocs"); 2032 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2033 "Current number of spare pv entries"); 2034 2035 static int pmap_collect_inactive, pmap_collect_active; 2036 2037 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, 2038 "Current number times pmap_collect called on inactive queue"); 2039 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, 2040 "Current number times pmap_collect called on active queue"); 2041 #endif 2042 2043 /* 2044 * We are in a serious low memory condition. Resort to 2045 * drastic measures to free some pages so we can allocate 2046 * another pv entry chunk. This is normally called to 2047 * unmap inactive pages, and if necessary, active pages. 2048 * 2049 * We do not, however, unmap 2mpages because subsequent accesses will 2050 * allocate per-page pv entries until repromotion occurs, thereby 2051 * exacerbating the shortage of free pv entries. 2052 */ 2053 static void 2054 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) 2055 { 2056 pd_entry_t *pde; 2057 pmap_t pmap; 2058 pt_entry_t *pte, tpte; 2059 pv_entry_t next_pv, pv; 2060 vm_offset_t va; 2061 vm_page_t m, free; 2062 2063 TAILQ_FOREACH(m, &vpq->pl, pageq) { 2064 if (m->hold_count || m->busy) 2065 continue; 2066 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { 2067 va = pv->pv_va; 2068 pmap = PV_PMAP(pv); 2069 /* Avoid deadlock and lock recursion. 
*/ 2070 if (pmap > locked_pmap) 2071 PMAP_LOCK(pmap); 2072 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) 2073 continue; 2074 pmap_resident_count_dec(pmap, 1); 2075 pde = pmap_pde(pmap, va); 2076 KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found" 2077 " a 2mpage in page %p's pv list", m)); 2078 pte = pmap_pde_to_pte(pde, va); 2079 tpte = pte_load_clear(pte); 2080 KASSERT((tpte & PG_W) == 0, 2081 ("pmap_collect: wired pte %#lx", tpte)); 2082 if (tpte & PG_A) 2083 vm_page_flag_set(m, PG_REFERENCED); 2084 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2085 vm_page_dirty(m); 2086 free = NULL; 2087 pmap_unuse_pt(pmap, va, *pde, &free); 2088 pmap_invalidate_page(pmap, va); 2089 pmap_free_zero_pages(free); 2090 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2091 free_pv_entry(pmap, pv); 2092 if (pmap != locked_pmap) 2093 PMAP_UNLOCK(pmap); 2094 } 2095 if (TAILQ_EMPTY(&m->md.pv_list) && 2096 TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)) 2097 vm_page_flag_clear(m, PG_WRITEABLE); 2098 } 2099 } 2100 2101 2102 /* 2103 * free the pv_entry back to the free list 2104 */ 2105 static void 2106 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2107 { 2108 vm_page_t m; 2109 struct pv_chunk *pc; 2110 int idx, field, bit; 2111 2112 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2113 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2114 PV_STAT(pv_entry_frees++); 2115 PV_STAT(pv_entry_spare++); 2116 pv_entry_count--; 2117 pc = pv_to_chunk(pv); 2118 idx = pv - &pc->pc_pventry[0]; 2119 field = idx / 64; 2120 bit = idx % 64; 2121 pc->pc_map[field] |= 1ul << bit; 2122 /* move to head of list */ 2123 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2124 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 2125 pc->pc_map[2] != PC_FREE2) { 2126 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2127 return; 2128 } 2129 PV_STAT(pv_entry_spare -= _NPCPV); 2130 PV_STAT(pc_chunk_count--); 2131 PV_STAT(pc_chunk_frees++); 2132 /* entire chunk is free, return it */ 2133 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2134 dump_drop_page(m->phys_addr); 2135 vm_page_unwire(m, 0); 2136 vm_page_free(m); 2137 } 2138 2139 /* 2140 * get a new pv_entry, allocating a block from the system 2141 * when needed. 
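 *
 * Layout sketch (illustrative): pv entries are carved from page-sized
 * chunks (struct pv_chunk), each providing _NPCPV (168) entries whose
 * state is tracked in the _NPCM (3) 64-bit words of pc_map.  A set bit
 * marks a free entry, so a completely free chunk has
 * 64 + 64 + 40 = 168 bits set (PC_FREE0..PC_FREE2).  For a given pv,
 *
 *	idx = pv - &pc->pc_pventry[0];
 *	free <=> pc->pc_map[idx / 64] & (1ul << (idx % 64))
 *
 * Allocation finds the first set bit with bsfq() and clears it;
 * free_pv_entry() sets it again and releases fully free chunks.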
2142 */ 2143 static pv_entry_t 2144 get_pv_entry(pmap_t pmap, int try) 2145 { 2146 static const struct timeval printinterval = { 60, 0 }; 2147 static struct timeval lastprint; 2148 static vm_pindex_t colour; 2149 struct vpgqueues *pq; 2150 int bit, field; 2151 pv_entry_t pv; 2152 struct pv_chunk *pc; 2153 vm_page_t m; 2154 2155 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2156 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2157 PV_STAT(pv_entry_allocs++); 2158 pv_entry_count++; 2159 if (pv_entry_count > pv_entry_high_water) 2160 if (ratecheck(&lastprint, &printinterval)) 2161 printf("Approaching the limit on PV entries, consider " 2162 "increasing either the vm.pmap.shpgperproc or the " 2163 "vm.pmap.pv_entry_max sysctl.\n"); 2164 pq = NULL; 2165 retry: 2166 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2167 if (pc != NULL) { 2168 for (field = 0; field < _NPCM; field++) { 2169 if (pc->pc_map[field]) { 2170 bit = bsfq(pc->pc_map[field]); 2171 break; 2172 } 2173 } 2174 if (field < _NPCM) { 2175 pv = &pc->pc_pventry[field * 64 + bit]; 2176 pc->pc_map[field] &= ~(1ul << bit); 2177 /* If this was the last item, move it to tail */ 2178 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 2179 pc->pc_map[2] == 0) { 2180 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2181 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2182 } 2183 PV_STAT(pv_entry_spare--); 2184 return (pv); 2185 } 2186 } 2187 /* No free items, allocate another chunk */ 2188 m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ? 2189 VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | 2190 VM_ALLOC_WIRED); 2191 if (m == NULL) { 2192 if (try) { 2193 pv_entry_count--; 2194 PV_STAT(pc_chunk_tryfail++); 2195 return (NULL); 2196 } 2197 /* 2198 * Reclaim pv entries: At first, destroy mappings to inactive 2199 * pages. After that, if a pv chunk entry is still needed, 2200 * destroy mappings to active pages. 2201 */ 2202 if (pq == NULL) { 2203 PV_STAT(pmap_collect_inactive++); 2204 pq = &vm_page_queues[PQ_INACTIVE]; 2205 } else if (pq == &vm_page_queues[PQ_INACTIVE]) { 2206 PV_STAT(pmap_collect_active++); 2207 pq = &vm_page_queues[PQ_ACTIVE]; 2208 } else 2209 panic("get_pv_entry: increase vm.pmap.shpgperproc"); 2210 pmap_collect(pmap, pq); 2211 goto retry; 2212 } 2213 PV_STAT(pc_chunk_count++); 2214 PV_STAT(pc_chunk_allocs++); 2215 colour++; 2216 dump_add_page(m->phys_addr); 2217 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2218 pc->pc_pmap = pmap; 2219 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 2220 pc->pc_map[1] = PC_FREE1; 2221 pc->pc_map[2] = PC_FREE2; 2222 pv = &pc->pc_pventry[0]; 2223 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2224 PV_STAT(pv_entry_spare += _NPCPV - 1); 2225 return (pv); 2226 } 2227 2228 /* 2229 * First find and then remove the pv entry for the specified pmap and virtual 2230 * address from the specified pv list. Returns the pv entry if found and NULL 2231 * otherwise. This operation can be performed on pv lists for either 4KB or 2232 * 2MB page mappings. 
2233 */ 2234 static __inline pv_entry_t 2235 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2236 { 2237 pv_entry_t pv; 2238 2239 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2240 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 2241 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2242 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 2243 break; 2244 } 2245 } 2246 return (pv); 2247 } 2248 2249 /* 2250 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2251 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2252 * entries for each of the 4KB page mappings. 2253 */ 2254 static void 2255 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2256 { 2257 struct md_page *pvh; 2258 pv_entry_t pv; 2259 vm_offset_t va_last; 2260 vm_page_t m; 2261 2262 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2263 KASSERT((pa & PDRMASK) == 0, 2264 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 2265 2266 /* 2267 * Transfer the 2mpage's pv entry for this mapping to the first 2268 * page's pv list. 2269 */ 2270 pvh = pa_to_pvh(pa); 2271 va = trunc_2mpage(va); 2272 pv = pmap_pvh_remove(pvh, pmap, va); 2273 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2274 m = PHYS_TO_VM_PAGE(pa); 2275 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2276 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2277 va_last = va + NBPDR - PAGE_SIZE; 2278 do { 2279 m++; 2280 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 2281 ("pmap_pv_demote_pde: page %p is not managed", m)); 2282 va += PAGE_SIZE; 2283 pmap_insert_entry(pmap, va, m); 2284 } while (va < va_last); 2285 } 2286 2287 /* 2288 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 2289 * replace the many pv entries for the 4KB page mappings by a single pv entry 2290 * for the 2MB page mapping. 2291 */ 2292 static void 2293 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2294 { 2295 struct md_page *pvh; 2296 pv_entry_t pv; 2297 vm_offset_t va_last; 2298 vm_page_t m; 2299 2300 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2301 KASSERT((pa & PDRMASK) == 0, 2302 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 2303 2304 /* 2305 * Transfer the first page's pv entry for this mapping to the 2306 * 2mpage's pv list. Aside from avoiding the cost of a call 2307 * to get_pv_entry(), a transfer avoids the possibility that 2308 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2309 * removes one of the mappings that is being promoted. 2310 */ 2311 m = PHYS_TO_VM_PAGE(pa); 2312 va = trunc_2mpage(va); 2313 pv = pmap_pvh_remove(&m->md, pmap, va); 2314 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2315 pvh = pa_to_pvh(pa); 2316 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2317 /* Free the remaining NPTEPG - 1 pv entries. */ 2318 va_last = va + NBPDR - PAGE_SIZE; 2319 do { 2320 m++; 2321 va += PAGE_SIZE; 2322 pmap_pvh_free(&m->md, pmap, va); 2323 } while (va < va_last); 2324 } 2325 2326 /* 2327 * First find and then destroy the pv entry for the specified pmap and virtual 2328 * address. This operation can be performed on pv lists for either 4KB or 2MB 2329 * page mappings. 
2330 */ 2331 static void 2332 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2333 { 2334 pv_entry_t pv; 2335 2336 pv = pmap_pvh_remove(pvh, pmap, va); 2337 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2338 free_pv_entry(pmap, pv); 2339 } 2340 2341 static void 2342 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2343 { 2344 struct md_page *pvh; 2345 2346 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2347 pmap_pvh_free(&m->md, pmap, va); 2348 if (TAILQ_EMPTY(&m->md.pv_list)) { 2349 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2350 if (TAILQ_EMPTY(&pvh->pv_list)) 2351 vm_page_flag_clear(m, PG_WRITEABLE); 2352 } 2353 } 2354 2355 /* 2356 * Create a pv entry for page at pa for 2357 * (pmap, va). 2358 */ 2359 static void 2360 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2361 { 2362 pv_entry_t pv; 2363 2364 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2365 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2366 pv = get_pv_entry(pmap, FALSE); 2367 pv->pv_va = va; 2368 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2369 } 2370 2371 /* 2372 * Conditionally create a pv entry. 2373 */ 2374 static boolean_t 2375 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2376 { 2377 pv_entry_t pv; 2378 2379 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2380 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2381 if (pv_entry_count < pv_entry_high_water && 2382 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2383 pv->pv_va = va; 2384 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2385 return (TRUE); 2386 } else 2387 return (FALSE); 2388 } 2389 2390 /* 2391 * Create the pv entry for a 2MB page mapping. 2392 */ 2393 static boolean_t 2394 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2395 { 2396 struct md_page *pvh; 2397 pv_entry_t pv; 2398 2399 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2400 if (pv_entry_count < pv_entry_high_water && 2401 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2402 pv->pv_va = va; 2403 pvh = pa_to_pvh(pa); 2404 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2405 return (TRUE); 2406 } else 2407 return (FALSE); 2408 } 2409 2410 /* 2411 * Fills a page table page with mappings to consecutive physical pages. 2412 */ 2413 static void 2414 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2415 { 2416 pt_entry_t *pte; 2417 2418 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2419 *pte = newpte; 2420 newpte += PAGE_SIZE; 2421 } 2422 } 2423 2424 /* 2425 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2426 * mapping is invalidated. 2427 */ 2428 static boolean_t 2429 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2430 { 2431 pd_entry_t newpde, oldpde; 2432 pt_entry_t *firstpte, newpte; 2433 vm_paddr_t mptepa; 2434 vm_page_t free, mpte; 2435 2436 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2437 oldpde = *pde; 2438 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2439 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2440 mpte = pmap_lookup_pt_page(pmap, va); 2441 if (mpte != NULL) 2442 pmap_remove_pt_page(pmap, mpte); 2443 else { 2444 KASSERT((oldpde & PG_W) == 0, 2445 ("pmap_demote_pde: page table page for a wired mapping" 2446 " is missing")); 2447 2448 /* 2449 * Invalidate the 2MB page mapping and return "failure" if the 2450 * mapping was never accessed or the allocation of the new 2451 * page table page fails. If the 2MB page mapping belongs to 2452 * the direct map region of the kernel's address space, then 2453 * the page allocation request specifies the highest possible 2454 * priority (VM_ALLOC_INTERRUPT). 
Otherwise, the priority is 2455 * normal. Page table pages are preallocated for every other 2456 * part of the kernel address space, so the direct map region 2457 * is the only part of the kernel address space that must be 2458 * handled here. 2459 */ 2460 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2461 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 2462 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 2463 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2464 free = NULL; 2465 pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free); 2466 pmap_invalidate_page(pmap, trunc_2mpage(va)); 2467 pmap_free_zero_pages(free); 2468 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 2469 " in pmap %p", va, pmap); 2470 return (FALSE); 2471 } 2472 if (va < VM_MAXUSER_ADDRESS) 2473 pmap_resident_count_inc(pmap, 1); 2474 } 2475 mptepa = VM_PAGE_TO_PHYS(mpte); 2476 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2477 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2478 KASSERT((oldpde & PG_A) != 0, 2479 ("pmap_demote_pde: oldpde is missing PG_A")); 2480 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2481 ("pmap_demote_pde: oldpde is missing PG_M")); 2482 newpte = oldpde & ~PG_PS; 2483 if ((newpte & PG_PDE_PAT) != 0) 2484 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2485 2486 /* 2487 * If the page table page is new, initialize it. 2488 */ 2489 if (mpte->wire_count == 1) { 2490 mpte->wire_count = NPTEPG; 2491 pmap_fill_ptp(firstpte, newpte); 2492 } 2493 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2494 ("pmap_demote_pde: firstpte and newpte map different physical" 2495 " addresses")); 2496 2497 /* 2498 * If the mapping has changed attributes, update the page table 2499 * entries. 2500 */ 2501 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2502 pmap_fill_ptp(firstpte, newpte); 2503 2504 /* 2505 * Demote the mapping. This pmap is locked. The old PDE has 2506 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2507 * set. Thus, there is no danger of a race with another 2508 * processor changing the setting of PG_A and/or PG_M between 2509 * the read above and the store below. 2510 */ 2511 if (workaround_erratum383) 2512 pmap_update_pde(pmap, va, pde, newpde); 2513 else 2514 pde_store(pde, newpde); 2515 2516 /* 2517 * Invalidate a stale recursive mapping of the page table page. 2518 */ 2519 if (va >= VM_MAXUSER_ADDRESS) 2520 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2521 2522 /* 2523 * Demote the pv entry. This depends on the earlier demotion 2524 * of the mapping. Specifically, the (re)creation of a per- 2525 * page pv entry might trigger the execution of pmap_collect(), 2526 * which might reclaim a newly (re)created per-page pv entry 2527 * and destroy the associated mapping. In order to destroy 2528 * the mapping, the PDE must have already changed from mapping 2529 * the 2mpage to referencing the page table page. 
2530 */ 2531 if ((oldpde & PG_MANAGED) != 0) 2532 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2533 2534 pmap_pde_demotions++; 2535 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 2536 " in pmap %p", va, pmap); 2537 return (TRUE); 2538 } 2539 2540 /* 2541 * pmap_remove_pde: do the things to unmap a superpage in a process 2542 */ 2543 static int 2544 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2545 vm_page_t *free) 2546 { 2547 struct md_page *pvh; 2548 pd_entry_t oldpde; 2549 vm_offset_t eva, va; 2550 vm_page_t m, mpte; 2551 2552 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2553 KASSERT((sva & PDRMASK) == 0, 2554 ("pmap_remove_pde: sva is not 2mpage aligned")); 2555 oldpde = pte_load_clear(pdq); 2556 if (oldpde & PG_W) 2557 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2558 2559 /* 2560 * Machines that don't support invlpg, also don't support 2561 * PG_G. 2562 */ 2563 if (oldpde & PG_G) 2564 pmap_invalidate_page(kernel_pmap, sva); 2565 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 2566 if (oldpde & PG_MANAGED) { 2567 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2568 pmap_pvh_free(pvh, pmap, sva); 2569 eva = sva + NBPDR; 2570 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2571 va < eva; va += PAGE_SIZE, m++) { 2572 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2573 vm_page_dirty(m); 2574 if (oldpde & PG_A) 2575 vm_page_flag_set(m, PG_REFERENCED); 2576 if (TAILQ_EMPTY(&m->md.pv_list) && 2577 TAILQ_EMPTY(&pvh->pv_list)) 2578 vm_page_flag_clear(m, PG_WRITEABLE); 2579 } 2580 } 2581 if (pmap == kernel_pmap) { 2582 if (!pmap_demote_pde(pmap, pdq, sva)) 2583 panic("pmap_remove_pde: failed demotion"); 2584 } else { 2585 mpte = pmap_lookup_pt_page(pmap, sva); 2586 if (mpte != NULL) { 2587 pmap_remove_pt_page(pmap, mpte); 2588 pmap_resident_count_dec(pmap, 1); 2589 KASSERT(mpte->wire_count == NPTEPG, 2590 ("pmap_remove_pde: pte page wire count error")); 2591 mpte->wire_count = 0; 2592 pmap_add_delayed_free_list(mpte, free, FALSE); 2593 atomic_subtract_int(&cnt.v_wire_count, 1); 2594 } 2595 } 2596 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 2597 } 2598 2599 /* 2600 * pmap_remove_pte: do the things to unmap a page in a process 2601 */ 2602 static int 2603 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 2604 pd_entry_t ptepde, vm_page_t *free) 2605 { 2606 pt_entry_t oldpte; 2607 vm_page_t m; 2608 2609 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2610 oldpte = pte_load_clear(ptq); 2611 if (oldpte & PG_W) 2612 pmap->pm_stats.wired_count -= 1; 2613 pmap_resident_count_dec(pmap, 1); 2614 if (oldpte & PG_MANAGED) { 2615 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2616 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2617 vm_page_dirty(m); 2618 if (oldpte & PG_A) 2619 vm_page_flag_set(m, PG_REFERENCED); 2620 pmap_remove_entry(pmap, m, va); 2621 } 2622 return (pmap_unuse_pt(pmap, va, ptepde, free)); 2623 } 2624 2625 /* 2626 * Remove a single page from a process address space 2627 */ 2628 static void 2629 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free) 2630 { 2631 pt_entry_t *pte; 2632 2633 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2634 if ((*pde & PG_V) == 0) 2635 return; 2636 pte = pmap_pde_to_pte(pde, va); 2637 if ((*pte & PG_V) == 0) 2638 return; 2639 pmap_remove_pte(pmap, pte, va, *pde, free); 2640 pmap_invalidate_page(pmap, va); 2641 } 2642 2643 /* 2644 * Remove the given range of addresses from the specified map. 2645 * 2646 * It is assumed that the start and end are properly 2647 * rounded to the page size. 
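 *
 * The walk below always advances by the largest stride that the
 * current paging-structure entry permits, roughly:
 *
 *	invalid PML4E: va_next = (sva + NBPML4) & ~PML4MASK  (512GB step)
 *	invalid PDPE:  va_next = (sva + NBPDP) & ~PDPMASK    (1GB step)
 *	otherwise:     at most one 2MB (NBPDR) page table page span,
 *	               clipped to "eva"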
2648 */ 2649 void 2650 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2651 { 2652 vm_offset_t va, va_next; 2653 pml4_entry_t *pml4e; 2654 pdp_entry_t *pdpe; 2655 pd_entry_t ptpaddr, *pde; 2656 pt_entry_t *pte; 2657 vm_page_t free = NULL; 2658 int anyvalid; 2659 2660 /* 2661 * Perform an unsynchronized read. This is, however, safe. 2662 */ 2663 if (pmap->pm_stats.resident_count == 0) 2664 return; 2665 2666 anyvalid = 0; 2667 2668 vm_page_lock_queues(); 2669 PMAP_LOCK(pmap); 2670 2671 /* 2672 * special handling of removing one page. a very 2673 * common operation and easy to short circuit some 2674 * code. 2675 */ 2676 if (sva + PAGE_SIZE == eva) { 2677 pde = pmap_pde(pmap, sva); 2678 if (pde && (*pde & PG_PS) == 0) { 2679 pmap_remove_page(pmap, sva, pde, &free); 2680 goto out; 2681 } 2682 } 2683 2684 for (; sva < eva; sva = va_next) { 2685 2686 if (pmap->pm_stats.resident_count == 0) 2687 break; 2688 2689 pml4e = pmap_pml4e(pmap, sva); 2690 if ((*pml4e & PG_V) == 0) { 2691 va_next = (sva + NBPML4) & ~PML4MASK; 2692 if (va_next < sva) 2693 va_next = eva; 2694 continue; 2695 } 2696 2697 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2698 if ((*pdpe & PG_V) == 0) { 2699 va_next = (sva + NBPDP) & ~PDPMASK; 2700 if (va_next < sva) 2701 va_next = eva; 2702 continue; 2703 } 2704 2705 /* 2706 * Calculate index for next page table. 2707 */ 2708 va_next = (sva + NBPDR) & ~PDRMASK; 2709 if (va_next < sva) 2710 va_next = eva; 2711 2712 pde = pmap_pdpe_to_pde(pdpe, sva); 2713 ptpaddr = *pde; 2714 2715 /* 2716 * Weed out invalid mappings. 2717 */ 2718 if (ptpaddr == 0) 2719 continue; 2720 2721 /* 2722 * Check for large page. 2723 */ 2724 if ((ptpaddr & PG_PS) != 0) { 2725 /* 2726 * Are we removing the entire large page? If not, 2727 * demote the mapping and fall through. 2728 */ 2729 if (sva + NBPDR == va_next && eva >= va_next) { 2730 /* 2731 * The TLB entry for a PG_G mapping is 2732 * invalidated by pmap_remove_pde(). 2733 */ 2734 if ((ptpaddr & PG_G) == 0) 2735 anyvalid = 1; 2736 pmap_remove_pde(pmap, pde, sva, &free); 2737 continue; 2738 } else if (!pmap_demote_pde(pmap, pde, sva)) { 2739 /* The large page mapping was destroyed. */ 2740 continue; 2741 } else 2742 ptpaddr = *pde; 2743 } 2744 2745 /* 2746 * Limit our scan to either the end of the va represented 2747 * by the current page table page, or to the end of the 2748 * range being removed. 2749 */ 2750 if (va_next > eva) 2751 va_next = eva; 2752 2753 va = va_next; 2754 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2755 sva += PAGE_SIZE) { 2756 if (*pte == 0) { 2757 if (va != va_next) { 2758 pmap_invalidate_range(pmap, va, sva); 2759 va = va_next; 2760 } 2761 continue; 2762 } 2763 if ((*pte & PG_G) == 0) 2764 anyvalid = 1; 2765 else if (va == va_next) 2766 va = sva; 2767 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free)) { 2768 sva += PAGE_SIZE; 2769 break; 2770 } 2771 } 2772 if (va != va_next) 2773 pmap_invalidate_range(pmap, va, sva); 2774 } 2775 out: 2776 if (anyvalid) 2777 pmap_invalidate_all(pmap); 2778 vm_page_unlock_queues(); 2779 PMAP_UNLOCK(pmap); 2780 pmap_free_zero_pages(free); 2781 } 2782 2783 /* 2784 * Routine: pmap_remove_all 2785 * Function: 2786 * Removes this physical page from 2787 * all physical maps in which it resides. 2788 * Reflects back modify bits to the pager. 2789 * 2790 * Notes: 2791 * Original versions of this routine were very 2792 * inefficient because they iteratively called 2793 * pmap_remove (slow...) 
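 *
 * The first loop below demotes any 2MB mappings of the page (found
 * through pa_to_pvh()), so that by the time the second loop runs every
 * remaining mapping is a 4KB mapping on the page's own pv list and can
 * be torn down one pte at a time.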
2794 */ 2795 2796 void 2797 pmap_remove_all(vm_page_t m) 2798 { 2799 struct md_page *pvh; 2800 pv_entry_t pv; 2801 pmap_t pmap; 2802 pt_entry_t *pte, tpte; 2803 pd_entry_t *pde; 2804 vm_offset_t va; 2805 vm_page_t free; 2806 2807 KASSERT((m->flags & PG_FICTITIOUS) == 0, 2808 ("pmap_remove_all: page %p is fictitious", m)); 2809 free = NULL; 2810 vm_page_lock_queues(); 2811 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2812 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2813 pmap = PV_PMAP(pv); 2814 PMAP_LOCK(pmap); 2815 va = pv->pv_va; 2816 pde = pmap_pde(pmap, va); 2817 (void)pmap_demote_pde(pmap, pde, va); 2818 PMAP_UNLOCK(pmap); 2819 } 2820 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2821 pmap = PV_PMAP(pv); 2822 PMAP_LOCK(pmap); 2823 pmap_resident_count_dec(pmap, 1); 2824 pde = pmap_pde(pmap, pv->pv_va); 2825 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 2826 " a 2mpage in page %p's pv list", m)); 2827 pte = pmap_pde_to_pte(pde, pv->pv_va); 2828 tpte = pte_load_clear(pte); 2829 if (tpte & PG_W) 2830 pmap->pm_stats.wired_count--; 2831 if (tpte & PG_A) 2832 vm_page_flag_set(m, PG_REFERENCED); 2833 2834 /* 2835 * Update the vm_page_t clean and reference bits. 2836 */ 2837 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2838 vm_page_dirty(m); 2839 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 2840 pmap_invalidate_page(pmap, pv->pv_va); 2841 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2842 free_pv_entry(pmap, pv); 2843 PMAP_UNLOCK(pmap); 2844 } 2845 vm_page_flag_clear(m, PG_WRITEABLE); 2846 vm_page_unlock_queues(); 2847 pmap_free_zero_pages(free); 2848 } 2849 2850 /* 2851 * pmap_protect_pde: do the things to protect a 2mpage in a process 2852 */ 2853 static boolean_t 2854 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 2855 { 2856 pd_entry_t newpde, oldpde; 2857 vm_offset_t eva, va; 2858 vm_page_t m; 2859 boolean_t anychanged; 2860 2861 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2862 KASSERT((sva & PDRMASK) == 0, 2863 ("pmap_protect_pde: sva is not 2mpage aligned")); 2864 anychanged = FALSE; 2865 retry: 2866 oldpde = newpde = *pde; 2867 if (oldpde & PG_MANAGED) { 2868 eva = sva + NBPDR; 2869 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2870 va < eva; va += PAGE_SIZE, m++) 2871 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2872 vm_page_dirty(m); 2873 } 2874 if ((prot & VM_PROT_WRITE) == 0) 2875 newpde &= ~(PG_RW | PG_M); 2876 if ((prot & VM_PROT_EXECUTE) == 0) 2877 newpde |= pg_nx; 2878 if (newpde != oldpde) { 2879 if (!atomic_cmpset_long(pde, oldpde, newpde)) 2880 goto retry; 2881 if (oldpde & PG_G) 2882 pmap_invalidate_page(pmap, sva); 2883 else 2884 anychanged = TRUE; 2885 } 2886 return (anychanged); 2887 } 2888 2889 /* 2890 * Set the physical protection on the 2891 * specified range of this map as requested. 
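 *
 * For example, pmap_protect(pmap, sva, eva, VM_PROT_READ) strips write
 * permission (and, where the NX bit is in use, execute permission) from
 * every mapping in the range.  VM_PROT_NONE is handled by calling
 * pmap_remove() instead, and a request that removes neither write nor
 * execute permission returns immediately.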
2892 */ 2893 void 2894 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2895 { 2896 vm_offset_t va_next; 2897 pml4_entry_t *pml4e; 2898 pdp_entry_t *pdpe; 2899 pd_entry_t ptpaddr, *pde; 2900 pt_entry_t *pte; 2901 int anychanged; 2902 2903 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2904 pmap_remove(pmap, sva, eva); 2905 return; 2906 } 2907 2908 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 2909 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 2910 return; 2911 2912 anychanged = 0; 2913 2914 vm_page_lock_queues(); 2915 PMAP_LOCK(pmap); 2916 for (; sva < eva; sva = va_next) { 2917 2918 pml4e = pmap_pml4e(pmap, sva); 2919 if ((*pml4e & PG_V) == 0) { 2920 va_next = (sva + NBPML4) & ~PML4MASK; 2921 if (va_next < sva) 2922 va_next = eva; 2923 continue; 2924 } 2925 2926 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2927 if ((*pdpe & PG_V) == 0) { 2928 va_next = (sva + NBPDP) & ~PDPMASK; 2929 if (va_next < sva) 2930 va_next = eva; 2931 continue; 2932 } 2933 2934 va_next = (sva + NBPDR) & ~PDRMASK; 2935 if (va_next < sva) 2936 va_next = eva; 2937 2938 pde = pmap_pdpe_to_pde(pdpe, sva); 2939 ptpaddr = *pde; 2940 2941 /* 2942 * Weed out invalid mappings. 2943 */ 2944 if (ptpaddr == 0) 2945 continue; 2946 2947 /* 2948 * Check for large page. 2949 */ 2950 if ((ptpaddr & PG_PS) != 0) { 2951 /* 2952 * Are we protecting the entire large page? If not, 2953 * demote the mapping and fall through. 2954 */ 2955 if (sva + NBPDR == va_next && eva >= va_next) { 2956 /* 2957 * The TLB entry for a PG_G mapping is 2958 * invalidated by pmap_protect_pde(). 2959 */ 2960 if (pmap_protect_pde(pmap, pde, sva, prot)) 2961 anychanged = 1; 2962 continue; 2963 } else if (!pmap_demote_pde(pmap, pde, sva)) { 2964 /* The large page mapping was destroyed. */ 2965 continue; 2966 } 2967 } 2968 2969 if (va_next > eva) 2970 va_next = eva; 2971 2972 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2973 sva += PAGE_SIZE) { 2974 pt_entry_t obits, pbits; 2975 vm_page_t m; 2976 2977 retry: 2978 obits = pbits = *pte; 2979 if ((pbits & PG_V) == 0) 2980 continue; 2981 2982 if ((prot & VM_PROT_WRITE) == 0) { 2983 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 2984 (PG_MANAGED | PG_M | PG_RW)) { 2985 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2986 vm_page_dirty(m); 2987 } 2988 pbits &= ~(PG_RW | PG_M); 2989 } 2990 if ((prot & VM_PROT_EXECUTE) == 0) 2991 pbits |= pg_nx; 2992 2993 if (pbits != obits) { 2994 if (!atomic_cmpset_long(pte, obits, pbits)) 2995 goto retry; 2996 if (obits & PG_G) 2997 pmap_invalidate_page(pmap, sva); 2998 else 2999 anychanged = 1; 3000 } 3001 } 3002 } 3003 if (anychanged) 3004 pmap_invalidate_all(pmap); 3005 vm_page_unlock_queues(); 3006 PMAP_UNLOCK(pmap); 3007 } 3008 3009 /* 3010 * Tries to promote the 512, contiguous 4KB page mappings that are within a 3011 * single page table page (PTP) to a single 2MB page mapping. For promotion 3012 * to occur, two conditions must be met: (1) the 4KB page mappings must map 3013 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 3014 * identical characteristics. 3015 */ 3016 static void 3017 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3018 { 3019 pd_entry_t newpde; 3020 pt_entry_t *firstpte, oldpte, pa, *pte; 3021 vm_offset_t oldpteva; 3022 vm_page_t mpte; 3023 3024 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3025 3026 /* 3027 * Examine the first PTE in the specified PTP. Abort if this PTE is 3028 * either invalid, unused, or does not map the first 4KB physical page 3029 * within a 2MB page. 
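 *
 * Concretely, the check below requires PG_A and PG_V to be set and all
 * physical-address bits below the 2MB boundary (PG_FRAME & PDRMASK) to
 * be clear, i.e. the first PTE must map the 2MB-aligned first 4KB page
 * of the candidate superpage frame.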
3030 */ 3031 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 3032 setpde: 3033 newpde = *firstpte; 3034 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3035 pmap_pde_p_failures++; 3036 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 3037 " in pmap %p", va, pmap); 3038 return; 3039 } 3040 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3041 /* 3042 * When PG_M is already clear, PG_RW can be cleared without 3043 * a TLB invalidation. 3044 */ 3045 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 3046 goto setpde; 3047 newpde &= ~PG_RW; 3048 } 3049 3050 /* 3051 * Examine each of the other PTEs in the specified PTP. Abort if this 3052 * PTE maps an unexpected 4KB physical page or does not have identical 3053 * characteristics to the first PTE. 3054 */ 3055 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3056 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3057 setpte: 3058 oldpte = *pte; 3059 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3060 pmap_pde_p_failures++; 3061 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 3062 " in pmap %p", va, pmap); 3063 return; 3064 } 3065 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3066 /* 3067 * When PG_M is already clear, PG_RW can be cleared 3068 * without a TLB invalidation. 3069 */ 3070 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 3071 goto setpte; 3072 oldpte &= ~PG_RW; 3073 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3074 (va & ~PDRMASK); 3075 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 3076 " in pmap %p", oldpteva, pmap); 3077 } 3078 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3079 pmap_pde_p_failures++; 3080 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 3081 " in pmap %p", va, pmap); 3082 return; 3083 } 3084 pa -= PAGE_SIZE; 3085 } 3086 3087 /* 3088 * Save the page table page in its current state until the PDE 3089 * mapping the superpage is demoted by pmap_demote_pde() or 3090 * destroyed by pmap_remove_pde(). 3091 */ 3092 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3093 KASSERT(mpte >= vm_page_array && 3094 mpte < &vm_page_array[vm_page_array_size], 3095 ("pmap_promote_pde: page table page is out of range")); 3096 KASSERT(mpte->pindex == pmap_pde_pindex(va), 3097 ("pmap_promote_pde: page table page's pindex is wrong")); 3098 pmap_insert_pt_page(pmap, mpte); 3099 3100 /* 3101 * Promote the pv entries. 3102 */ 3103 if ((newpde & PG_MANAGED) != 0) 3104 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3105 3106 /* 3107 * Propagate the PAT index to its proper position. 3108 */ 3109 if ((newpde & PG_PTE_PAT) != 0) 3110 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3111 3112 /* 3113 * Map the superpage. 3114 */ 3115 if (workaround_erratum383) 3116 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3117 else 3118 pde_store(pde, PG_PS | newpde); 3119 3120 pmap_pde_promotions++; 3121 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 3122 " in pmap %p", va, pmap); 3123 } 3124 3125 /* 3126 * Insert the given physical page (p) at 3127 * the specified virtual address (v) in the 3128 * target physical map with the protection requested. 3129 * 3130 * If specified, the page will be wired down, meaning 3131 * that the related pte can not be reclaimed. 3132 * 3133 * NB: This is the only routine which MAY NOT lazy-evaluate 3134 * or lose information. That is, this routine must actually 3135 * insert this page into the given map NOW. 
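 *
 * A typical (illustrative) caller is the page fault handler, which
 * invokes pmap_enter() with the page busied; hence the assertion below
 * that "m" is either unmanaged/fictitious or marked VPO_BUSY.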
3136 */ 3137 void 3138 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, 3139 vm_prot_t prot, boolean_t wired) 3140 { 3141 pd_entry_t *pde; 3142 pt_entry_t *pte; 3143 pt_entry_t newpte, origpte; 3144 pv_entry_t pv; 3145 vm_paddr_t opa, pa; 3146 vm_page_t mpte, om; 3147 boolean_t invlva; 3148 3149 va = trunc_page(va); 3150 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 3151 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 3152 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 3153 va)); 3154 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0 || 3155 (m->oflags & VPO_BUSY) != 0, 3156 ("pmap_enter: page %p is not busy", m)); 3157 3158 mpte = NULL; 3159 3160 vm_page_lock_queues(); 3161 PMAP_LOCK(pmap); 3162 3163 /* 3164 * In the case that a page table page is not 3165 * resident, we are creating it here. 3166 */ 3167 if (va < VM_MAXUSER_ADDRESS) 3168 mpte = pmap_allocpte(pmap, va, M_WAITOK); 3169 3170 pde = pmap_pde(pmap, va); 3171 if (pde != NULL && (*pde & PG_V) != 0) { 3172 if ((*pde & PG_PS) != 0) 3173 panic("pmap_enter: attempted pmap_enter on 2MB page"); 3174 pte = pmap_pde_to_pte(pde, va); 3175 } else 3176 panic("pmap_enter: invalid page directory va=%#lx", va); 3177 3178 pa = VM_PAGE_TO_PHYS(m); 3179 om = NULL; 3180 origpte = *pte; 3181 opa = origpte & PG_FRAME; 3182 3183 /* 3184 * Mapping has not changed, must be protection or wiring change. 3185 */ 3186 if (origpte && (opa == pa)) { 3187 /* 3188 * Wiring change, just update stats. We don't worry about 3189 * wiring PT pages as they remain resident as long as there 3190 * are valid mappings in them. Hence, if a user page is wired, 3191 * the PT page will be also. 3192 */ 3193 if (wired && ((origpte & PG_W) == 0)) 3194 pmap->pm_stats.wired_count++; 3195 else if (!wired && (origpte & PG_W)) 3196 pmap->pm_stats.wired_count--; 3197 3198 /* 3199 * Remove extra pte reference 3200 */ 3201 if (mpte) 3202 mpte->wire_count--; 3203 3204 if (origpte & PG_MANAGED) { 3205 om = m; 3206 pa |= PG_MANAGED; 3207 } 3208 goto validate; 3209 } 3210 3211 pv = NULL; 3212 3213 /* 3214 * Mapping has changed, invalidate old range and fall through to 3215 * handle validating new mapping. 3216 */ 3217 if (opa) { 3218 if (origpte & PG_W) 3219 pmap->pm_stats.wired_count--; 3220 if (origpte & PG_MANAGED) { 3221 om = PHYS_TO_VM_PAGE(opa); 3222 pv = pmap_pvh_remove(&om->md, pmap, va); 3223 } 3224 if (mpte != NULL) { 3225 mpte->wire_count--; 3226 KASSERT(mpte->wire_count > 0, 3227 ("pmap_enter: missing reference to page table page," 3228 " va: 0x%lx", va)); 3229 } 3230 } else 3231 pmap_resident_count_inc(pmap, 1); 3232 3233 /* 3234 * Enter on the PV list if part of our managed memory. 3235 */ 3236 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 3237 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3238 ("pmap_enter: managed mapping within the clean submap")); 3239 if (pv == NULL) 3240 pv = get_pv_entry(pmap, FALSE); 3241 pv->pv_va = va; 3242 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3243 pa |= PG_MANAGED; 3244 } else if (pv != NULL) 3245 free_pv_entry(pmap, pv); 3246 3247 /* 3248 * Increment counters 3249 */ 3250 if (wired) 3251 pmap->pm_stats.wired_count++; 3252 3253 validate: 3254 /* 3255 * Now validate mapping with desired protection/wiring. 
3256 */ 3257 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); 3258 if ((prot & VM_PROT_WRITE) != 0) { 3259 newpte |= PG_RW; 3260 if ((newpte & PG_MANAGED) != 0) 3261 vm_page_flag_set(m, PG_WRITEABLE); 3262 } 3263 if ((prot & VM_PROT_EXECUTE) == 0) 3264 newpte |= pg_nx; 3265 if (wired) 3266 newpte |= PG_W; 3267 if (va < VM_MAXUSER_ADDRESS) 3268 newpte |= PG_U; 3269 if (pmap == kernel_pmap) 3270 newpte |= PG_G; 3271 3272 /* 3273 * if the mapping or permission bits are different, we need 3274 * to update the pte. 3275 */ 3276 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3277 newpte |= PG_A; 3278 if ((access & VM_PROT_WRITE) != 0) 3279 newpte |= PG_M; 3280 if (origpte & PG_V) { 3281 invlva = FALSE; 3282 origpte = pte_load_store(pte, newpte); 3283 if (origpte & PG_A) { 3284 if (origpte & PG_MANAGED) 3285 vm_page_flag_set(om, PG_REFERENCED); 3286 if (opa != VM_PAGE_TO_PHYS(m) || ((origpte & 3287 PG_NX) == 0 && (newpte & PG_NX))) 3288 invlva = TRUE; 3289 } 3290 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3291 if ((origpte & PG_MANAGED) != 0) 3292 vm_page_dirty(om); 3293 if ((newpte & PG_RW) == 0) 3294 invlva = TRUE; 3295 } 3296 if ((origpte & PG_MANAGED) != 0 && 3297 TAILQ_EMPTY(&om->md.pv_list) && 3298 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)) 3299 vm_page_flag_clear(om, PG_WRITEABLE); 3300 if (invlva) 3301 pmap_invalidate_page(pmap, va); 3302 } else 3303 pte_store(pte, newpte); 3304 } 3305 3306 /* 3307 * If both the page table page and the reservation are fully 3308 * populated, then attempt promotion. 3309 */ 3310 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3311 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0) 3312 pmap_promote_pde(pmap, pde, va); 3313 3314 vm_page_unlock_queues(); 3315 PMAP_UNLOCK(pmap); 3316 } 3317 3318 /* 3319 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE 3320 * otherwise. Fails if (1) a page table page cannot be allocated without 3321 * blocking, (2) a mapping already exists at the specified virtual address, or 3322 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 3323 */ 3324 static boolean_t 3325 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3326 { 3327 pd_entry_t *pde, newpde; 3328 vm_page_t free, mpde; 3329 3330 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3331 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3332 if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) { 3333 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3334 " in pmap %p", va, pmap); 3335 return (FALSE); 3336 } 3337 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); 3338 pde = &pde[pmap_pde_index(va)]; 3339 if ((*pde & PG_V) != 0) { 3340 KASSERT(mpde->wire_count > 1, 3341 ("pmap_enter_pde: mpde's wire count is too low")); 3342 mpde->wire_count--; 3343 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3344 " in pmap %p", va, pmap); 3345 return (FALSE); 3346 } 3347 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3348 PG_PS | PG_V; 3349 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 3350 newpde |= PG_MANAGED; 3351 3352 /* 3353 * Abort this mapping if its PV entry could not be created. 
3354 */ 3355 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3356 free = NULL; 3357 if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) { 3358 pmap_invalidate_page(pmap, va); 3359 pmap_free_zero_pages(free); 3360 } 3361 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3362 " in pmap %p", va, pmap); 3363 return (FALSE); 3364 } 3365 } 3366 if ((prot & VM_PROT_EXECUTE) == 0) 3367 newpde |= pg_nx; 3368 if (va < VM_MAXUSER_ADDRESS) 3369 newpde |= PG_U; 3370 3371 /* 3372 * Increment counters. 3373 */ 3374 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 3375 3376 /* 3377 * Map the superpage. 3378 */ 3379 pde_store(pde, newpde); 3380 3381 pmap_pde_mappings++; 3382 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3383 " in pmap %p", va, pmap); 3384 return (TRUE); 3385 } 3386 3387 /* 3388 * Maps a sequence of resident pages belonging to the same object. 3389 * The sequence begins with the given page m_start. This page is 3390 * mapped at the given virtual address start. Each subsequent page is 3391 * mapped at a virtual address that is offset from start by the same 3392 * amount as the page is offset from m_start within the object. The 3393 * last page in the sequence is the page with the largest offset from 3394 * m_start that can be mapped at a virtual address less than the given 3395 * virtual address end. Not every virtual page between start and end 3396 * is mapped; only those for which a resident page exists with the 3397 * corresponding offset from m_start are mapped. 3398 */ 3399 void 3400 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3401 vm_page_t m_start, vm_prot_t prot) 3402 { 3403 vm_offset_t va; 3404 vm_page_t m, mpte; 3405 vm_pindex_t diff, psize; 3406 3407 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); 3408 psize = atop(end - start); 3409 mpte = NULL; 3410 m = m_start; 3411 vm_page_lock_queues(); 3412 PMAP_LOCK(pmap); 3413 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3414 va = start + ptoa(diff); 3415 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3416 (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && 3417 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && 3418 pmap_enter_pde(pmap, va, m, prot)) 3419 m = &m[NBPDR / PAGE_SIZE - 1]; 3420 else 3421 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3422 mpte); 3423 m = TAILQ_NEXT(m, listq); 3424 } 3425 vm_page_unlock_queues(); 3426 PMAP_UNLOCK(pmap); 3427 } 3428 3429 /* 3430 * this code makes some *MAJOR* assumptions: 3431 * 1. Current pmap & pmap exists. 3432 * 2. Not wired. 3433 * 3. Read access. 3434 * 4. No page table pages. 3435 * but is *MUCH* faster than pmap_enter... 3436 */ 3437 3438 void 3439 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3440 { 3441 3442 vm_page_lock_queues(); 3443 PMAP_LOCK(pmap); 3444 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3445 vm_page_unlock_queues(); 3446 PMAP_UNLOCK(pmap); 3447 } 3448 3449 static vm_page_t 3450 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3451 vm_prot_t prot, vm_page_t mpte) 3452 { 3453 vm_page_t free; 3454 pt_entry_t *pte; 3455 vm_paddr_t pa; 3456 3457 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3458 (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, 3459 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3460 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3461 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3462 3463 /* 3464 * In the case that a page table page is not 3465 * resident, we are creating it here. 
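 *
 * Unlike pmap_enter(), the allocation here uses M_NOWAIT: if the page
 * table page cannot be allocated without sleeping, NULL is returned
 * and the speculative mapping is simply skipped, since these "quick"
 * mappings are only an optimization.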
3466 */ 3467 if (va < VM_MAXUSER_ADDRESS) { 3468 vm_pindex_t ptepindex; 3469 pd_entry_t *ptepa; 3470 3471 /* 3472 * Calculate pagetable page index 3473 */ 3474 ptepindex = pmap_pde_pindex(va); 3475 if (mpte && (mpte->pindex == ptepindex)) { 3476 mpte->wire_count++; 3477 } else { 3478 /* 3479 * Get the page directory entry 3480 */ 3481 ptepa = pmap_pde(pmap, va); 3482 3483 /* 3484 * If the page table page is mapped, we just increment 3485 * the hold count, and activate it. 3486 */ 3487 if (ptepa && (*ptepa & PG_V) != 0) { 3488 if (*ptepa & PG_PS) 3489 return (NULL); 3490 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 3491 mpte->wire_count++; 3492 } else { 3493 mpte = _pmap_allocpte(pmap, ptepindex, 3494 M_NOWAIT); 3495 if (mpte == NULL) 3496 return (mpte); 3497 } 3498 } 3499 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3500 pte = &pte[pmap_pte_index(va)]; 3501 } else { 3502 mpte = NULL; 3503 pte = vtopte(va); 3504 } 3505 if (*pte) { 3506 if (mpte != NULL) { 3507 mpte->wire_count--; 3508 mpte = NULL; 3509 } 3510 return (mpte); 3511 } 3512 3513 /* 3514 * Enter on the PV list if part of our managed memory. 3515 */ 3516 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && 3517 !pmap_try_insert_pv_entry(pmap, va, m)) { 3518 if (mpte != NULL) { 3519 free = NULL; 3520 if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) { 3521 pmap_invalidate_page(pmap, va); 3522 pmap_free_zero_pages(free); 3523 } 3524 mpte = NULL; 3525 } 3526 return (mpte); 3527 } 3528 3529 /* 3530 * Increment counters 3531 */ 3532 pmap_resident_count_inc(pmap, 1); 3533 3534 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 3535 if ((prot & VM_PROT_EXECUTE) == 0) 3536 pa |= pg_nx; 3537 3538 /* 3539 * Now validate mapping with RO protection 3540 */ 3541 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 3542 pte_store(pte, pa | PG_V | PG_U); 3543 else 3544 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3545 return (mpte); 3546 } 3547 3548 /* 3549 * Make a temporary mapping for a physical address. This is only intended 3550 * to be used for panic dumps. 3551 */ 3552 void * 3553 pmap_kenter_temporary(vm_paddr_t pa, int i) 3554 { 3555 vm_offset_t va; 3556 3557 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3558 pmap_kenter(va, pa); 3559 invlpg(va); 3560 return ((void *)crashdumpmap); 3561 } 3562 3563 /* 3564 * This code maps large physical mmap regions into the 3565 * processor address space. Note that some shortcuts 3566 * are taken, but the code works. 3567 */ 3568 void 3569 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3570 vm_pindex_t pindex, vm_size_t size) 3571 { 3572 pd_entry_t *pde; 3573 vm_paddr_t pa, ptepa; 3574 vm_page_t p, pdpg; 3575 int pat_mode; 3576 3577 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 3578 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3579 ("pmap_object_init_pt: non-device object")); 3580 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 3581 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3582 return; 3583 p = vm_page_lookup(object, pindex); 3584 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3585 ("pmap_object_init_pt: invalid page %p", p)); 3586 pat_mode = p->md.pat_mode; 3587 3588 /* 3589 * Abort the mapping if the first page is not physically 3590 * aligned to a 2MB page boundary. 3591 */ 3592 ptepa = VM_PAGE_TO_PHYS(p); 3593 if (ptepa & (NBPDR - 1)) 3594 return; 3595 3596 /* 3597 * Skip the first page. Abort the mapping if the rest of 3598 * the pages are not physically contiguous or have differing 3599 * memory attributes. 
3600 */ 3601 p = TAILQ_NEXT(p, listq); 3602 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3603 pa += PAGE_SIZE) { 3604 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3605 ("pmap_object_init_pt: invalid page %p", p)); 3606 if (pa != VM_PAGE_TO_PHYS(p) || 3607 pat_mode != p->md.pat_mode) 3608 return; 3609 p = TAILQ_NEXT(p, listq); 3610 } 3611 3612 /* 3613 * Map using 2MB pages. Since "ptepa" is 2M aligned and 3614 * "size" is a multiple of 2M, adding the PAT setting to "pa" 3615 * will not affect the termination of this loop. 3616 */ 3617 PMAP_LOCK(pmap); 3618 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 3619 size; pa += NBPDR) { 3620 pdpg = pmap_allocpde(pmap, addr, M_NOWAIT); 3621 if (pdpg == NULL) { 3622 /* 3623 * The creation of mappings below is only an 3624 * optimization. If a page directory page 3625 * cannot be allocated without blocking, 3626 * continue on to the next mapping rather than 3627 * blocking. 3628 */ 3629 addr += NBPDR; 3630 continue; 3631 } 3632 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 3633 pde = &pde[pmap_pde_index(addr)]; 3634 if ((*pde & PG_V) == 0) { 3635 pde_store(pde, pa | PG_PS | PG_M | PG_A | 3636 PG_U | PG_RW | PG_V); 3637 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 3638 pmap_pde_mappings++; 3639 } else { 3640 /* Continue on if the PDE is already valid. */ 3641 pdpg->wire_count--; 3642 KASSERT(pdpg->wire_count > 0, 3643 ("pmap_object_init_pt: missing reference " 3644 "to page directory page, va: 0x%lx", addr)); 3645 } 3646 addr += NBPDR; 3647 } 3648 PMAP_UNLOCK(pmap); 3649 } 3650 } 3651 3652 /* 3653 * Routine: pmap_change_wiring 3654 * Function: Change the wiring attribute for a map/virtual-address 3655 * pair. 3656 * In/out conditions: 3657 * The mapping must already exist in the pmap. 3658 */ 3659 void 3660 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 3661 { 3662 pd_entry_t *pde; 3663 pt_entry_t *pte; 3664 boolean_t are_queues_locked; 3665 3666 are_queues_locked = FALSE; 3667 3668 /* 3669 * Wiring is not a hardware characteristic so there is no need to 3670 * invalidate TLB. 3671 */ 3672 retry: 3673 PMAP_LOCK(pmap); 3674 pde = pmap_pde(pmap, va); 3675 if ((*pde & PG_PS) != 0) { 3676 if (!wired != ((*pde & PG_W) == 0)) { 3677 if (!are_queues_locked) { 3678 are_queues_locked = TRUE; 3679 if (!mtx_trylock(&vm_page_queue_mtx)) { 3680 PMAP_UNLOCK(pmap); 3681 vm_page_lock_queues(); 3682 goto retry; 3683 } 3684 } 3685 if (!pmap_demote_pde(pmap, pde, va)) 3686 panic("pmap_change_wiring: demotion failed"); 3687 } else 3688 goto out; 3689 } 3690 pte = pmap_pde_to_pte(pde, va); 3691 if (wired && (*pte & PG_W) == 0) { 3692 pmap->pm_stats.wired_count++; 3693 atomic_set_long(pte, PG_W); 3694 } else if (!wired && (*pte & PG_W) != 0) { 3695 pmap->pm_stats.wired_count--; 3696 atomic_clear_long(pte, PG_W); 3697 } 3698 out: 3699 if (are_queues_locked) 3700 vm_page_unlock_queues(); 3701 PMAP_UNLOCK(pmap); 3702 } 3703 3704 /* 3705 * Copy the range specified by src_addr/len 3706 * from the source map to the range dst_addr/len 3707 * in the destination map. 3708 * 3709 * This routine is only advisory and need not do anything. 
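 *
 * In practice the copy is only attempted when the source and
 * destination ranges coincide (dst_addr == src_addr), as on fork();
 * otherwise the routine returns at once, and any mapping that cannot
 * be copied cheaply is left to be re-created by a later page fault.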
3710 */ 3711 3712 void 3713 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3714 vm_offset_t src_addr) 3715 { 3716 vm_page_t free; 3717 vm_offset_t addr; 3718 vm_offset_t end_addr = src_addr + len; 3719 vm_offset_t va_next; 3720 3721 if (dst_addr != src_addr) 3722 return; 3723 3724 vm_page_lock_queues(); 3725 if (dst_pmap < src_pmap) { 3726 PMAP_LOCK(dst_pmap); 3727 PMAP_LOCK(src_pmap); 3728 } else { 3729 PMAP_LOCK(src_pmap); 3730 PMAP_LOCK(dst_pmap); 3731 } 3732 for (addr = src_addr; addr < end_addr; addr = va_next) { 3733 pt_entry_t *src_pte, *dst_pte; 3734 vm_page_t dstmpde, dstmpte, srcmpte; 3735 pml4_entry_t *pml4e; 3736 pdp_entry_t *pdpe; 3737 pd_entry_t srcptepaddr, *pde; 3738 3739 KASSERT(addr < UPT_MIN_ADDRESS, 3740 ("pmap_copy: invalid to pmap_copy page tables")); 3741 3742 pml4e = pmap_pml4e(src_pmap, addr); 3743 if ((*pml4e & PG_V) == 0) { 3744 va_next = (addr + NBPML4) & ~PML4MASK; 3745 if (va_next < addr) 3746 va_next = end_addr; 3747 continue; 3748 } 3749 3750 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 3751 if ((*pdpe & PG_V) == 0) { 3752 va_next = (addr + NBPDP) & ~PDPMASK; 3753 if (va_next < addr) 3754 va_next = end_addr; 3755 continue; 3756 } 3757 3758 va_next = (addr + NBPDR) & ~PDRMASK; 3759 if (va_next < addr) 3760 va_next = end_addr; 3761 3762 pde = pmap_pdpe_to_pde(pdpe, addr); 3763 srcptepaddr = *pde; 3764 if (srcptepaddr == 0) 3765 continue; 3766 3767 if (srcptepaddr & PG_PS) { 3768 dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT); 3769 if (dstmpde == NULL) 3770 break; 3771 pde = (pd_entry_t *) 3772 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); 3773 pde = &pde[pmap_pde_index(addr)]; 3774 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 3775 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 3776 PG_PS_FRAME))) { 3777 *pde = srcptepaddr & ~PG_W; 3778 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); 3779 } else 3780 dstmpde->wire_count--; 3781 continue; 3782 } 3783 3784 srcptepaddr &= PG_FRAME; 3785 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 3786 KASSERT(srcmpte->wire_count > 0, 3787 ("pmap_copy: source page table page is unused")); 3788 3789 if (va_next > end_addr) 3790 va_next = end_addr; 3791 3792 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 3793 src_pte = &src_pte[pmap_pte_index(addr)]; 3794 dstmpte = NULL; 3795 while (addr < va_next) { 3796 pt_entry_t ptetemp; 3797 ptetemp = *src_pte; 3798 /* 3799 * we only virtual copy managed pages 3800 */ 3801 if ((ptetemp & PG_MANAGED) != 0) { 3802 if (dstmpte != NULL && 3803 dstmpte->pindex == pmap_pde_pindex(addr)) 3804 dstmpte->wire_count++; 3805 else if ((dstmpte = pmap_allocpte(dst_pmap, 3806 addr, M_NOWAIT)) == NULL) 3807 goto out; 3808 dst_pte = (pt_entry_t *) 3809 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 3810 dst_pte = &dst_pte[pmap_pte_index(addr)]; 3811 if (*dst_pte == 0 && 3812 pmap_try_insert_pv_entry(dst_pmap, addr, 3813 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 3814 /* 3815 * Clear the wired, modified, and 3816 * accessed (referenced) bits 3817 * during the copy. 
3818 */ 3819 *dst_pte = ptetemp & ~(PG_W | PG_M | 3820 PG_A); 3821 pmap_resident_count_inc(dst_pmap, 1); 3822 } else { 3823 free = NULL; 3824 if (pmap_unwire_pte_hold(dst_pmap, 3825 addr, dstmpte, &free)) { 3826 pmap_invalidate_page(dst_pmap, 3827 addr); 3828 pmap_free_zero_pages(free); 3829 } 3830 goto out; 3831 } 3832 if (dstmpte->wire_count >= srcmpte->wire_count) 3833 break; 3834 } 3835 addr += PAGE_SIZE; 3836 src_pte++; 3837 } 3838 } 3839 out: 3840 vm_page_unlock_queues(); 3841 PMAP_UNLOCK(src_pmap); 3842 PMAP_UNLOCK(dst_pmap); 3843 } 3844 3845 /* 3846 * pmap_zero_page zeros the specified hardware page by mapping 3847 * the page into KVM and using bzero to clear its contents. 3848 */ 3849 void 3850 pmap_zero_page(vm_page_t m) 3851 { 3852 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3853 3854 pagezero((void *)va); 3855 } 3856 3857 /* 3858 * pmap_zero_page_area zeros the specified hardware page by mapping 3859 * the page into KVM and using bzero to clear its contents. 3860 * 3861 * off and size may not cover an area beyond a single hardware page. 3862 */ 3863 void 3864 pmap_zero_page_area(vm_page_t m, int off, int size) 3865 { 3866 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3867 3868 if (off == 0 && size == PAGE_SIZE) 3869 pagezero((void *)va); 3870 else 3871 bzero((char *)va + off, size); 3872 } 3873 3874 /* 3875 * pmap_zero_page_idle zeros the specified hardware page by mapping 3876 * the page into KVM and using bzero to clear its contents. This 3877 * is intended to be called from the vm_pagezero process only and 3878 * outside of Giant. 3879 */ 3880 void 3881 pmap_zero_page_idle(vm_page_t m) 3882 { 3883 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3884 3885 pagezero((void *)va); 3886 } 3887 3888 /* 3889 * pmap_copy_page copies the specified (machine independent) 3890 * page by mapping the page into virtual memory and using 3891 * bcopy to copy the page, one machine dependent page at a 3892 * time. 3893 */ 3894 void 3895 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3896 { 3897 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3898 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3899 3900 pagecopy((void *)src, (void *)dst); 3901 } 3902 3903 /* 3904 * Returns true if the pmap's pv is one of the first 3905 * 16 pvs linked to from this page. This count may 3906 * be changed upwards or downwards in the future; it 3907 * is only necessary that true be returned for a small 3908 * subset of pmaps for proper page aging. 3909 */ 3910 boolean_t 3911 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3912 { 3913 struct md_page *pvh; 3914 pv_entry_t pv; 3915 int loops = 0; 3916 boolean_t rv; 3917 3918 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 3919 ("pmap_page_exists_quick: page %p is not managed", m)); 3920 rv = FALSE; 3921 vm_page_lock_queues(); 3922 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3923 if (PV_PMAP(pv) == pmap) { 3924 rv = TRUE; 3925 break; 3926 } 3927 loops++; 3928 if (loops >= 16) 3929 break; 3930 } 3931 if (!rv && loops < 16) { 3932 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3933 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 3934 if (PV_PMAP(pv) == pmap) { 3935 rv = TRUE; 3936 break; 3937 } 3938 loops++; 3939 if (loops >= 16) 3940 break; 3941 } 3942 } 3943 vm_page_unlock_queues(); 3944 return (rv); 3945 } 3946 3947 /* 3948 * pmap_page_wired_mappings: 3949 * 3950 * Return the number of managed mappings to the given physical page 3951 * that are wired. 
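 *
 * Both the page's own pv list (4KB mappings) and the pv list of its
 * enclosing 2MB frame (pa_to_pvh()) are scanned, so wirings that
 * exist only through a 2MB superpage mapping are reflected in the
 * count as well.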
3952 */ 3953 int 3954 pmap_page_wired_mappings(vm_page_t m) 3955 { 3956 int count; 3957 3958 count = 0; 3959 if ((m->flags & PG_FICTITIOUS) != 0) 3960 return (count); 3961 vm_page_lock_queues(); 3962 count = pmap_pvh_wired_mappings(&m->md, count); 3963 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); 3964 vm_page_unlock_queues(); 3965 return (count); 3966 } 3967 3968 /* 3969 * pmap_pvh_wired_mappings: 3970 * 3971 * Return the updated number "count" of managed mappings that are wired. 3972 */ 3973 static int 3974 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 3975 { 3976 pmap_t pmap; 3977 pt_entry_t *pte; 3978 pv_entry_t pv; 3979 3980 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3981 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 3982 pmap = PV_PMAP(pv); 3983 PMAP_LOCK(pmap); 3984 pte = pmap_pte(pmap, pv->pv_va); 3985 if ((*pte & PG_W) != 0) 3986 count++; 3987 PMAP_UNLOCK(pmap); 3988 } 3989 return (count); 3990 } 3991 3992 /* 3993 * Returns TRUE if the given page is mapped individually or as part of 3994 * a 2mpage. Otherwise, returns FALSE. 3995 */ 3996 boolean_t 3997 pmap_page_is_mapped(vm_page_t m) 3998 { 3999 boolean_t rv; 4000 4001 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) 4002 return (FALSE); 4003 vm_page_lock_queues(); 4004 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4005 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list); 4006 vm_page_unlock_queues(); 4007 return (rv); 4008 } 4009 4010 /* 4011 * Remove all pages from specified address space 4012 * this aids process exit speeds. Also, this code 4013 * is special cased for current process only, but 4014 * can have the more generic (and slightly slower) 4015 * mode enabled. This is much faster than pmap_remove 4016 * in the case of running down an entire address space. 
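 *
 * The function is only expected to operate on the current pmap (see the
 * curpmap check below).  It walks the pmap's pv chunks rather than the
 * page tables, so only managed mappings are removed; wired mappings are
 * deliberately skipped.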
4017 */ 4018 void 4019 pmap_remove_pages(pmap_t pmap) 4020 { 4021 pd_entry_t ptepde; 4022 pt_entry_t *pte, tpte; 4023 vm_page_t free = NULL; 4024 vm_page_t m, mpte, mt; 4025 pv_entry_t pv; 4026 struct md_page *pvh; 4027 struct pv_chunk *pc, *npc; 4028 int field, idx; 4029 int64_t bit; 4030 uint64_t inuse, bitmask; 4031 int allfree; 4032 4033 if (pmap != PCPU_GET(curpmap)) { 4034 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4035 return; 4036 } 4037 vm_page_lock_queues(); 4038 PMAP_LOCK(pmap); 4039 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4040 allfree = 1; 4041 for (field = 0; field < _NPCM; field++) { 4042 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 4043 while (inuse != 0) { 4044 bit = bsfq(inuse); 4045 bitmask = 1UL << bit; 4046 idx = field * 64 + bit; 4047 pv = &pc->pc_pventry[idx]; 4048 inuse &= ~bitmask; 4049 4050 pte = pmap_pdpe(pmap, pv->pv_va); 4051 ptepde = *pte; 4052 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 4053 tpte = *pte; 4054 if ((tpte & (PG_PS | PG_V)) == PG_V) { 4055 ptepde = tpte; 4056 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 4057 PG_FRAME); 4058 pte = &pte[pmap_pte_index(pv->pv_va)]; 4059 tpte = *pte & ~PG_PTE_PAT; 4060 } 4061 if ((tpte & PG_V) == 0) 4062 panic("bad pte"); 4063 4064 /* 4065 * We cannot remove wired pages from a process' mapping at this time 4066 */ 4067 if (tpte & PG_W) { 4068 allfree = 0; 4069 continue; 4070 } 4071 4072 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4073 KASSERT(m->phys_addr == (tpte & PG_FRAME), 4074 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4075 m, (uintmax_t)m->phys_addr, 4076 (uintmax_t)tpte)); 4077 4078 KASSERT(m < &vm_page_array[vm_page_array_size], 4079 ("pmap_remove_pages: bad tpte %#jx", 4080 (uintmax_t)tpte)); 4081 4082 pte_clear(pte); 4083 4084 /* 4085 * Update the vm_page_t clean/reference bits. 
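				 * Only a writable mapping can carry a
				 * meaningful PG_M here; when write access is
				 * revoked (see pmap_remove_write), any
				 * modified state has already been folded
				 * into the vm_page.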
4086 */ 4087 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4088 if ((tpte & PG_PS) != 0) { 4089 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4090 vm_page_dirty(mt); 4091 } else 4092 vm_page_dirty(m); 4093 } 4094 4095 /* Mark free */ 4096 PV_STAT(pv_entry_frees++); 4097 PV_STAT(pv_entry_spare++); 4098 pv_entry_count--; 4099 pc->pc_map[field] |= bitmask; 4100 if ((tpte & PG_PS) != 0) { 4101 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 4102 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4103 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 4104 if (TAILQ_EMPTY(&pvh->pv_list)) { 4105 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4106 if (TAILQ_EMPTY(&mt->md.pv_list)) 4107 vm_page_flag_clear(mt, PG_WRITEABLE); 4108 } 4109 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 4110 if (mpte != NULL) { 4111 pmap_remove_pt_page(pmap, mpte); 4112 pmap_resident_count_dec(pmap, 1); 4113 KASSERT(mpte->wire_count == NPTEPG, 4114 ("pmap_remove_pages: pte page wire count error")); 4115 mpte->wire_count = 0; 4116 pmap_add_delayed_free_list(mpte, &free, FALSE); 4117 atomic_subtract_int(&cnt.v_wire_count, 1); 4118 } 4119 } else { 4120 pmap_resident_count_dec(pmap, 1); 4121 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 4122 if (TAILQ_EMPTY(&m->md.pv_list)) { 4123 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4124 if (TAILQ_EMPTY(&pvh->pv_list)) 4125 vm_page_flag_clear(m, PG_WRITEABLE); 4126 } 4127 } 4128 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 4129 } 4130 } 4131 if (allfree) { 4132 PV_STAT(pv_entry_spare -= _NPCPV); 4133 PV_STAT(pc_chunk_count--); 4134 PV_STAT(pc_chunk_frees++); 4135 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4136 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 4137 dump_drop_page(m->phys_addr); 4138 vm_page_unwire(m, 0); 4139 vm_page_free(m); 4140 } 4141 } 4142 pmap_invalidate_all(pmap); 4143 vm_page_unlock_queues(); 4144 PMAP_UNLOCK(pmap); 4145 pmap_free_zero_pages(free); 4146 } 4147 4148 /* 4149 * pmap_is_modified: 4150 * 4151 * Return whether or not the specified physical page was modified 4152 * in any physical maps. 4153 */ 4154 boolean_t 4155 pmap_is_modified(vm_page_t m) 4156 { 4157 boolean_t rv; 4158 4159 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 4160 ("pmap_is_modified: page %p is not managed", m)); 4161 4162 /* 4163 * If the page is not VPO_BUSY, then PG_WRITEABLE cannot be 4164 * concurrently set while the object is locked. Thus, if PG_WRITEABLE 4165 * is clear, no PTEs can have PG_M set. 4166 */ 4167 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4168 if ((m->oflags & VPO_BUSY) == 0 && 4169 (m->flags & PG_WRITEABLE) == 0) 4170 return (FALSE); 4171 vm_page_lock_queues(); 4172 rv = pmap_is_modified_pvh(&m->md) || 4173 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))); 4174 vm_page_unlock_queues(); 4175 return (rv); 4176 } 4177 4178 /* 4179 * Returns TRUE if any of the given mappings were used to modify 4180 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4181 * mappings are supported. 
 */
static boolean_t
pmap_is_modified_pvh(struct md_page *pvh)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pmap_t pmap;
	boolean_t rv;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	rv = FALSE;
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte(pmap, pv->pv_va);
		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	return (rv);
}

/*
 * pmap_is_prefaultable:
 *
 * Return whether or not the specified virtual address is eligible
 * for prefault.
 */
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	boolean_t rv;

	rv = FALSE;
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, addr);
	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
		pte = pmap_pde_to_pte(pde, addr);
		rv = (*pte & PG_V) == 0;
	}
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * pmap_is_referenced:
 *
 * Return whether or not the specified physical page was referenced
 * in any physical maps.
 */
boolean_t
pmap_is_referenced(vm_page_t m)
{
	boolean_t rv;

	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
	    ("pmap_is_referenced: page %p is not managed", m));
	vm_page_lock_queues();
	rv = pmap_is_referenced_pvh(&m->md) ||
	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)));
	vm_page_unlock_queues();
	return (rv);
}

/*
 * Returns TRUE if any of the given mappings were referenced and FALSE
 * otherwise.  Both page and 2mpage mappings are supported.
 */
static boolean_t
pmap_is_referenced_pvh(struct md_page *pvh)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pmap_t pmap;
	boolean_t rv;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	rv = FALSE;
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte(pmap, pv->pv_va);
		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	return (rv);
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *pde;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;

	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
	    ("pmap_remove_write: page %p is not managed", m));

	/*
	 * If the page is not VPO_BUSY, then PG_WRITEABLE cannot be set by
	 * another thread while the object is locked.  Thus, if PG_WRITEABLE
	 * is clear, no page table entries need updating.
	 */
	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if ((m->oflags & VPO_BUSY) == 0 &&
	    (m->flags & PG_WRITEABLE) == 0)
		return;
	vm_page_lock_queues();
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		va = pv->pv_va;
		pde = pmap_pde(pmap, va);
		if ((*pde & PG_RW) != 0)
			(void)pmap_demote_pde(pmap, pde, va);
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
retry:
		oldpte = *pte;
		if (oldpte & PG_RW) {
			if (!atomic_cmpset_long(pte, oldpte, oldpte &
			    ~(PG_RW | PG_M)))
				goto retry;
			if ((oldpte & PG_M) != 0)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	vm_page_flag_clear(m, PG_WRITEABLE);
	vm_page_unlock_queues();
}

/*
 * pmap_ts_referenced:
 *
 * Return a count of reference bits for a page, clearing those bits.
 * It is not necessary for every reference bit to be cleared, but it
 * is necessary that 0 only be returned when there are truly no
 * reference bits set.
 *
 * XXX: The exact number of bits to check and clear is a matter that
 * should be tested and standardized at some point in the future for
 * optimal aging of shared pages.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf, pvn;
	pmap_t pmap;
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte;
	vm_offset_t va;
	int rtval = 0;

	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	vm_page_lock_queues();
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		va = pv->pv_va;
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		if ((oldpde & PG_A) != 0) {
			if (pmap_demote_pde(pmap, pde, va)) {
				if ((oldpde & PG_W) == 0) {
					/*
					 * Remove the mapping to a single page
					 * so that a subsequent access may
					 * repromote.  Since the underlying
					 * page table page is fully populated,
					 * this removal never frees a page
					 * table page.
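					 *
					 * "va" currently refers to the 2MB
					 * mapping; adding the offset of page
					 * "m" within the 2MB physical frame
					 * yields the 4KB virtual address that
					 * is removed below.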
4377 */ 4378 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4379 PG_PS_FRAME); 4380 pmap_remove_page(pmap, va, pde, NULL); 4381 rtval++; 4382 if (rtval > 4) { 4383 PMAP_UNLOCK(pmap); 4384 goto out; 4385 } 4386 } 4387 } 4388 } 4389 PMAP_UNLOCK(pmap); 4390 } 4391 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4392 pvf = pv; 4393 do { 4394 pvn = TAILQ_NEXT(pv, pv_list); 4395 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 4396 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 4397 pmap = PV_PMAP(pv); 4398 PMAP_LOCK(pmap); 4399 pde = pmap_pde(pmap, pv->pv_va); 4400 KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:" 4401 " found a 2mpage in page %p's pv list", m)); 4402 pte = pmap_pde_to_pte(pde, pv->pv_va); 4403 if ((*pte & PG_A) != 0) { 4404 atomic_clear_long(pte, PG_A); 4405 pmap_invalidate_page(pmap, pv->pv_va); 4406 rtval++; 4407 if (rtval > 4) 4408 pvn = NULL; 4409 } 4410 PMAP_UNLOCK(pmap); 4411 } while ((pv = pvn) != NULL && pv != pvf); 4412 } 4413 out: 4414 vm_page_unlock_queues(); 4415 return (rtval); 4416 } 4417 4418 /* 4419 * Clear the modify bits on the specified physical page. 4420 */ 4421 void 4422 pmap_clear_modify(vm_page_t m) 4423 { 4424 struct md_page *pvh; 4425 pmap_t pmap; 4426 pv_entry_t next_pv, pv; 4427 pd_entry_t oldpde, *pde; 4428 pt_entry_t oldpte, *pte; 4429 vm_offset_t va; 4430 4431 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 4432 ("pmap_clear_modify: page %p is not managed", m)); 4433 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4434 KASSERT((m->oflags & VPO_BUSY) == 0, 4435 ("pmap_clear_modify: page %p is busy", m)); 4436 4437 /* 4438 * If the page is not PG_WRITEABLE, then no PTEs can have PG_M set. 4439 * If the object containing the page is locked and the page is not 4440 * VPO_BUSY, then PG_WRITEABLE cannot be concurrently set. 4441 */ 4442 if ((m->flags & PG_WRITEABLE) == 0) 4443 return; 4444 vm_page_lock_queues(); 4445 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4446 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4447 pmap = PV_PMAP(pv); 4448 PMAP_LOCK(pmap); 4449 va = pv->pv_va; 4450 pde = pmap_pde(pmap, va); 4451 oldpde = *pde; 4452 if ((oldpde & PG_RW) != 0) { 4453 if (pmap_demote_pde(pmap, pde, va)) { 4454 if ((oldpde & PG_W) == 0) { 4455 /* 4456 * Write protect the mapping to a 4457 * single page so that a subsequent 4458 * write access may repromote. 4459 */ 4460 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4461 PG_PS_FRAME); 4462 pte = pmap_pde_to_pte(pde, va); 4463 oldpte = *pte; 4464 if ((oldpte & PG_V) != 0) { 4465 while (!atomic_cmpset_long(pte, 4466 oldpte, 4467 oldpte & ~(PG_M | PG_RW))) 4468 oldpte = *pte; 4469 vm_page_dirty(m); 4470 pmap_invalidate_page(pmap, va); 4471 } 4472 } 4473 } 4474 } 4475 PMAP_UNLOCK(pmap); 4476 } 4477 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4478 pmap = PV_PMAP(pv); 4479 PMAP_LOCK(pmap); 4480 pde = pmap_pde(pmap, pv->pv_va); 4481 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 4482 " a 2mpage in page %p's pv list", m)); 4483 pte = pmap_pde_to_pte(pde, pv->pv_va); 4484 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4485 atomic_clear_long(pte, PG_M); 4486 pmap_invalidate_page(pmap, pv->pv_va); 4487 } 4488 PMAP_UNLOCK(pmap); 4489 } 4490 vm_page_unlock_queues(); 4491 } 4492 4493 /* 4494 * pmap_clear_reference: 4495 * 4496 * Clear the reference bit on the specified physical page. 
4497 */ 4498 void 4499 pmap_clear_reference(vm_page_t m) 4500 { 4501 struct md_page *pvh; 4502 pmap_t pmap; 4503 pv_entry_t next_pv, pv; 4504 pd_entry_t oldpde, *pde; 4505 pt_entry_t *pte; 4506 vm_offset_t va; 4507 4508 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 4509 ("pmap_clear_reference: page %p is not managed", m)); 4510 vm_page_lock_queues(); 4511 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4512 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4513 pmap = PV_PMAP(pv); 4514 PMAP_LOCK(pmap); 4515 va = pv->pv_va; 4516 pde = pmap_pde(pmap, va); 4517 oldpde = *pde; 4518 if ((oldpde & PG_A) != 0) { 4519 if (pmap_demote_pde(pmap, pde, va)) { 4520 /* 4521 * Remove the mapping to a single page so 4522 * that a subsequent access may repromote. 4523 * Since the underlying page table page is 4524 * fully populated, this removal never frees 4525 * a page table page. 4526 */ 4527 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4528 PG_PS_FRAME); 4529 pmap_remove_page(pmap, va, pde, NULL); 4530 } 4531 } 4532 PMAP_UNLOCK(pmap); 4533 } 4534 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4535 pmap = PV_PMAP(pv); 4536 PMAP_LOCK(pmap); 4537 pde = pmap_pde(pmap, pv->pv_va); 4538 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found" 4539 " a 2mpage in page %p's pv list", m)); 4540 pte = pmap_pde_to_pte(pde, pv->pv_va); 4541 if (*pte & PG_A) { 4542 atomic_clear_long(pte, PG_A); 4543 pmap_invalidate_page(pmap, pv->pv_va); 4544 } 4545 PMAP_UNLOCK(pmap); 4546 } 4547 vm_page_unlock_queues(); 4548 } 4549 4550 /* 4551 * Miscellaneous support routines follow 4552 */ 4553 4554 /* Adjust the cache mode for a 4KB page mapped via a PTE. */ 4555 static __inline void 4556 pmap_pte_attr(pt_entry_t *pte, int cache_bits) 4557 { 4558 u_int opte, npte; 4559 4560 /* 4561 * The cache mode bits are all in the low 32-bits of the 4562 * PTE, so we can just spin on updating the low 32-bits. 4563 */ 4564 do { 4565 opte = *(u_int *)pte; 4566 npte = opte & ~PG_PTE_CACHE; 4567 npte |= cache_bits; 4568 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 4569 } 4570 4571 /* Adjust the cache mode for a 2MB page mapped via a PDE. */ 4572 static __inline void 4573 pmap_pde_attr(pd_entry_t *pde, int cache_bits) 4574 { 4575 u_int opde, npde; 4576 4577 /* 4578 * The cache mode bits are all in the low 32-bits of the 4579 * PDE, so we can just spin on updating the low 32-bits. 4580 */ 4581 do { 4582 opde = *(u_int *)pde; 4583 npde = opde & ~PG_PDE_CACHE; 4584 npde |= cache_bits; 4585 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 4586 } 4587 4588 /* 4589 * Map a set of physical memory pages into the kernel virtual 4590 * address space. Return a pointer to where it is mapped. This 4591 * routine is intended to be used for mapping device memory, 4592 * NOT real memory. 4593 */ 4594 void * 4595 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 4596 { 4597 vm_offset_t va, offset; 4598 vm_size_t tmpsize; 4599 4600 /* 4601 * If the specified range of physical addresses fits within the direct 4602 * map window, use the direct map. 
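	 * In that case no new kernel virtual address space is allocated;
	 * the existing direct map pages simply have their memory attributes
	 * changed to the requested mode.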
4603 */ 4604 if (pa < dmaplimit && pa + size < dmaplimit) { 4605 va = PHYS_TO_DMAP(pa); 4606 if (!pmap_change_attr(va, size, mode)) 4607 return ((void *)va); 4608 } 4609 offset = pa & PAGE_MASK; 4610 size = roundup(offset + size, PAGE_SIZE); 4611 va = kmem_alloc_nofault(kernel_map, size); 4612 if (!va) 4613 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 4614 pa = trunc_page(pa); 4615 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 4616 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 4617 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 4618 pmap_invalidate_cache_range(va, va + tmpsize); 4619 return ((void *)(va + offset)); 4620 } 4621 4622 void * 4623 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 4624 { 4625 4626 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 4627 } 4628 4629 void * 4630 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4631 { 4632 4633 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 4634 } 4635 4636 void 4637 pmap_unmapdev(vm_offset_t va, vm_size_t size) 4638 { 4639 vm_offset_t base, offset, tmpva; 4640 4641 /* If we gave a direct map region in pmap_mapdev, do nothing */ 4642 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 4643 return; 4644 base = trunc_page(va); 4645 offset = va & PAGE_MASK; 4646 size = roundup(offset + size, PAGE_SIZE); 4647 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) 4648 pmap_kremove(tmpva); 4649 pmap_invalidate_range(kernel_pmap, va, tmpva); 4650 kmem_free(kernel_map, base, size); 4651 } 4652 4653 /* 4654 * Tries to demote a 1GB page mapping. 4655 */ 4656 static boolean_t 4657 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 4658 { 4659 pdp_entry_t newpdpe, oldpdpe; 4660 pd_entry_t *firstpde, newpde, *pde; 4661 vm_paddr_t mpdepa; 4662 vm_page_t mpde; 4663 4664 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4665 oldpdpe = *pdpe; 4666 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 4667 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 4668 if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 4669 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 4670 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 4671 " in pmap %p", va, pmap); 4672 return (FALSE); 4673 } 4674 mpdepa = VM_PAGE_TO_PHYS(mpde); 4675 firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa); 4676 newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 4677 KASSERT((oldpdpe & PG_A) != 0, 4678 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 4679 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 4680 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 4681 newpde = oldpdpe; 4682 4683 /* 4684 * Initialize the page directory page. 4685 */ 4686 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 4687 *pde = newpde; 4688 newpde += NBPDR; 4689 } 4690 4691 /* 4692 * Demote the mapping. 4693 */ 4694 *pdpe = newpdpe; 4695 4696 /* 4697 * Invalidate a stale recursive mapping of the page directory page. 4698 */ 4699 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 4700 4701 pmap_pdpe_demotions++; 4702 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 4703 " in pmap %p", va, pmap); 4704 return (TRUE); 4705 } 4706 4707 /* 4708 * Sets the memory attribute for the specified page. 4709 */ 4710 void 4711 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4712 { 4713 4714 m->md.pat_mode = ma; 4715 4716 /* 4717 * If "m" is a normal page, update its direct mapping. This update 4718 * can be relied upon to perform any cache operations that are 4719 * required for data coherence. 
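	 * Fictitious pages are skipped below; only a normal page has a
	 * direct mapping whose attributes must be kept in sync.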
4720 */ 4721 if ((m->flags & PG_FICTITIOUS) == 0 && 4722 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 4723 m->md.pat_mode)) 4724 panic("memory attribute change on the direct map failed"); 4725 } 4726 4727 /* 4728 * Changes the specified virtual address range's memory type to that given by 4729 * the parameter "mode". The specified virtual address range must be 4730 * completely contained within either the direct map or the kernel map. If 4731 * the virtual address range is contained within the kernel map, then the 4732 * memory type for each of the corresponding ranges of the direct map is also 4733 * changed. (The corresponding ranges of the direct map are those ranges that 4734 * map the same physical pages as the specified virtual address range.) These 4735 * changes to the direct map are necessary because Intel describes the 4736 * behavior of their processors as "undefined" if two or more mappings to the 4737 * same physical page have different memory types. 4738 * 4739 * Returns zero if the change completed successfully, and either EINVAL or 4740 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 4741 * of the virtual address range was not mapped, and ENOMEM is returned if 4742 * there was insufficient memory available to complete the change. In the 4743 * latter case, the memory type may have been changed on some part of the 4744 * virtual address range or the direct map. 4745 */ 4746 int 4747 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 4748 { 4749 int error; 4750 4751 PMAP_LOCK(kernel_pmap); 4752 error = pmap_change_attr_locked(va, size, mode); 4753 PMAP_UNLOCK(kernel_pmap); 4754 return (error); 4755 } 4756 4757 static int 4758 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 4759 { 4760 vm_offset_t base, offset, tmpva; 4761 vm_paddr_t pa_start, pa_end; 4762 pdp_entry_t *pdpe; 4763 pd_entry_t *pde; 4764 pt_entry_t *pte; 4765 int cache_bits_pte, cache_bits_pde, error; 4766 boolean_t changed; 4767 4768 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 4769 base = trunc_page(va); 4770 offset = va & PAGE_MASK; 4771 size = roundup(offset + size, PAGE_SIZE); 4772 4773 /* 4774 * Only supported on kernel virtual addresses, including the direct 4775 * map but excluding the recursive map. 4776 */ 4777 if (base < DMAP_MIN_ADDRESS) 4778 return (EINVAL); 4779 4780 cache_bits_pde = pmap_cache_bits(mode, 1); 4781 cache_bits_pte = pmap_cache_bits(mode, 0); 4782 changed = FALSE; 4783 4784 /* 4785 * Pages that aren't mapped aren't supported. Also break down 2MB pages 4786 * into 4KB pages if required. 4787 */ 4788 for (tmpva = base; tmpva < base + size; ) { 4789 pdpe = pmap_pdpe(kernel_pmap, tmpva); 4790 if (*pdpe == 0) 4791 return (EINVAL); 4792 if (*pdpe & PG_PS) { 4793 /* 4794 * If the current 1GB page already has the required 4795 * memory type, then we need not demote this page. Just 4796 * increment tmpva to the next 1GB page frame. 4797 */ 4798 if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) { 4799 tmpva = trunc_1gpage(tmpva) + NBPDP; 4800 continue; 4801 } 4802 4803 /* 4804 * If the current offset aligns with a 1GB page frame 4805 * and there is at least 1GB left within the range, then 4806 * we need not break down this page into 2MB pages. 
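		 * For example, when "tmpva" is 1GB aligned and the remaining
		 * range covers the entire 1GB page, the mapping is left
		 * intact here and its attributes are updated by the second
		 * pass below.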
4807 */ 4808 if ((tmpva & PDPMASK) == 0 && 4809 tmpva + PDPMASK < base + size) { 4810 tmpva += NBPDP; 4811 continue; 4812 } 4813 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 4814 return (ENOMEM); 4815 } 4816 pde = pmap_pdpe_to_pde(pdpe, tmpva); 4817 if (*pde == 0) 4818 return (EINVAL); 4819 if (*pde & PG_PS) { 4820 /* 4821 * If the current 2MB page already has the required 4822 * memory type, then we need not demote this page. Just 4823 * increment tmpva to the next 2MB page frame. 4824 */ 4825 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 4826 tmpva = trunc_2mpage(tmpva) + NBPDR; 4827 continue; 4828 } 4829 4830 /* 4831 * If the current offset aligns with a 2MB page frame 4832 * and there is at least 2MB left within the range, then 4833 * we need not break down this page into 4KB pages. 4834 */ 4835 if ((tmpva & PDRMASK) == 0 && 4836 tmpva + PDRMASK < base + size) { 4837 tmpva += NBPDR; 4838 continue; 4839 } 4840 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 4841 return (ENOMEM); 4842 } 4843 pte = pmap_pde_to_pte(pde, tmpva); 4844 if (*pte == 0) 4845 return (EINVAL); 4846 tmpva += PAGE_SIZE; 4847 } 4848 error = 0; 4849 4850 /* 4851 * Ok, all the pages exist, so run through them updating their 4852 * cache mode if required. 4853 */ 4854 pa_start = pa_end = 0; 4855 for (tmpva = base; tmpva < base + size; ) { 4856 pdpe = pmap_pdpe(kernel_pmap, tmpva); 4857 if (*pdpe & PG_PS) { 4858 if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) { 4859 pmap_pde_attr(pdpe, cache_bits_pde); 4860 changed = TRUE; 4861 } 4862 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 4863 if (pa_start == pa_end) { 4864 /* Start physical address run. */ 4865 pa_start = *pdpe & PG_PS_FRAME; 4866 pa_end = pa_start + NBPDP; 4867 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 4868 pa_end += NBPDP; 4869 else { 4870 /* Run ended, update direct map. */ 4871 error = pmap_change_attr_locked( 4872 PHYS_TO_DMAP(pa_start), 4873 pa_end - pa_start, mode); 4874 if (error != 0) 4875 break; 4876 /* Start physical address run. */ 4877 pa_start = *pdpe & PG_PS_FRAME; 4878 pa_end = pa_start + NBPDP; 4879 } 4880 } 4881 tmpva = trunc_1gpage(tmpva) + NBPDP; 4882 continue; 4883 } 4884 pde = pmap_pdpe_to_pde(pdpe, tmpva); 4885 if (*pde & PG_PS) { 4886 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 4887 pmap_pde_attr(pde, cache_bits_pde); 4888 changed = TRUE; 4889 } 4890 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 4891 if (pa_start == pa_end) { 4892 /* Start physical address run. */ 4893 pa_start = *pde & PG_PS_FRAME; 4894 pa_end = pa_start + NBPDR; 4895 } else if (pa_end == (*pde & PG_PS_FRAME)) 4896 pa_end += NBPDR; 4897 else { 4898 /* Run ended, update direct map. */ 4899 error = pmap_change_attr_locked( 4900 PHYS_TO_DMAP(pa_start), 4901 pa_end - pa_start, mode); 4902 if (error != 0) 4903 break; 4904 /* Start physical address run. */ 4905 pa_start = *pde & PG_PS_FRAME; 4906 pa_end = pa_start + NBPDR; 4907 } 4908 } 4909 tmpva = trunc_2mpage(tmpva) + NBPDR; 4910 } else { 4911 pte = pmap_pde_to_pte(pde, tmpva); 4912 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 4913 pmap_pte_attr(pte, cache_bits_pte); 4914 changed = TRUE; 4915 } 4916 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 4917 if (pa_start == pa_end) { 4918 /* Start physical address run. */ 4919 pa_start = *pte & PG_FRAME; 4920 pa_end = pa_start + PAGE_SIZE; 4921 } else if (pa_end == (*pte & PG_FRAME)) 4922 pa_end += PAGE_SIZE; 4923 else { 4924 /* Run ended, update direct map. 
*/ 4925 error = pmap_change_attr_locked( 4926 PHYS_TO_DMAP(pa_start), 4927 pa_end - pa_start, mode); 4928 if (error != 0) 4929 break; 4930 /* Start physical address run. */ 4931 pa_start = *pte & PG_FRAME; 4932 pa_end = pa_start + PAGE_SIZE; 4933 } 4934 } 4935 tmpva += PAGE_SIZE; 4936 } 4937 } 4938 if (error == 0 && pa_start != pa_end) 4939 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 4940 pa_end - pa_start, mode); 4941 4942 /* 4943 * Flush CPU caches if required to make sure any data isn't cached that 4944 * shouldn't be, etc. 4945 */ 4946 if (changed) { 4947 pmap_invalidate_range(kernel_pmap, base, tmpva); 4948 pmap_invalidate_cache_range(base, tmpva); 4949 } 4950 return (error); 4951 } 4952 4953 /* 4954 * Demotes any mapping within the direct map region that covers more than the 4955 * specified range of physical addresses. This range's size must be a power 4956 * of two and its starting address must be a multiple of its size. Since the 4957 * demotion does not change any attributes of the mapping, a TLB invalidation 4958 * is not mandatory. The caller may, however, request a TLB invalidation. 4959 */ 4960 void 4961 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 4962 { 4963 pdp_entry_t *pdpe; 4964 pd_entry_t *pde; 4965 vm_offset_t va; 4966 boolean_t changed; 4967 4968 if (len == 0) 4969 return; 4970 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 4971 KASSERT((base & (len - 1)) == 0, 4972 ("pmap_demote_DMAP: base is not a multiple of len")); 4973 if (len < NBPDP && base < dmaplimit) { 4974 va = PHYS_TO_DMAP(base); 4975 changed = FALSE; 4976 PMAP_LOCK(kernel_pmap); 4977 pdpe = pmap_pdpe(kernel_pmap, va); 4978 if ((*pdpe & PG_V) == 0) 4979 panic("pmap_demote_DMAP: invalid PDPE"); 4980 if ((*pdpe & PG_PS) != 0) { 4981 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 4982 panic("pmap_demote_DMAP: PDPE failed"); 4983 changed = TRUE; 4984 } 4985 if (len < NBPDR) { 4986 pde = pmap_pdpe_to_pde(pdpe, va); 4987 if ((*pde & PG_V) == 0) 4988 panic("pmap_demote_DMAP: invalid PDE"); 4989 if ((*pde & PG_PS) != 0) { 4990 if (!pmap_demote_pde(kernel_pmap, pde, va)) 4991 panic("pmap_demote_DMAP: PDE failed"); 4992 changed = TRUE; 4993 } 4994 } 4995 if (changed && invalidate) 4996 pmap_invalidate_page(kernel_pmap, va); 4997 PMAP_UNLOCK(kernel_pmap); 4998 } 4999 } 5000 5001 /* 5002 * perform the pmap work for mincore 5003 */ 5004 int 5005 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5006 { 5007 pd_entry_t *pdep; 5008 pt_entry_t pte; 5009 vm_paddr_t pa; 5010 int val; 5011 5012 PMAP_LOCK(pmap); 5013 retry: 5014 pdep = pmap_pde(pmap, addr); 5015 if (pdep != NULL && (*pdep & PG_V)) { 5016 if (*pdep & PG_PS) { 5017 pte = *pdep; 5018 /* Compute the physical address of the 4KB page. 
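			 * The PDE's 2MB frame is combined with addr's offset
			 * within the 2MB region; the final PG_FRAME mask
			 * rounds the result down to the 4KB frame that
			 * contains addr.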
*/ 5019 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 5020 PG_FRAME; 5021 val = MINCORE_SUPER; 5022 } else { 5023 pte = *pmap_pde_to_pte(pdep, addr); 5024 pa = pte & PG_FRAME; 5025 val = 0; 5026 } 5027 } else { 5028 pte = 0; 5029 pa = 0; 5030 val = 0; 5031 } 5032 if ((pte & PG_V) != 0) { 5033 val |= MINCORE_INCORE; 5034 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5035 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5036 if ((pte & PG_A) != 0) 5037 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5038 } 5039 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5040 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5041 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5042 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 5043 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5044 goto retry; 5045 } else 5046 PA_UNLOCK_COND(*locked_pa); 5047 PMAP_UNLOCK(pmap); 5048 return (val); 5049 } 5050 5051 void 5052 pmap_activate(struct thread *td) 5053 { 5054 pmap_t pmap, oldpmap; 5055 u_int64_t cr3; 5056 5057 critical_enter(); 5058 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5059 oldpmap = PCPU_GET(curpmap); 5060 #ifdef SMP 5061 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); 5062 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); 5063 #else 5064 oldpmap->pm_active &= ~PCPU_GET(cpumask); 5065 pmap->pm_active |= PCPU_GET(cpumask); 5066 #endif 5067 cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4); 5068 td->td_pcb->pcb_cr3 = cr3; 5069 load_cr3(cr3); 5070 PCPU_SET(curpmap, pmap); 5071 critical_exit(); 5072 } 5073 5074 void 5075 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5076 { 5077 } 5078 5079 /* 5080 * Increase the starting virtual address of the given mapping if a 5081 * different alignment might result in more superpage mappings. 5082 */ 5083 void 5084 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5085 vm_offset_t *addr, vm_size_t size) 5086 { 5087 vm_offset_t superpage_offset; 5088 5089 if (size < NBPDR) 5090 return; 5091 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5092 offset += ptoa(object->pg_color); 5093 superpage_offset = offset & PDRMASK; 5094 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5095 (*addr & PDRMASK) == superpage_offset) 5096 return; 5097 if ((*addr & PDRMASK) < superpage_offset) 5098 *addr = (*addr & ~PDRMASK) + superpage_offset; 5099 else 5100 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5101 } 5102
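
/*
 * Example (illustrative sketch only; "bar_pa", "bar_size", and the choice
 * of PAT_WRITE_COMBINING are assumptions, not taken from this file): a
 * typical driver-side use of the device mapping interfaces defined above.
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev_attr(bar_pa, bar_size, PAT_WRITE_COMBINING);
 *	... access the device registers through "regs" ...
 *	pmap_unmapdev((vm_offset_t)regs, bar_size);
 *
 * pmap_mapdev() and pmap_mapbios() are shorthands for the uncacheable
 * and write-back cases, respectively.
 */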