/*
 * (MPSAFE)
 *
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2003 Peter Wemm
 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 2008, 2009 The DragonFly Project.
 * Copyright (c) 2008, 2009 Jordan Gordeev.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from:   @(#)pmap.c	7.7 (Berkeley)	5/12/91
 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
 */

/*
 * Manages physical address maps.
 *
 * In addition to hardware address maps, this module is called upon to
 * provide software-use-only maps which may or may not be stored in the
 * same form as hardware maps.  These pseudo-maps are used to store
 * intermediate results from copy operations to and from address spaces.
 *
 * Since the information managed by this module is also stored by the
 * logical address mapping module, this module may throw away valid
 * virtual-to-physical mappings at almost any time.  However,
 * invalidations of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which make
 * virtual-to-physical map invalidates expensive, this module may delay
 * invalidate or reduced protection operations until such time as they
 * are actually necessary.  This module is given full information as to
 * which processors are currently using which maps, and to when physical
 * maps must be made correct.
 */

#if JG
#include "opt_disable_pse.h"
#include "opt_pmap.h"
#endif
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>

#include <sys/user.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine_base/apic/apicreg.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <ddb/ddb.h>

#define PMAP_KEEP_PDIRS
#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

/*
 * Get PDEs and PTEs for user/kernel address space
 */
static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va);
#define pdir_pde(m, v)	(m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pte)		((*(pd_entry_t *)pte & PG_V) != 0)
#define pmap_pte_w(pte)		((*(pt_entry_t *)pte & PG_W) != 0)
#define pmap_pte_m(pte)		((*(pt_entry_t *)pte & PG_M) != 0)
#define pmap_pte_u(pte)		((*(pt_entry_t *)pte & PG_A) != 0)
#define pmap_pte_v(pte)		((*(pt_entry_t *)pte & PG_V) != 0)


/*
 * Given a map and a machine independent protection code,
 * convert to a vax protection code.
 */
#define pte_prot(m, p)		\
	(protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
static int protection_codes[8];

struct pmap kernel_pmap;
static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);

vm_paddr_t avail_start;		/* PA of first available physical page */
vm_paddr_t avail_end;		/* PA of last available physical page */
vm_offset_t virtual2_start;	/* cutout free area prior to kernel start */
vm_offset_t virtual2_end;
vm_offset_t virtual_start;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t KvaStart;		/* VA start of KVA space */
vm_offset_t KvaEnd;		/* VA end of KVA space (non-inclusive) */
vm_offset_t KvaSize;		/* max size of kernel virtual address space */
static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed?
						 */
static int pgeflag;		/* PG_G or-in */
static int pseflag;		/* PG_PS or-in */

static vm_object_t kptobj;

static int ndmpdp;
static vm_paddr_t dmaplimit;
static int nkpt;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;

static uint64_t KPTbase;
static uint64_t KPTphys;
static uint64_t KPDphys;	/* phys addr of kernel level 2 */
static uint64_t KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
uint64_t KPDPphys;		/* phys addr of kernel level 3 */
uint64_t KPML4phys;		/* phys addr of kernel level 4 */

static uint64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static struct vm_object pvzone_obj;
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0, *ptmmap;
caddr_t CADDR1 = 0, ptvmmap = 0;
static pt_entry_t *msgbufmap;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

extern pt_entry_t *SMPpt;
extern uint64_t SMPptpa;

#define DISABLE_PSE

static pv_entry_t get_pv_entry (void);
static void i386_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
static int  pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
				vm_offset_t sva, pmap_inval_info_t info);
static void pmap_remove_page (struct pmap *pmap,
				vm_offset_t va, pmap_inval_info_t info);
static int  pmap_remove_entry (struct pmap *pmap, vm_page_t m,
				vm_offset_t va, pmap_inval_info_t info);
static boolean_t pmap_testbit (vm_page_t m, int bit);
static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
				vm_page_t mpte, vm_page_t m);

static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);

static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va);
static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
				pmap_inval_info_t info);
static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

static unsigned pdir4mb;

/*
 * Move the kernel virtual free pointer to the next 2MB boundary.  This is
 * used to help improve performance by using a large (2MB) page for much of
 * the kernel (.text, .data, .bss).
 */
static
vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return newaddr;
}

/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many
 *	of the pv list scans are across different pmaps and it is very
 *	wasteful to do an entire invltlb when checking a single mapping.
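 *
 *	(On x86_64 this currently just falls through to pmap_pte(), as the
 *	function body below shows.)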
 *
 *	Should only be called while in a critical section.
 */
static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);

static
pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	return pmap_pte(pmap, va);
}

/* Return a non-clipped PD index for a given VA */
static __inline
vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return va >> PDRSHIFT;
}

/* Return various clipped indexes for a given VA */
static __inline
vm_pindex_t
pmap_pte_index(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pde_index(vm_offset_t va)
{
	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{
	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{
	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline
pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline
pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline
pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;

	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return NULL;
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline
pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline
pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return NULL;
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline
pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return NULL;
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

static __inline
pt_entry_t *
vtopte(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static __inline
pd_entry_t *
vtopde(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
				  NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

static uint64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	uint64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

static
void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i;

	/* we are running (mostly) V=P at this point */

	/* Allocate pages */
	KPTbase = allocpages(firstaddr, NKPT);
	KPTphys = allocpages(firstaddr, NKPT);
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);

	/*
	 * Calculate the page directory base for KERNBASE,
	 * that is where we start populating the page table pages.
	 * Basically this is the end - 2.
	 */
	KPDphys = allocpages(firstaddr, NKPDPE);
	KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);

	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	DMPDPphys = allocpages(firstaddr, NDMPML4E);
	if ((amd_feature & AMDID_PAGE1GB) == 0)
		DMPDphys = allocpages(firstaddr, ndmpdp);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/*
	 * Fill in the underlying page table pages for the area around
	 * KERNBASE.  This remaps low physical memory to KERNBASE.
	 *
	 * Read-only from zero to physfree
	 * XXX not fully used, underneath 2M pages
	 */
	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
		((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTbase)[i] |= PG_RW | PG_V | PG_G;
	}

	/*
	 * Now map the initial kernel page tables.  One block of page
	 * tables is placed at the beginning of kernel virtual memory,
	 * and another block is placed at KERNBASE to map the kernel binary,
	 * data, bss, and initial pre-allocations.
	 */
	for (i = 0; i < NKPT; i++) {
		((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V;
	}
	for (i = 0; i < NKPT; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
	}

	/*
	 * Map from zero to end of allocations using 2M pages as an
	 * optimization.  This will bypass some of the KPTBase pages
	 * above in the KERNBASE area.
	 */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
		((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V | PG_PS | PG_G;
	}

	/*
	 * And connect up the PD to the PDP.  The kernel pmap is expected
	 * to pre-populate all of its PDs.  See NKPDPE in vmparam.h.
496 */ 497 for (i = 0; i < NKPDPE; i++) { 498 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] = 499 KPDphys + (i << PAGE_SHIFT); 500 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |= 501 PG_RW | PG_V | PG_U; 502 } 503 504 /* Now set up the direct map space using either 2MB or 1GB pages */ 505 /* Preset PG_M and PG_A because demotion expects it */ 506 if ((amd_feature & AMDID_PAGE1GB) == 0) { 507 for (i = 0; i < NPDEPG * ndmpdp; i++) { 508 ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT; 509 ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | 510 PG_G | PG_M | PG_A; 511 } 512 /* And the direct map space's PDP */ 513 for (i = 0; i < ndmpdp; i++) { 514 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 515 (i << PAGE_SHIFT); 516 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; 517 } 518 } else { 519 for (i = 0; i < ndmpdp; i++) { 520 ((pdp_entry_t *)DMPDPphys)[i] = 521 (vm_paddr_t)i << PDPSHIFT; 522 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | 523 PG_G | PG_M | PG_A; 524 } 525 } 526 527 /* And recursively map PML4 to itself in order to get PTmap */ 528 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 529 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; 530 531 /* Connect the Direct Map slot up to the PML4 */ 532 ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys; 533 ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U; 534 535 /* Connect the KVA slot up to the PML4 */ 536 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; 537 ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; 538 } 539 540 void 541 init_paging(vm_paddr_t *firstaddr) 542 { 543 create_pagetables(firstaddr); 544 } 545 546 /* 547 * Bootstrap the system enough to run with virtual memory. 548 * 549 * On the i386 this is called after mapping has already been enabled 550 * and just syncs the pmap module with what has already been done. 551 * [We can't call it easily with mapping off since the kernel is not 552 * mapped with PA == VA, hence we would have to relocate every address 553 * from the linked base (virtual) address "KERNBASE" to the actual 554 * (physical) address starting relative to 0] 555 */ 556 void 557 pmap_bootstrap(vm_paddr_t *firstaddr) 558 { 559 vm_offset_t va; 560 pt_entry_t *pte; 561 struct mdglobaldata *gd; 562 int pg; 563 564 KvaStart = VM_MIN_KERNEL_ADDRESS; 565 KvaEnd = VM_MAX_KERNEL_ADDRESS; 566 KvaSize = KvaEnd - KvaStart; 567 568 avail_start = *firstaddr; 569 570 /* 571 * Create an initial set of page tables to run the kernel in. 572 */ 573 create_pagetables(firstaddr); 574 575 virtual2_start = KvaStart; 576 virtual2_end = PTOV_OFFSET; 577 578 virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr; 579 virtual_start = pmap_kmem_choose(virtual_start); 580 581 virtual_end = VM_MAX_KERNEL_ADDRESS; 582 583 /* XXX do %cr0 as well */ 584 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 585 load_cr3(KPML4phys); 586 587 /* 588 * Initialize protection array. 589 */ 590 i386_protection_init(); 591 592 /* 593 * The kernel's pmap is statically allocated so we don't have to use 594 * pmap_create, which is unlikely to work correctly at this part of 595 * the boot sequence (XXX and which no longer exists). 596 */ 597 kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); 598 kernel_pmap.pm_count = 1; 599 kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK; 600 TAILQ_INIT(&kernel_pmap.pm_pvlist); 601 nkpt = NKPT; 602 603 /* 604 * Reserve some special page table entries/VA space for temporary 605 * mapping of pages. 
606 */ 607 #define SYSMAP(c, p, v, n) \ 608 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 609 610 va = virtual_start; 611 #ifdef JG 612 pte = (pt_entry_t *) pmap_pte(&kernel_pmap, va); 613 #else 614 pte = vtopte(va); 615 #endif 616 617 /* 618 * CMAP1/CMAP2 are used for zeroing and copying pages. 619 */ 620 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 621 622 /* 623 * Crashdump maps. 624 */ 625 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 626 627 /* 628 * ptvmmap is used for reading arbitrary physical pages via 629 * /dev/mem. 630 */ 631 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 632 633 /* 634 * msgbufp is used to map the system message buffer. 635 * XXX msgbufmap is not used. 636 */ 637 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 638 atop(round_page(MSGBUF_SIZE))) 639 640 virtual_start = va; 641 642 *CMAP1 = 0; 643 644 /* 645 * PG_G is terribly broken on SMP because we IPI invltlb's in some 646 * cases rather then invl1pg. Actually, I don't even know why it 647 * works under UP because self-referential page table mappings 648 */ 649 #ifdef SMP 650 pgeflag = 0; 651 #else 652 if (cpu_feature & CPUID_PGE) 653 pgeflag = PG_G; 654 #endif 655 656 /* 657 * Initialize the 4MB page size flag 658 */ 659 pseflag = 0; 660 /* 661 * The 4MB page version of the initial 662 * kernel page mapping. 663 */ 664 pdir4mb = 0; 665 666 #if !defined(DISABLE_PSE) 667 if (cpu_feature & CPUID_PSE) { 668 pt_entry_t ptditmp; 669 /* 670 * Note that we have enabled PSE mode 671 */ 672 pseflag = PG_PS; 673 ptditmp = *(PTmap + x86_64_btop(KERNBASE)); 674 ptditmp &= ~(NBPDR - 1); 675 ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; 676 pdir4mb = ptditmp; 677 678 #ifndef SMP 679 /* 680 * Enable the PSE mode. If we are SMP we can't do this 681 * now because the APs will not be able to use it when 682 * they boot up. 683 */ 684 load_cr4(rcr4() | CR4_PSE); 685 686 /* 687 * We can do the mapping here for the single processor 688 * case. We simply ignore the old page table page from 689 * now on. 690 */ 691 /* 692 * For SMP, we still need 4K pages to bootstrap APs, 693 * PSE will be enabled as soon as all APs are up. 694 */ 695 PTD[KPTDI] = (pd_entry_t)ptditmp; 696 cpu_invltlb(); 697 #endif 698 } 699 #endif 700 #ifdef SMP 701 if (cpu_apic_address == 0) 702 panic("pmap_bootstrap: no local apic!"); 703 #endif 704 705 /* 706 * We need to finish setting up the globaldata page for the BSP. 707 * locore has already populated the page table for the mdglobaldata 708 * portion. 709 */ 710 pg = MDGLOBALDATA_BASEALLOC_PAGES; 711 gd = &CPU_prvspace[0].mdglobaldata; 712 gd->gd_CMAP1 = &SMPpt[pg + 0]; 713 gd->gd_CMAP2 = &SMPpt[pg + 1]; 714 gd->gd_CMAP3 = &SMPpt[pg + 2]; 715 gd->gd_PMAP1 = &SMPpt[pg + 3]; 716 gd->gd_CADDR1 = CPU_prvspace[0].CPAGE1; 717 gd->gd_CADDR2 = CPU_prvspace[0].CPAGE2; 718 gd->gd_CADDR3 = CPU_prvspace[0].CPAGE3; 719 gd->gd_PADDR1 = (pt_entry_t *)CPU_prvspace[0].PPAGE1; 720 721 cpu_invltlb(); 722 } 723 724 #ifdef SMP 725 /* 726 * Set 4mb pdir for mp startup 727 */ 728 void 729 pmap_set_opt(void) 730 { 731 if (pseflag && (cpu_feature & CPUID_PSE)) { 732 load_cr4(rcr4() | CR4_PSE); 733 if (pdir4mb && mycpu->gd_cpuid == 0) { /* only on BSP */ 734 cpu_invltlb(); 735 } 736 } 737 } 738 #endif 739 740 /* 741 * Initialize the pmap module. 742 * Called by vm_init, to initialize any structures that the pmap 743 * system needs to map virtual memory. 744 * pmap_init has been enhanced to support in a fairly consistant 745 * way, discontiguous physical memory. 
746 */ 747 void 748 pmap_init(void) 749 { 750 int i; 751 int initial_pvs; 752 753 /* 754 * object for kernel page table pages 755 */ 756 /* JG I think the number can be arbitrary */ 757 kptobj = vm_object_allocate(OBJT_DEFAULT, 5); 758 759 /* 760 * Allocate memory for random pmap data structures. Includes the 761 * pv_head_table. 762 */ 763 764 for(i = 0; i < vm_page_array_size; i++) { 765 vm_page_t m; 766 767 m = &vm_page_array[i]; 768 TAILQ_INIT(&m->md.pv_list); 769 m->md.pv_list_count = 0; 770 } 771 772 /* 773 * init the pv free list 774 */ 775 initial_pvs = vm_page_array_size; 776 if (initial_pvs < MINPV) 777 initial_pvs = MINPV; 778 pvzone = &pvzone_store; 779 pvinit = (struct pv_entry *) kmem_alloc(&kernel_map, 780 initial_pvs * sizeof (struct pv_entry)); 781 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, 782 initial_pvs); 783 784 /* 785 * Now it is safe to enable pv_table recording. 786 */ 787 pmap_initialized = TRUE; 788 #ifdef SMP 789 lapic = pmap_mapdev_uncacheable(cpu_apic_address, sizeof(struct LAPIC)); 790 #endif 791 } 792 793 /* 794 * Initialize the address space (zone) for the pv_entries. Set a 795 * high water mark so that the system can recover from excessive 796 * numbers of pv entries. 797 */ 798 void 799 pmap_init2(void) 800 { 801 int shpgperproc = PMAP_SHPGPERPROC; 802 803 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 804 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 805 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 806 pv_entry_high_water = 9 * (pv_entry_max / 10); 807 zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); 808 } 809 810 811 /*************************************************** 812 * Low level helper routines..... 813 ***************************************************/ 814 815 #if defined(PMAP_DIAGNOSTIC) 816 817 /* 818 * This code checks for non-writeable/modified pages. 819 * This should be an invalid condition. 820 */ 821 static 822 int 823 pmap_nw_modified(pt_entry_t pte) 824 { 825 if ((pte & (PG_M|PG_RW)) == PG_M) 826 return 1; 827 else 828 return 0; 829 } 830 #endif 831 832 833 /* 834 * this routine defines the region(s) of memory that should 835 * not be tested for the modified bit. 836 */ 837 static __inline 838 int 839 pmap_track_modified(vm_offset_t va) 840 { 841 if ((va < clean_sva) || (va >= clean_eva)) 842 return 1; 843 else 844 return 0; 845 } 846 847 /* 848 * Extract the physical page address associated with the map/VA pair. 849 * 850 * The caller must hold vm_token if non-blocking operation is desired. 851 */ 852 vm_paddr_t 853 pmap_extract(pmap_t pmap, vm_offset_t va) 854 { 855 vm_paddr_t rtval; 856 pt_entry_t *pte; 857 pd_entry_t pde, *pdep; 858 859 lwkt_gettoken(&vm_token); 860 rtval = 0; 861 pdep = pmap_pde(pmap, va); 862 if (pdep != NULL) { 863 pde = *pdep; 864 if (pde) { 865 if ((pde & PG_PS) != 0) { 866 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 867 } else { 868 pte = pmap_pde_to_pte(pdep, va); 869 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); 870 } 871 } 872 } 873 lwkt_reltoken(&vm_token); 874 return rtval; 875 } 876 877 /* 878 * Extract the physical page address associated kernel virtual address. 
879 */ 880 vm_paddr_t 881 pmap_kextract(vm_offset_t va) 882 { 883 pd_entry_t pde; 884 vm_paddr_t pa; 885 886 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 887 pa = DMAP_TO_PHYS(va); 888 } else { 889 pde = *vtopde(va); 890 if (pde & PG_PS) { 891 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 892 } else { 893 /* 894 * Beware of a concurrent promotion that changes the 895 * PDE at this point! For example, vtopte() must not 896 * be used to access the PTE because it would use the 897 * new PDE. It is, however, safe to use the old PDE 898 * because the page table page is preserved by the 899 * promotion. 900 */ 901 pa = *pmap_pde_to_pte(&pde, va); 902 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 903 } 904 } 905 return pa; 906 } 907 908 /*************************************************** 909 * Low level mapping routines..... 910 ***************************************************/ 911 912 /* 913 * Routine: pmap_kenter 914 * Function: 915 * Add a wired page to the KVA 916 * NOTE! note that in order for the mapping to take effect -- you 917 * should do an invltlb after doing the pmap_kenter(). 918 */ 919 void 920 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 921 { 922 pt_entry_t *pte; 923 pt_entry_t npte; 924 pmap_inval_info info; 925 926 pmap_inval_init(&info); 927 npte = pa | PG_RW | PG_V | pgeflag; 928 pte = vtopte(va); 929 pmap_inval_interlock(&info, &kernel_pmap, va); 930 *pte = npte; 931 pmap_inval_deinterlock(&info, &kernel_pmap); 932 pmap_inval_done(&info); 933 } 934 935 /* 936 * Routine: pmap_kenter_quick 937 * Function: 938 * Similar to pmap_kenter(), except we only invalidate the 939 * mapping on the current CPU. 940 */ 941 void 942 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 943 { 944 pt_entry_t *pte; 945 pt_entry_t npte; 946 947 npte = pa | PG_RW | PG_V | pgeflag; 948 pte = vtopte(va); 949 *pte = npte; 950 cpu_invlpg((void *)va); 951 } 952 953 void 954 pmap_kenter_sync(vm_offset_t va) 955 { 956 pmap_inval_info info; 957 958 pmap_inval_init(&info); 959 pmap_inval_interlock(&info, &kernel_pmap, va); 960 pmap_inval_deinterlock(&info, &kernel_pmap); 961 pmap_inval_done(&info); 962 } 963 964 void 965 pmap_kenter_sync_quick(vm_offset_t va) 966 { 967 cpu_invlpg((void *)va); 968 } 969 970 /* 971 * remove a page from the kernel pagetables 972 */ 973 void 974 pmap_kremove(vm_offset_t va) 975 { 976 pt_entry_t *pte; 977 pmap_inval_info info; 978 979 pmap_inval_init(&info); 980 pte = vtopte(va); 981 pmap_inval_interlock(&info, &kernel_pmap, va); 982 *pte = 0; 983 pmap_inval_deinterlock(&info, &kernel_pmap); 984 pmap_inval_done(&info); 985 } 986 987 void 988 pmap_kremove_quick(vm_offset_t va) 989 { 990 pt_entry_t *pte; 991 pte = vtopte(va); 992 *pte = 0; 993 cpu_invlpg((void *)va); 994 } 995 996 /* 997 * XXX these need to be recoded. They are not used in any critical path. 998 */ 999 void 1000 pmap_kmodify_rw(vm_offset_t va) 1001 { 1002 *vtopte(va) |= PG_RW; 1003 cpu_invlpg((void *)va); 1004 } 1005 1006 void 1007 pmap_kmodify_nc(vm_offset_t va) 1008 { 1009 *vtopte(va) |= PG_N; 1010 cpu_invlpg((void *)va); 1011 } 1012 1013 /* 1014 * Used to map a range of physical addresses into kernel 1015 * virtual address space. 1016 * 1017 * For now, VM is already on, we only need to map the 1018 * specified memory. 
 */
vm_offset_t
pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
{
	return PHYS_TO_DMAP(start);
}


/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The page *must* be wired.
 */
void
pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
{
	vm_offset_t end_va;

	end_va = va + count * PAGE_SIZE;

	while (va < end_va) {
		pt_entry_t *pte;

		pte = vtopte(va);
		*pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
		cpu_invlpg((void *)va);
		va += PAGE_SIZE;
		m++;
	}
#ifdef SMP
	smp_invltlb();	/* XXX */
#endif
}

/*
 * This routine jerks page mappings from the
 * kernel -- it is meant only for temporary mappings.
 *
 * MPSAFE, INTERRUPT SAFE (cluster callback)
 */
void
pmap_qremove(vm_offset_t va, int count)
{
	vm_offset_t end_va;

	end_va = va + count * PAGE_SIZE;

	while (va < end_va) {
		pt_entry_t *pte;

		pte = vtopte(va);
		*pte = 0;
		cpu_invlpg((void *)va);
		va += PAGE_SIZE;
	}
#ifdef SMP
	smp_invltlb();
#endif
}

/*
 * This routine works like vm_page_lookup() but also blocks as long as the
 * page is busy.  This routine does not busy the page it returns.
 *
 * Unless the caller is managing objects whose pages are in a known state,
 * the call should be made with a critical section held so the page's object
 * association remains valid on return.
 */
static
vm_page_t
pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	do {
		m = vm_page_lookup(object, pindex);
	} while (m && vm_page_sleep_busy(m, FALSE, "pplookp"));

	return(m);
}

/*
 * Create a new thread and optionally associate it with a (new) process.
 * NOTE! the new thread's cpu may not equal the current cpu.
 */
void
pmap_init_thread(thread_t td)
{
	/* enforce pcb placement */
	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	td->td_savefpu = &td->td_pcb->pcb_save;
	td->td_sp = (char *)td->td_pcb - 16; /* JG is -16 needed on x86_64? */
}

/*
 * This routine directly affects the fork perf for a process.
 */
void
pmap_init_proc(struct proc *p)
{
}

/*
 * Dispose the UPAGES for a process that has exited.
 * This routine directly impacts the exit perf of a process.
 */
void
pmap_dispose_proc(struct proc *p)
{
	KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p));
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/

/*
 * This routine unholds page table pages, and if the hold count
 * drops to zero, then it decrements the wire count.
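 *
 * When the last hold on a page table page is dropped the page is
 * unmapped from its parent level and the hold on the parent PD or
 * PDP page is released in turn, freeing the chain bottom-up.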
1141 */ 1142 static __inline 1143 int 1144 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 1145 pmap_inval_info_t info) 1146 { 1147 KKASSERT(m->hold_count > 0); 1148 if (m->hold_count > 1) { 1149 vm_page_unhold(m); 1150 return 0; 1151 } else { 1152 return _pmap_unwire_pte_hold(pmap, va, m, info); 1153 } 1154 } 1155 1156 static 1157 int 1158 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 1159 pmap_inval_info_t info) 1160 { 1161 /* 1162 * Wait until we can busy the page ourselves. We cannot have 1163 * any active flushes if we block. We own one hold count on the 1164 * page so it cannot be freed out from under us. 1165 */ 1166 if (m->flags & PG_BUSY) { 1167 pmap_inval_flush(info); 1168 while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) 1169 ; 1170 } 1171 KASSERT(m->queue == PQ_NONE, 1172 ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m)); 1173 1174 /* 1175 * This case can occur if new references were acquired while 1176 * we were blocked. 1177 */ 1178 if (m->hold_count > 1) { 1179 KKASSERT(m->hold_count > 1); 1180 vm_page_unhold(m); 1181 return 0; 1182 } 1183 1184 /* 1185 * Unmap the page table page 1186 */ 1187 KKASSERT(m->hold_count == 1); 1188 vm_page_busy(m); 1189 pmap_inval_interlock(info, pmap, -1); 1190 1191 if (m->pindex >= (NUPDE + NUPDPE)) { 1192 /* PDP page */ 1193 pml4_entry_t *pml4; 1194 pml4 = pmap_pml4e(pmap, va); 1195 *pml4 = 0; 1196 } else if (m->pindex >= NUPDE) { 1197 /* PD page */ 1198 pdp_entry_t *pdp; 1199 pdp = pmap_pdpe(pmap, va); 1200 *pdp = 0; 1201 } else { 1202 /* PT page */ 1203 pd_entry_t *pd; 1204 pd = pmap_pde(pmap, va); 1205 *pd = 0; 1206 } 1207 1208 KKASSERT(pmap->pm_stats.resident_count > 0); 1209 --pmap->pm_stats.resident_count; 1210 1211 if (pmap->pm_ptphint == m) 1212 pmap->pm_ptphint = NULL; 1213 pmap_inval_deinterlock(info, pmap); 1214 1215 if (m->pindex < NUPDE) { 1216 /* We just released a PT, unhold the matching PD */ 1217 vm_page_t pdpg; 1218 1219 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 1220 pmap_unwire_pte_hold(pmap, va, pdpg, info); 1221 } 1222 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 1223 /* We just released a PD, unhold the matching PDP */ 1224 vm_page_t pdppg; 1225 1226 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 1227 pmap_unwire_pte_hold(pmap, va, pdppg, info); 1228 } 1229 1230 /* 1231 * This was our last hold, the page had better be unwired 1232 * after we decrement wire_count. 1233 * 1234 * FUTURE NOTE: shared page directory page could result in 1235 * multiple wire counts. 1236 */ 1237 vm_page_unhold(m); 1238 --m->wire_count; 1239 KKASSERT(m->wire_count == 0); 1240 --vmstats.v_wire_count; 1241 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1242 vm_page_flash(m); 1243 vm_page_free_zero(m); 1244 1245 return 1; 1246 } 1247 1248 /* 1249 * After removing a page table entry, this routine is used to 1250 * conditionally free the page, and manage the hold/wire counts. 
1251 */ 1252 static 1253 int 1254 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte, 1255 pmap_inval_info_t info) 1256 { 1257 vm_pindex_t ptepindex; 1258 1259 if (va >= VM_MAX_USER_ADDRESS) 1260 return 0; 1261 1262 if (mpte == NULL) { 1263 ptepindex = pmap_pde_pindex(va); 1264 #if JGHINT 1265 if (pmap->pm_ptphint && 1266 (pmap->pm_ptphint->pindex == ptepindex)) { 1267 mpte = pmap->pm_ptphint; 1268 } else { 1269 #endif 1270 pmap_inval_flush(info); 1271 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1272 pmap->pm_ptphint = mpte; 1273 #if JGHINT 1274 } 1275 #endif 1276 } 1277 return pmap_unwire_pte_hold(pmap, va, mpte, info); 1278 } 1279 1280 /* 1281 * Initialize pmap0/vmspace0. This pmap is not added to pmap_list because 1282 * it, and IdlePTD, represents the template used to update all other pmaps. 1283 * 1284 * On architectures where the kernel pmap is not integrated into the user 1285 * process pmap, this pmap represents the process pmap, not the kernel pmap. 1286 * kernel_pmap should be used to directly access the kernel_pmap. 1287 */ 1288 void 1289 pmap_pinit0(struct pmap *pmap) 1290 { 1291 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 1292 pmap->pm_count = 1; 1293 pmap->pm_active = 0; 1294 pmap->pm_ptphint = NULL; 1295 TAILQ_INIT(&pmap->pm_pvlist); 1296 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1297 } 1298 1299 /* 1300 * Initialize a preallocated and zeroed pmap structure, 1301 * such as one in a vmspace structure. 1302 */ 1303 void 1304 pmap_pinit(struct pmap *pmap) 1305 { 1306 vm_page_t ptdpg; 1307 1308 /* 1309 * No need to allocate page table space yet but we do need a valid 1310 * page directory table. 1311 */ 1312 if (pmap->pm_pml4 == NULL) { 1313 pmap->pm_pml4 = 1314 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE); 1315 } 1316 1317 /* 1318 * Allocate an object for the ptes 1319 */ 1320 if (pmap->pm_pteobj == NULL) 1321 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + PML4PML4I + 1); 1322 1323 /* 1324 * Allocate the page directory page, unless we already have 1325 * one cached. If we used the cached page the wire_count will 1326 * already be set appropriately. 1327 */ 1328 if ((ptdpg = pmap->pm_pdirm) == NULL) { 1329 ptdpg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I, 1330 VM_ALLOC_NORMAL | VM_ALLOC_RETRY); 1331 pmap->pm_pdirm = ptdpg; 1332 vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); 1333 ptdpg->valid = VM_PAGE_BITS_ALL; 1334 if (ptdpg->wire_count == 0) 1335 ++vmstats.v_wire_count; 1336 ptdpg->wire_count = 1; 1337 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); 1338 } 1339 if ((ptdpg->flags & PG_ZERO) == 0) 1340 bzero(pmap->pm_pml4, PAGE_SIZE); 1341 1342 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; 1343 pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; 1344 1345 /* install self-referential address mapping entry */ 1346 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M; 1347 1348 pmap->pm_count = 1; 1349 pmap->pm_active = 0; 1350 pmap->pm_ptphint = NULL; 1351 TAILQ_INIT(&pmap->pm_pvlist); 1352 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1353 pmap->pm_stats.resident_count = 1; 1354 } 1355 1356 /* 1357 * Clean up a pmap structure so it can be physically freed. This routine 1358 * is called by the vmspace dtor function. A great deal of pmap data is 1359 * left passively mapped to improve vmspace management so we have a bit 1360 * of cleanup work to do here. 
1361 */ 1362 void 1363 pmap_puninit(pmap_t pmap) 1364 { 1365 vm_page_t p; 1366 1367 KKASSERT(pmap->pm_active == 0); 1368 lwkt_gettoken(&vm_token); 1369 if ((p = pmap->pm_pdirm) != NULL) { 1370 KKASSERT(pmap->pm_pml4 != NULL); 1371 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1372 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1373 p->wire_count--; 1374 vmstats.v_wire_count--; 1375 KKASSERT((p->flags & PG_BUSY) == 0); 1376 vm_page_busy(p); 1377 vm_page_free_zero(p); 1378 pmap->pm_pdirm = NULL; 1379 } 1380 if (pmap->pm_pml4) { 1381 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1382 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1383 pmap->pm_pml4 = NULL; 1384 } 1385 if (pmap->pm_pteobj) { 1386 vm_object_deallocate(pmap->pm_pteobj); 1387 pmap->pm_pteobj = NULL; 1388 } 1389 lwkt_reltoken(&vm_token); 1390 } 1391 1392 /* 1393 * Wire in kernel global address entries. To avoid a race condition 1394 * between pmap initialization and pmap_growkernel, this procedure 1395 * adds the pmap to the master list (which growkernel scans to update), 1396 * then copies the template. 1397 */ 1398 void 1399 pmap_pinit2(struct pmap *pmap) 1400 { 1401 crit_enter(); 1402 lwkt_gettoken(&vm_token); 1403 TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode); 1404 /* XXX copies current process, does not fill in MPPTDI */ 1405 lwkt_reltoken(&vm_token); 1406 crit_exit(); 1407 } 1408 1409 /* 1410 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1411 * 0 on failure (if the procedure had to sleep). 1412 * 1413 * When asked to remove the page directory page itself, we actually just 1414 * leave it cached so we do not have to incur the SMP inval overhead of 1415 * removing the kernel mapping. pmap_puninit() will take care of it. 1416 */ 1417 static 1418 int 1419 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1420 { 1421 /* 1422 * This code optimizes the case of freeing non-busy 1423 * page-table pages. Those pages are zero now, and 1424 * might as well be placed directly into the zero queue. 1425 */ 1426 if (vm_page_sleep_busy(p, FALSE, "pmaprl")) 1427 return 0; 1428 1429 vm_page_busy(p); 1430 1431 /* 1432 * Remove the page table page from the processes address space. 1433 */ 1434 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1435 /* 1436 * We are the pml4 table itself. 1437 */ 1438 /* XXX anything to do here? */ 1439 } else if (p->pindex >= (NUPDE + NUPDPE)) { 1440 /* 1441 * Remove a PDP page from the PML4. We do not maintain 1442 * hold counts on the PML4 page. 1443 */ 1444 pml4_entry_t *pml4; 1445 vm_page_t m4; 1446 int idx; 1447 1448 m4 = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I); 1449 KKASSERT(m4 != NULL); 1450 pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1451 idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG; 1452 KKASSERT(pml4[idx] != 0); 1453 pml4[idx] = 0; 1454 } else if (p->pindex >= NUPDE) { 1455 /* 1456 * Remove a PD page from the PDP and drop the hold count 1457 * on the PDP. The PDP is left cached in the pmap if 1458 * the hold count drops to 0 so the wire count remains 1459 * intact. 
1460 */ 1461 vm_page_t m3; 1462 pdp_entry_t *pdp; 1463 int idx; 1464 1465 m3 = vm_page_lookup(pmap->pm_pteobj, 1466 NUPDE + NUPDPE + (p->pindex - NUPDE) / NPDPEPG); 1467 KKASSERT(m3 != NULL); 1468 pdp = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); 1469 idx = (p->pindex - NUPDE) % NPDPEPG; 1470 KKASSERT(pdp[idx] != 0); 1471 pdp[idx] = 0; 1472 m3->hold_count--; 1473 } else { 1474 /* 1475 * Remove a PT page from the PD and drop the hold count 1476 * on the PD. The PD is left cached in the pmap if 1477 * the hold count drops to 0 so the wire count remains 1478 * intact. 1479 */ 1480 vm_page_t m2; 1481 pd_entry_t *pd; 1482 int idx; 1483 1484 m2 = vm_page_lookup(pmap->pm_pteobj, 1485 NUPDE + p->pindex / NPDEPG); 1486 KKASSERT(m2 != NULL); 1487 pd = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); 1488 idx = p->pindex % NPDEPG; 1489 pd[idx] = 0; 1490 m2->hold_count--; 1491 } 1492 1493 /* 1494 * One fewer mappings in the pmap. p's hold count had better 1495 * be zero. 1496 */ 1497 KKASSERT(pmap->pm_stats.resident_count > 0); 1498 --pmap->pm_stats.resident_count; 1499 if (p->hold_count) 1500 panic("pmap_release: freeing held page table page"); 1501 if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) 1502 pmap->pm_ptphint = NULL; 1503 1504 /* 1505 * We leave the top-level page table page cached, wired, and mapped in 1506 * the pmap until the dtor function (pmap_puninit()) gets called. 1507 * However, still clean it up so we can set PG_ZERO. 1508 */ 1509 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1510 bzero(pmap->pm_pml4, PAGE_SIZE); 1511 vm_page_flag_set(p, PG_ZERO); 1512 vm_page_wakeup(p); 1513 } else { 1514 p->wire_count--; 1515 KKASSERT(p->wire_count == 0); 1516 vmstats.v_wire_count--; 1517 /* JG eventually revert to using vm_page_free_zero() */ 1518 vm_page_free(p); 1519 } 1520 return 1; 1521 } 1522 1523 /* 1524 * This routine is called when various levels in the page table need to 1525 * be populated. This routine cannot fail. 1526 */ 1527 static 1528 vm_page_t 1529 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) 1530 { 1531 vm_page_t m; 1532 1533 /* 1534 * Find or fabricate a new pagetable page. This will busy the page. 1535 */ 1536 m = vm_page_grab(pmap->pm_pteobj, ptepindex, 1537 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); 1538 if ((m->flags & PG_ZERO) == 0) { 1539 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 1540 } 1541 1542 KASSERT(m->queue == PQ_NONE, 1543 ("_pmap_allocpte: %p->queue != PQ_NONE", m)); 1544 1545 /* 1546 * Increment the hold count for the page we will be returning to 1547 * the caller. 1548 */ 1549 m->hold_count++; 1550 if (m->wire_count++ == 0) 1551 vmstats.v_wire_count++; 1552 1553 /* 1554 * Map the pagetable page into the process address space, if 1555 * it isn't already there. 1556 * 1557 * It is possible that someone else got in and mapped the page 1558 * directory page while we were blocked, if so just unbusy and 1559 * return the held page. 
1560 */ 1561 if (ptepindex >= (NUPDE + NUPDPE)) { 1562 /* 1563 * Wire up a new PDP page in the PML4 1564 */ 1565 vm_pindex_t pml4index; 1566 pml4_entry_t *pml4; 1567 1568 pml4index = ptepindex - (NUPDE + NUPDPE); 1569 pml4 = &pmap->pm_pml4[pml4index]; 1570 if (*pml4 & PG_V) { 1571 if (--m->wire_count == 0) 1572 --vmstats.v_wire_count; 1573 vm_page_wakeup(m); 1574 return(m); 1575 } 1576 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1577 } else if (ptepindex >= NUPDE) { 1578 /* 1579 * Wire up a new PD page in the PDP 1580 */ 1581 vm_pindex_t pml4index; 1582 vm_pindex_t pdpindex; 1583 vm_page_t pdppg; 1584 pml4_entry_t *pml4; 1585 pdp_entry_t *pdp; 1586 1587 pdpindex = ptepindex - NUPDE; 1588 pml4index = pdpindex >> NPML4EPGSHIFT; 1589 1590 pml4 = &pmap->pm_pml4[pml4index]; 1591 if ((*pml4 & PG_V) == 0) { 1592 /* 1593 * Have to allocate a new PDP page, recurse. 1594 * This always succeeds. Returned page will 1595 * be held. 1596 */ 1597 pdppg = _pmap_allocpte(pmap, 1598 NUPDE + NUPDPE + pml4index); 1599 } else { 1600 /* 1601 * Add a held reference to the PDP page. 1602 */ 1603 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 1604 pdppg->hold_count++; 1605 } 1606 1607 /* 1608 * Now find the pdp_entry and map the PDP. If the PDP 1609 * has already been mapped unwind and return the 1610 * already-mapped PDP held. 1611 * 1612 * pdppg is left held (hold_count is incremented for 1613 * each PD in the PDP). 1614 */ 1615 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1616 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1617 if (*pdp & PG_V) { 1618 vm_page_unhold(pdppg); 1619 if (--m->wire_count == 0) 1620 --vmstats.v_wire_count; 1621 vm_page_wakeup(m); 1622 return(m); 1623 } 1624 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1625 } else { 1626 /* 1627 * Wire up the new PT page in the PD 1628 */ 1629 vm_pindex_t pml4index; 1630 vm_pindex_t pdpindex; 1631 pml4_entry_t *pml4; 1632 pdp_entry_t *pdp; 1633 pd_entry_t *pd; 1634 vm_page_t pdpg; 1635 1636 pdpindex = ptepindex >> NPDPEPGSHIFT; 1637 pml4index = pdpindex >> NPML4EPGSHIFT; 1638 1639 /* 1640 * Locate the PDP page in the PML4, then the PD page in 1641 * the PDP. If either does not exist we simply recurse 1642 * to allocate them. 1643 * 1644 * We can just recurse on the PD page as it will recurse 1645 * on the PDP if necessary. 1646 */ 1647 pml4 = &pmap->pm_pml4[pml4index]; 1648 if ((*pml4 & PG_V) == 0) { 1649 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex); 1650 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1651 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1652 } else { 1653 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1654 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1655 if ((*pdp & PG_V) == 0) { 1656 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex); 1657 } else { 1658 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 1659 pdpg->hold_count++; 1660 } 1661 } 1662 1663 /* 1664 * Now fill in the pte in the PD. If the pte already exists 1665 * (again, if we raced the grab), unhold pdpg and unwire 1666 * m, returning a held m. 1667 * 1668 * pdpg is left held (hold_count is incremented for 1669 * each PT in the PD). 
1670 */ 1671 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 1672 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 1673 if (*pd != 0) { 1674 vm_page_unhold(pdpg); 1675 if (--m->wire_count == 0) 1676 --vmstats.v_wire_count; 1677 vm_page_wakeup(m); 1678 return(m); 1679 } 1680 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1681 } 1682 1683 /* 1684 * We successfully loaded a PDP, PD, or PTE. Set the page table hint, 1685 * valid bits, mapped flag, unbusy, and we're done. 1686 */ 1687 pmap->pm_ptphint = m; 1688 ++pmap->pm_stats.resident_count; 1689 1690 m->valid = VM_PAGE_BITS_ALL; 1691 vm_page_flag_clear(m, PG_ZERO); 1692 vm_page_flag_set(m, PG_MAPPED); 1693 vm_page_wakeup(m); 1694 1695 return (m); 1696 } 1697 1698 static 1699 vm_page_t 1700 pmap_allocpte(pmap_t pmap, vm_offset_t va) 1701 { 1702 vm_pindex_t ptepindex; 1703 pd_entry_t *pd; 1704 vm_page_t m; 1705 1706 /* 1707 * Calculate pagetable page index 1708 */ 1709 ptepindex = pmap_pde_pindex(va); 1710 1711 /* 1712 * Get the page directory entry 1713 */ 1714 pd = pmap_pde(pmap, va); 1715 1716 /* 1717 * This supports switching from a 2MB page to a 1718 * normal 4K page. 1719 */ 1720 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 1721 panic("no promotion/demotion yet"); 1722 *pd = 0; 1723 pd = NULL; 1724 cpu_invltlb(); 1725 smp_invltlb(); 1726 } 1727 1728 /* 1729 * If the page table page is mapped, we just increment the 1730 * hold count, and activate it. 1731 */ 1732 if (pd != NULL && (*pd & PG_V) != 0) { 1733 /* YYY hint is used here on i386 */ 1734 m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); 1735 pmap->pm_ptphint = m; 1736 m->hold_count++; 1737 return m; 1738 } 1739 /* 1740 * Here if the pte page isn't mapped, or if it has been deallocated. 1741 */ 1742 return _pmap_allocpte(pmap, ptepindex); 1743 } 1744 1745 1746 /*************************************************** 1747 * Pmap allocation/deallocation routines. 1748 ***************************************************/ 1749 1750 /* 1751 * Release any resources held by the given physical map. 1752 * Called when a pmap initialized by pmap_pinit is being released. 1753 * Should only be called if the map contains no valid mappings. 1754 */ 1755 static int pmap_release_callback(struct vm_page *p, void *data); 1756 1757 void 1758 pmap_release(struct pmap *pmap) 1759 { 1760 vm_object_t object = pmap->pm_pteobj; 1761 struct rb_vm_page_scan_info info; 1762 1763 KASSERT(pmap->pm_active == 0, ("pmap still active! 
%08x", pmap->pm_active)); 1764 #if defined(DIAGNOSTIC) 1765 if (object->ref_count != 1) 1766 panic("pmap_release: pteobj reference count != 1"); 1767 #endif 1768 1769 info.pmap = pmap; 1770 info.object = object; 1771 crit_enter(); 1772 lwkt_gettoken(&vm_token); 1773 TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode); 1774 crit_exit(); 1775 1776 do { 1777 crit_enter(); 1778 info.error = 0; 1779 info.mpte = NULL; 1780 info.limit = object->generation; 1781 1782 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 1783 pmap_release_callback, &info); 1784 if (info.error == 0 && info.mpte) { 1785 if (!pmap_release_free_page(pmap, info.mpte)) 1786 info.error = 1; 1787 } 1788 crit_exit(); 1789 } while (info.error); 1790 lwkt_reltoken(&vm_token); 1791 } 1792 1793 static 1794 int 1795 pmap_release_callback(struct vm_page *p, void *data) 1796 { 1797 struct rb_vm_page_scan_info *info = data; 1798 1799 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1800 info->mpte = p; 1801 return(0); 1802 } 1803 if (!pmap_release_free_page(info->pmap, p)) { 1804 info->error = 1; 1805 return(-1); 1806 } 1807 if (info->object->generation != info->limit) { 1808 info->error = 1; 1809 return(-1); 1810 } 1811 return(0); 1812 } 1813 1814 /* 1815 * Grow the number of kernel page table entries, if needed. 1816 */ 1817 void 1818 pmap_growkernel(vm_offset_t addr) 1819 { 1820 vm_paddr_t paddr; 1821 vm_offset_t ptppaddr; 1822 vm_page_t nkpg; 1823 pd_entry_t *pde, newpdir; 1824 pdp_entry_t newpdp; 1825 1826 crit_enter(); 1827 lwkt_gettoken(&vm_token); 1828 if (kernel_vm_end == 0) { 1829 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 1830 nkpt = 0; 1831 while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & PG_V) != 0) { 1832 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1833 nkpt++; 1834 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1835 kernel_vm_end = kernel_map.max_offset; 1836 break; 1837 } 1838 } 1839 } 1840 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1841 if (addr - 1 >= kernel_map.max_offset) 1842 addr = kernel_map.max_offset; 1843 while (kernel_vm_end < addr) { 1844 pde = pmap_pde(&kernel_pmap, kernel_vm_end); 1845 if (pde == NULL) { 1846 /* We need a new PDP entry */ 1847 nkpg = vm_page_alloc(kptobj, nkpt, 1848 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM 1849 | VM_ALLOC_INTERRUPT); 1850 if (nkpg == NULL) 1851 panic("pmap_growkernel: no memory to grow kernel"); 1852 paddr = VM_PAGE_TO_PHYS(nkpg); 1853 if ((nkpg->flags & PG_ZERO) == 0) 1854 pmap_zero_page(paddr); 1855 vm_page_flag_clear(nkpg, PG_ZERO); 1856 newpdp = (pdp_entry_t) 1857 (paddr | PG_V | PG_RW | PG_A | PG_M); 1858 *pmap_pdpe(&kernel_pmap, kernel_vm_end) = newpdp; 1859 nkpt++; 1860 continue; /* try again */ 1861 } 1862 if ((*pde & PG_V) != 0) { 1863 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1864 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1865 kernel_vm_end = kernel_map.max_offset; 1866 break; 1867 } 1868 continue; 1869 } 1870 1871 /* 1872 * This index is bogus, but out of the way 1873 */ 1874 nkpg = vm_page_alloc(kptobj, nkpt, 1875 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT); 1876 if (nkpg == NULL) 1877 panic("pmap_growkernel: no memory to grow kernel"); 1878 1879 vm_page_wire(nkpg); 1880 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1881 pmap_zero_page(ptppaddr); 1882 vm_page_flag_clear(nkpg, PG_ZERO); 1883 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 1884 *pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir; 1885 nkpt++; 1886 1887 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * 
				NPTEPG - 1);
		if (kernel_vm_end - 1 >= kernel_map.max_offset) {
			kernel_vm_end = kernel_map.max_offset;
			break;
		}
	}
	lwkt_reltoken(&vm_token);
	crit_exit();
}

/*
 * Retire the given physical map from service.
 * Should only be called if the map contains
 * no valid mappings.
 */
void
pmap_destroy(pmap_t pmap)
{
	int count;

	if (pmap == NULL)
		return;

	lwkt_gettoken(&vm_token);
	count = --pmap->pm_count;
	if (count == 0) {
		pmap_release(pmap);
		panic("destroying a pmap is not yet implemented");
	}
	lwkt_reltoken(&vm_token);
}

/*
 * Add a reference to the specified pmap.
 */
void
pmap_reference(pmap_t pmap)
{
	if (pmap != NULL) {
		lwkt_gettoken(&vm_token);
		pmap->pm_count++;
		lwkt_reltoken(&vm_token);
	}
}

/***************************************************
 * page management routines.
 ***************************************************/

/*
 * free the pv_entry back to the free list.  This function may be
 * called from an interrupt.
 */
static __inline
void
free_pv_entry(pv_entry_t pv)
{
	pv_entry_count--;
	KKASSERT(pv_entry_count >= 0);
	zfree(pvzone, pv);
}

/*
 * get a new pv_entry, allocating a block from the system
 * when needed.  This function may be called from an interrupt.
 */
static
pv_entry_t
get_pv_entry(void)
{
	pv_entry_count++;
	if (pv_entry_high_water &&
	    (pv_entry_count > pv_entry_high_water) &&
	    (pmap_pagedaemon_waken == 0)) {
		pmap_pagedaemon_waken = 1;
		wakeup(&vm_pages_needed);
	}
	return zalloc(pvzone);
}

/*
 * This routine is very drastic, but can save the system
 * in a pinch.
 */
void
pmap_collect(void)
{
	int i;
	vm_page_t m;
	static int warningdone = 0;

	if (pmap_pagedaemon_waken == 0)
		return;
	lwkt_gettoken(&vm_token);
	if (warningdone < 5) {
		kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
		warningdone++;
	}

	for (i = 0; i < vm_page_array_size; i++) {
		m = &vm_page_array[i];
		if (m->wire_count || m->hold_count || m->busy ||
		    (m->flags & PG_BUSY))
			continue;
		pmap_remove_all(m);
	}
	pmap_pagedaemon_waken = 0;
	lwkt_reltoken(&vm_token);
}


/*
 * If it is the first entry on the list, it is actually
 * in the header and we must copy the following entry up
 * to the header.  Otherwise we must search the list for
 * the entry.  In either case we free the now unused entry.
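 *
 * The search runs over whichever list is expected to be shorter: the
 * page's pv list when it has fewer entries than the pmap has resident
 * pages, otherwise the pmap's own pv list.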
2003 */ 2004 static 2005 int 2006 pmap_remove_entry(struct pmap *pmap, vm_page_t m, 2007 vm_offset_t va, pmap_inval_info_t info) 2008 { 2009 pv_entry_t pv; 2010 int rtval; 2011 2012 crit_enter(); 2013 if (m->md.pv_list_count < pmap->pm_stats.resident_count) { 2014 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2015 if (pmap == pv->pv_pmap && va == pv->pv_va) 2016 break; 2017 } 2018 } else { 2019 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { 2020 if (va == pv->pv_va) 2021 break; 2022 } 2023 } 2024 2025 rtval = 0; 2026 KKASSERT(pv); 2027 2028 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2029 m->md.pv_list_count--; 2030 KKASSERT(m->md.pv_list_count >= 0); 2031 if (TAILQ_EMPTY(&m->md.pv_list)) 2032 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2033 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 2034 ++pmap->pm_generation; 2035 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info); 2036 free_pv_entry(pv); 2037 2038 crit_exit(); 2039 return rtval; 2040 } 2041 2042 /* 2043 * Create a pv entry for page at pa for 2044 * (pmap, va). 2045 */ 2046 static 2047 void 2048 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) 2049 { 2050 pv_entry_t pv; 2051 2052 crit_enter(); 2053 pv = get_pv_entry(); 2054 pv->pv_va = va; 2055 pv->pv_pmap = pmap; 2056 pv->pv_ptem = mpte; 2057 2058 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); 2059 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2060 ++pmap->pm_generation; 2061 m->md.pv_list_count++; 2062 2063 crit_exit(); 2064 } 2065 2066 /* 2067 * pmap_remove_pte: do the things to unmap a page in a process 2068 */ 2069 static 2070 int 2071 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, 2072 pmap_inval_info_t info) 2073 { 2074 pt_entry_t oldpte; 2075 vm_page_t m; 2076 2077 pmap_inval_interlock(info, pmap, va); 2078 oldpte = pte_load_clear(ptq); 2079 pmap_inval_deinterlock(info, pmap); 2080 if (oldpte & PG_W) 2081 pmap->pm_stats.wired_count -= 1; 2082 /* 2083 * Machines that don't support invlpg, also don't support 2084 * PG_G. XXX PG_G is disabled for SMP so don't worry about 2085 * the SMP case. 2086 */ 2087 if (oldpte & PG_G) 2088 cpu_invlpg((void *)va); 2089 KKASSERT(pmap->pm_stats.resident_count > 0); 2090 --pmap->pm_stats.resident_count; 2091 if (oldpte & PG_MANAGED) { 2092 m = PHYS_TO_VM_PAGE(oldpte); 2093 if (oldpte & PG_M) { 2094 #if defined(PMAP_DIAGNOSTIC) 2095 if (pmap_nw_modified((pt_entry_t) oldpte)) { 2096 kprintf( 2097 "pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2098 va, oldpte); 2099 } 2100 #endif 2101 if (pmap_track_modified(va)) 2102 vm_page_dirty(m); 2103 } 2104 if (oldpte & PG_A) 2105 vm_page_flag_set(m, PG_REFERENCED); 2106 return pmap_remove_entry(pmap, m, va, info); 2107 } else { 2108 return pmap_unuse_pt(pmap, va, NULL, info); 2109 } 2110 2111 return 0; 2112 } 2113 2114 /* 2115 * pmap_remove_page: 2116 * 2117 * Remove a single page from a process address space. 2118 * 2119 * This function may not be called from an interrupt if the pmap is 2120 * not kernel_pmap. 2121 */ 2122 static 2123 void 2124 pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info) 2125 { 2126 pt_entry_t *pte; 2127 2128 pte = pmap_pte(pmap, va); 2129 if (pte == NULL) 2130 return; 2131 if ((*pte & PG_V) == 0) 2132 return; 2133 pmap_remove_pte(pmap, pte, va, info); 2134 } 2135 2136 /* 2137 * pmap_remove: 2138 * 2139 * Remove the given range of addresses from the specified map. 2140 * 2141 * It is assumed that the start and end are properly 2142 * rounded to the page size. 
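 *
 * An illustrative (non-authoritative) call removing a single page-aligned
 * mapping might look like:
 *
 *	vm_offset_t sva = trunc_page(va);
 *	pmap_remove(pmap, sva, sva + PAGE_SIZE);
 *
 * which takes the single-page short-circuit path below.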
2143 * 2144 * This function may not be called from an interrupt if the pmap is 2145 * not kernel_pmap. 2146 */ 2147 void 2148 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 2149 { 2150 vm_offset_t va_next; 2151 pml4_entry_t *pml4e; 2152 pdp_entry_t *pdpe; 2153 pd_entry_t ptpaddr, *pde; 2154 pt_entry_t *pte; 2155 struct pmap_inval_info info; 2156 2157 if (pmap == NULL) 2158 return; 2159 2160 lwkt_gettoken(&vm_token); 2161 if (pmap->pm_stats.resident_count == 0) { 2162 lwkt_reltoken(&vm_token); 2163 return; 2164 } 2165 2166 pmap_inval_init(&info); 2167 2168 /* 2169 * special handling of removing one page. a very 2170 * common operation and easy to short circuit some 2171 * code. 2172 */ 2173 if (sva + PAGE_SIZE == eva) { 2174 pde = pmap_pde(pmap, sva); 2175 if (pde && (*pde & PG_PS) == 0) { 2176 pmap_remove_page(pmap, sva, &info); 2177 pmap_inval_done(&info); 2178 lwkt_reltoken(&vm_token); 2179 return; 2180 } 2181 } 2182 2183 for (; sva < eva; sva = va_next) { 2184 pml4e = pmap_pml4e(pmap, sva); 2185 if ((*pml4e & PG_V) == 0) { 2186 va_next = (sva + NBPML4) & ~PML4MASK; 2187 if (va_next < sva) 2188 va_next = eva; 2189 continue; 2190 } 2191 2192 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2193 if ((*pdpe & PG_V) == 0) { 2194 va_next = (sva + NBPDP) & ~PDPMASK; 2195 if (va_next < sva) 2196 va_next = eva; 2197 continue; 2198 } 2199 2200 /* 2201 * Calculate index for next page table. 2202 */ 2203 va_next = (sva + NBPDR) & ~PDRMASK; 2204 if (va_next < sva) 2205 va_next = eva; 2206 2207 pde = pmap_pdpe_to_pde(pdpe, sva); 2208 ptpaddr = *pde; 2209 2210 /* 2211 * Weed out invalid mappings. 2212 */ 2213 if (ptpaddr == 0) 2214 continue; 2215 2216 /* 2217 * Check for large page. 2218 */ 2219 if ((ptpaddr & PG_PS) != 0) { 2220 /* JG FreeBSD has more complex treatment here */ 2221 pmap_inval_interlock(&info, pmap, -1); 2222 *pde = 0; 2223 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2224 pmap_inval_deinterlock(&info, pmap); 2225 continue; 2226 } 2227 2228 /* 2229 * Limit our scan to either the end of the va represented 2230 * by the current page table page, or to the end of the 2231 * range being removed. 2232 */ 2233 if (va_next > eva) 2234 va_next = eva; 2235 2236 /* 2237 * NOTE: pmap_remove_pte() can block. 2238 */ 2239 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2240 sva += PAGE_SIZE) { 2241 if (*pte == 0) 2242 continue; 2243 if (pmap_remove_pte(pmap, pte, sva, &info)) 2244 break; 2245 } 2246 } 2247 pmap_inval_done(&info); 2248 lwkt_reltoken(&vm_token); 2249 } 2250 2251 /* 2252 * pmap_remove_all: 2253 * 2254 * Removes this physical page from all physical maps in which it resides. 2255 * Reflects back modify bits to the pager. 2256 * 2257 * This routine may not be called from an interrupt. 
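 *
 * Within this file it is reached from pmap_page_protect() when all access
 * is being revoked (VM_PROT_NONE) and from pmap_collect() when pv entries
 * must be reclaimed.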
2258 */ 2259 2260 static 2261 void 2262 pmap_remove_all(vm_page_t m) 2263 { 2264 struct pmap_inval_info info; 2265 pt_entry_t *pte, tpte; 2266 pv_entry_t pv; 2267 2268 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2269 return; 2270 2271 lwkt_gettoken(&vm_token); 2272 pmap_inval_init(&info); 2273 crit_enter(); 2274 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2275 KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0); 2276 --pv->pv_pmap->pm_stats.resident_count; 2277 2278 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 2279 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); 2280 tpte = pte_load_clear(pte); 2281 if (tpte & PG_W) 2282 pv->pv_pmap->pm_stats.wired_count--; 2283 pmap_inval_deinterlock(&info, pv->pv_pmap); 2284 if (tpte & PG_A) 2285 vm_page_flag_set(m, PG_REFERENCED); 2286 2287 /* 2288 * Update the vm_page_t clean and reference bits. 2289 */ 2290 if (tpte & PG_M) { 2291 #if defined(PMAP_DIAGNOSTIC) 2292 if (pmap_nw_modified(tpte)) { 2293 kprintf( 2294 "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2295 pv->pv_va, tpte); 2296 } 2297 #endif 2298 if (pmap_track_modified(pv->pv_va)) 2299 vm_page_dirty(m); 2300 } 2301 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2302 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); 2303 ++pv->pv_pmap->pm_generation; 2304 m->md.pv_list_count--; 2305 KKASSERT(m->md.pv_list_count >= 0); 2306 if (TAILQ_EMPTY(&m->md.pv_list)) 2307 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2308 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info); 2309 free_pv_entry(pv); 2310 } 2311 crit_exit(); 2312 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 2313 pmap_inval_done(&info); 2314 lwkt_reltoken(&vm_token); 2315 } 2316 2317 /* 2318 * pmap_protect: 2319 * 2320 * Set the physical protection on the specified range of this map 2321 * as requested. 2322 * 2323 * This function may not be called from an interrupt if the map is 2324 * not the kernel_pmap. 2325 */ 2326 void 2327 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2328 { 2329 vm_offset_t va_next; 2330 pml4_entry_t *pml4e; 2331 pdp_entry_t *pdpe; 2332 pd_entry_t ptpaddr, *pde; 2333 pt_entry_t *pte; 2334 pmap_inval_info info; 2335 2336 /* JG review for NX */ 2337 2338 if (pmap == NULL) 2339 return; 2340 2341 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2342 pmap_remove(pmap, sva, eva); 2343 return; 2344 } 2345 2346 if (prot & VM_PROT_WRITE) 2347 return; 2348 2349 lwkt_gettoken(&vm_token); 2350 pmap_inval_init(&info); 2351 2352 for (; sva < eva; sva = va_next) { 2353 2354 pml4e = pmap_pml4e(pmap, sva); 2355 if ((*pml4e & PG_V) == 0) { 2356 va_next = (sva + NBPML4) & ~PML4MASK; 2357 if (va_next < sva) 2358 va_next = eva; 2359 continue; 2360 } 2361 2362 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2363 if ((*pdpe & PG_V) == 0) { 2364 va_next = (sva + NBPDP) & ~PDPMASK; 2365 if (va_next < sva) 2366 va_next = eva; 2367 continue; 2368 } 2369 2370 va_next = (sva + NBPDR) & ~PDRMASK; 2371 if (va_next < sva) 2372 va_next = eva; 2373 2374 pde = pmap_pdpe_to_pde(pdpe, sva); 2375 ptpaddr = *pde; 2376 2377 /* 2378 * Check for large page. 2379 */ 2380 if ((ptpaddr & PG_PS) != 0) { 2381 pmap_inval_interlock(&info, pmap, -1); 2382 *pde &= ~(PG_M|PG_RW); 2383 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2384 pmap_inval_deinterlock(&info, pmap); 2385 continue; 2386 } 2387 2388 /* 2389 * Weed out invalid mappings. Note: we assume that the page 2390 * directory table is always allocated, and in kernel virtual. 
2391 */ 2392 if (ptpaddr == 0) 2393 continue; 2394 2395 if (va_next > eva) 2396 va_next = eva; 2397 2398 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2399 sva += PAGE_SIZE) { 2400 pt_entry_t pbits; 2401 pt_entry_t cbits; 2402 vm_page_t m; 2403 2404 /* 2405 * XXX non-optimal. Note also that there can be 2406 * no pmap_inval_flush() calls until after we modify 2407 * ptbase[sindex] (or otherwise we have to do another 2408 * pmap_inval_add() call). 2409 */ 2410 pmap_inval_interlock(&info, pmap, sva); 2411 again: 2412 pbits = *pte; 2413 cbits = pbits; 2414 if ((pbits & PG_V) == 0) { 2415 pmap_inval_deinterlock(&info, pmap); 2416 continue; 2417 } 2418 if (pbits & PG_MANAGED) { 2419 m = NULL; 2420 if (pbits & PG_A) { 2421 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2422 vm_page_flag_set(m, PG_REFERENCED); 2423 cbits &= ~PG_A; 2424 } 2425 if (pbits & PG_M) { 2426 if (pmap_track_modified(sva)) { 2427 if (m == NULL) 2428 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2429 vm_page_dirty(m); 2430 cbits &= ~PG_M; 2431 } 2432 } 2433 } 2434 cbits &= ~PG_RW; 2435 if (pbits != cbits && 2436 !atomic_cmpset_long(pte, pbits, cbits)) { 2437 goto again; 2438 } 2439 pmap_inval_deinterlock(&info, pmap); 2440 } 2441 } 2442 pmap_inval_done(&info); 2443 lwkt_reltoken(&vm_token); 2444 } 2445 2446 /* 2447 * Insert the given physical page (p) at 2448 * the specified virtual address (v) in the 2449 * target physical map with the protection requested. 2450 * 2451 * If specified, the page will be wired down, meaning 2452 * that the related pte can not be reclaimed. 2453 * 2454 * NB: This is the only routine which MAY NOT lazy-evaluate 2455 * or lose information. That is, this routine must actually 2456 * insert this page into the given map NOW. 2457 */ 2458 void 2459 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2460 boolean_t wired) 2461 { 2462 vm_paddr_t pa; 2463 pd_entry_t *pde; 2464 pt_entry_t *pte; 2465 vm_paddr_t opa; 2466 pt_entry_t origpte, newpte; 2467 vm_page_t mpte; 2468 pmap_inval_info info; 2469 2470 if (pmap == NULL) 2471 return; 2472 2473 va = trunc_page(va); 2474 #ifdef PMAP_DIAGNOSTIC 2475 if (va >= KvaEnd) 2476 panic("pmap_enter: toobig"); 2477 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 2478 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); 2479 #endif 2480 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 2481 kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n"); 2482 #ifdef DDB 2483 db_print_backtrace(); 2484 #endif 2485 } 2486 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 2487 kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n"); 2488 #ifdef DDB 2489 db_print_backtrace(); 2490 #endif 2491 } 2492 2493 lwkt_gettoken(&vm_token); 2494 2495 /* 2496 * In the case that a page table page is not 2497 * resident, we are creating it here. 2498 */ 2499 if (va < VM_MAX_USER_ADDRESS) 2500 mpte = pmap_allocpte(pmap, va); 2501 else 2502 mpte = NULL; 2503 2504 pmap_inval_init(&info); 2505 pde = pmap_pde(pmap, va); 2506 if (pde != NULL && (*pde & PG_V) != 0) { 2507 if ((*pde & PG_PS) != 0) 2508 panic("pmap_enter: attempted pmap_enter on 2MB page"); 2509 pte = pmap_pde_to_pte(pde, va); 2510 } else 2511 panic("pmap_enter: invalid page directory va=%#lx", va); 2512 2513 KKASSERT(pte != NULL); 2514 pa = VM_PAGE_TO_PHYS(m); 2515 origpte = *pte; 2516 opa = origpte & PG_FRAME; 2517 2518 /* 2519 * Mapping has not changed, must be protection or wiring change. 
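	 * (That is, opa == pa: only the wiring and/or protection bits are
	 * being updated, so the existing pte slot and pv entry are reused.)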
2520 */ 2521 if (origpte && (opa == pa)) { 2522 /* 2523 * Wiring change, just update stats. We don't worry about 2524 * wiring PT pages as they remain resident as long as there 2525 * are valid mappings in them. Hence, if a user page is wired, 2526 * the PT page will be also. 2527 */ 2528 if (wired && ((origpte & PG_W) == 0)) 2529 pmap->pm_stats.wired_count++; 2530 else if (!wired && (origpte & PG_W)) 2531 pmap->pm_stats.wired_count--; 2532 2533 #if defined(PMAP_DIAGNOSTIC) 2534 if (pmap_nw_modified(origpte)) { 2535 kprintf( 2536 "pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2537 va, origpte); 2538 } 2539 #endif 2540 2541 /* 2542 * Remove the extra pte reference. Note that we cannot 2543 * optimize the RO->RW case because we have adjusted the 2544 * wiring count above and may need to adjust the wiring 2545 * bits below. 2546 */ 2547 if (mpte) 2548 mpte->hold_count--; 2549 2550 /* 2551 * We might be turning off write access to the page, 2552 * so we go ahead and sense modify status. 2553 */ 2554 if (origpte & PG_MANAGED) { 2555 if ((origpte & PG_M) && pmap_track_modified(va)) { 2556 vm_page_t om; 2557 om = PHYS_TO_VM_PAGE(opa); 2558 vm_page_dirty(om); 2559 } 2560 pa |= PG_MANAGED; 2561 KKASSERT(m->flags & PG_MAPPED); 2562 } 2563 goto validate; 2564 } 2565 /* 2566 * Mapping has changed, invalidate old range and fall through to 2567 * handle validating new mapping. 2568 */ 2569 while (opa) { 2570 int err; 2571 err = pmap_remove_pte(pmap, pte, va, &info); 2572 if (err) 2573 panic("pmap_enter: pte vanished, va: 0x%lx", va); 2574 origpte = *pte; 2575 opa = origpte & PG_FRAME; 2576 if (opa) { 2577 kprintf("pmap_enter: Warning, raced pmap %p va %p\n", 2578 pmap, (void *)va); 2579 } 2580 } 2581 2582 /* 2583 * Enter on the PV list if part of our managed memory. Note that we 2584 * raise IPL while manipulating pv_table since pmap_enter can be 2585 * called at interrupt time. 2586 */ 2587 if (pmap_initialized && 2588 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 2589 pmap_insert_entry(pmap, va, mpte, m); 2590 pa |= PG_MANAGED; 2591 vm_page_flag_set(m, PG_MAPPED); 2592 } 2593 2594 /* 2595 * Increment counters 2596 */ 2597 ++pmap->pm_stats.resident_count; 2598 if (wired) 2599 pmap->pm_stats.wired_count++; 2600 2601 validate: 2602 /* 2603 * Now validate mapping with desired protection/wiring. 2604 */ 2605 newpte = (pt_entry_t) (pa | pte_prot(pmap, prot) | PG_V); 2606 2607 if (wired) 2608 newpte |= PG_W; 2609 if (va < VM_MAX_USER_ADDRESS) 2610 newpte |= PG_U; 2611 if (pmap == &kernel_pmap) 2612 newpte |= pgeflag; 2613 2614 /* 2615 * if the mapping or permission bits are different, we need 2616 * to update the pte. 2617 */ 2618 if ((origpte & ~(PG_M|PG_A)) != newpte) { 2619 pmap_inval_interlock(&info, pmap, va); 2620 *pte = newpte | PG_A; 2621 pmap_inval_deinterlock(&info, pmap); 2622 if (newpte & PG_RW) 2623 vm_page_flag_set(m, PG_WRITEABLE); 2624 } 2625 KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED)); 2626 pmap_inval_done(&info); 2627 lwkt_reltoken(&vm_token); 2628 } 2629 2630 /* 2631 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 2632 * This code also assumes that the pmap has no pre-existing entry for this 2633 * VA. 2634 * 2635 * This code currently may only be used on user pmaps, not kernel_pmap. 
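 *
 * For example, pmap_object_init_pt_callback() below enters pre-resident
 * object pages this way:
 *
 *	pmap_enter_quick(pmap, va, m);
 *
 * with va the target user address and m the resident vm_page_t.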
2636 */ 2637 void 2638 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 2639 { 2640 pt_entry_t *pte; 2641 vm_paddr_t pa; 2642 vm_page_t mpte; 2643 vm_pindex_t ptepindex; 2644 pd_entry_t *ptepa; 2645 pmap_inval_info info; 2646 2647 lwkt_gettoken(&vm_token); 2648 pmap_inval_init(&info); 2649 2650 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 2651 kprintf("Warning: pmap_enter_quick called on UVA with kernel_pmap\n"); 2652 #ifdef DDB 2653 db_print_backtrace(); 2654 #endif 2655 } 2656 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 2657 kprintf("Warning: pmap_enter_quick called on KVA without kernel_pmap\n"); 2658 #ifdef DDB 2659 db_print_backtrace(); 2660 #endif 2661 } 2662 2663 KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */ 2664 2665 /* 2666 * Calculate the page table page (mpte), allocating it if necessary. 2667 * 2668 * A held page table page (mpte), or NULL, is passed onto the 2669 * section following. 2670 */ 2671 if (va < VM_MAX_USER_ADDRESS) { 2672 /* 2673 * Calculate pagetable page index 2674 */ 2675 ptepindex = pmap_pde_pindex(va); 2676 2677 do { 2678 /* 2679 * Get the page directory entry 2680 */ 2681 ptepa = pmap_pde(pmap, va); 2682 2683 /* 2684 * If the page table page is mapped, we just increment 2685 * the hold count, and activate it. 2686 */ 2687 if (ptepa && (*ptepa & PG_V) != 0) { 2688 if (*ptepa & PG_PS) 2689 panic("pmap_enter_quick: unexpected mapping into 2MB page"); 2690 // if (pmap->pm_ptphint && 2691 // (pmap->pm_ptphint->pindex == ptepindex)) { 2692 // mpte = pmap->pm_ptphint; 2693 // } else { 2694 mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); 2695 pmap->pm_ptphint = mpte; 2696 // } 2697 if (mpte) 2698 mpte->hold_count++; 2699 } else { 2700 mpte = _pmap_allocpte(pmap, ptepindex); 2701 } 2702 } while (mpte == NULL); 2703 } else { 2704 mpte = NULL; 2705 /* this code path is not yet used */ 2706 } 2707 2708 /* 2709 * With a valid (and held) page directory page, we can just use 2710 * vtopte() to get to the pte. If the pte is already present 2711 * we do not disturb it. 2712 */ 2713 pte = vtopte(va); 2714 if (*pte & PG_V) { 2715 if (mpte) 2716 pmap_unwire_pte_hold(pmap, va, mpte, &info); 2717 pa = VM_PAGE_TO_PHYS(m); 2718 KKASSERT(((*pte ^ pa) & PG_FRAME) == 0); 2719 pmap_inval_done(&info); 2720 lwkt_reltoken(&vm_token); 2721 return; 2722 } 2723 2724 /* 2725 * Enter on the PV list if part of our managed memory 2726 */ 2727 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 2728 pmap_insert_entry(pmap, va, mpte, m); 2729 vm_page_flag_set(m, PG_MAPPED); 2730 } 2731 2732 /* 2733 * Increment counters 2734 */ 2735 ++pmap->pm_stats.resident_count; 2736 2737 pa = VM_PAGE_TO_PHYS(m); 2738 2739 /* 2740 * Now validate mapping with RO protection 2741 */ 2742 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 2743 *pte = pa | PG_V | PG_U; 2744 else 2745 *pte = pa | PG_V | PG_U | PG_MANAGED; 2746 /* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */ 2747 pmap_inval_done(&info); 2748 lwkt_reltoken(&vm_token); 2749 } 2750 2751 /* 2752 * Make a temporary mapping for a physical address. This is only intended 2753 * to be used for panic dumps. 2754 */ 2755 /* JG Needed on x86_64? */ 2756 void * 2757 pmap_kenter_temporary(vm_paddr_t pa, int i) 2758 { 2759 pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 2760 return ((void *)crashdumpmap); 2761 } 2762 2763 #define MAX_INIT_PT (96) 2764 2765 /* 2766 * This routine preloads the ptes for a given object into the specified pmap. 
2767 * This eliminates the blast of soft faults on process startup and 2768 * immediately after an mmap. 2769 */ 2770 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 2771 2772 void 2773 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 2774 vm_object_t object, vm_pindex_t pindex, 2775 vm_size_t size, int limit) 2776 { 2777 struct rb_vm_page_scan_info info; 2778 struct lwp *lp; 2779 vm_size_t psize; 2780 2781 /* 2782 * We can't preinit if read access isn't set or there is no pmap 2783 * or object. 2784 */ 2785 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 2786 return; 2787 2788 /* 2789 * We can't preinit if the pmap is not the current pmap 2790 */ 2791 lp = curthread->td_lwp; 2792 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 2793 return; 2794 2795 psize = x86_64_btop(size); 2796 2797 if ((object->type != OBJT_VNODE) || 2798 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 2799 (object->resident_page_count > MAX_INIT_PT))) { 2800 return; 2801 } 2802 2803 if (psize + pindex > object->size) { 2804 if (object->size < pindex) 2805 return; 2806 psize = object->size - pindex; 2807 } 2808 2809 if (psize == 0) 2810 return; 2811 2812 /* 2813 * Use a red-black scan to traverse the requested range and load 2814 * any valid pages found into the pmap. 2815 * 2816 * We cannot safely scan the object's memq unless we are in a 2817 * critical section since interrupts can remove pages from objects. 2818 */ 2819 info.start_pindex = pindex; 2820 info.end_pindex = pindex + psize - 1; 2821 info.limit = limit; 2822 info.mpte = NULL; 2823 info.addr = addr; 2824 info.pmap = pmap; 2825 2826 crit_enter(); 2827 lwkt_gettoken(&vm_token); 2828 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 2829 pmap_object_init_pt_callback, &info); 2830 lwkt_reltoken(&vm_token); 2831 crit_exit(); 2832 } 2833 2834 static 2835 int 2836 pmap_object_init_pt_callback(vm_page_t p, void *data) 2837 { 2838 struct rb_vm_page_scan_info *info = data; 2839 vm_pindex_t rel_index; 2840 /* 2841 * don't allow an madvise to blow away our really 2842 * free pages allocating pv entries. 2843 */ 2844 if ((info->limit & MAP_PREFAULT_MADVISE) && 2845 vmstats.v_free_count < vmstats.v_free_reserved) { 2846 return(-1); 2847 } 2848 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 2849 (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { 2850 if ((p->queue - p->pc) == PQ_CACHE) 2851 vm_page_deactivate(p); 2852 vm_page_busy(p); 2853 rel_index = p->pindex - info->start_pindex; 2854 pmap_enter_quick(info->pmap, 2855 info->addr + x86_64_ptob(rel_index), p); 2856 vm_page_wakeup(p); 2857 } 2858 return(0); 2859 } 2860 2861 /* 2862 * Return TRUE if the pmap is in shape to trivially 2863 * pre-fault the specified address. 2864 * 2865 * Returns FALSE if it would be non-trivial or if a 2866 * pte is already loaded into the slot. 2867 */ 2868 int 2869 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 2870 { 2871 pt_entry_t *pte; 2872 pd_entry_t *pde; 2873 int ret; 2874 2875 lwkt_gettoken(&vm_token); 2876 pde = pmap_pde(pmap, addr); 2877 if (pde == NULL || *pde == 0) { 2878 ret = 0; 2879 } else { 2880 pte = vtopte(addr); 2881 ret = (*pte) ? 0 : 1; 2882 } 2883 lwkt_reltoken(&vm_token); 2884 return(ret); 2885 } 2886 2887 /* 2888 * Routine: pmap_change_wiring 2889 * Function: Change the wiring attribute for a map/virtual-address 2890 * pair. 2891 * In/out conditions: 2892 * The mapping must already exist in the pmap. 
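 *
 * Illustrative only: after faulting in and wiring a page at va, a caller
 * would mark the mapping wired with
 *
 *	pmap_change_wiring(pmap, va, TRUE);
 *
 * and later clear it with the same call and FALSE.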
2893 */ 2894 void 2895 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 2896 { 2897 pt_entry_t *pte; 2898 2899 if (pmap == NULL) 2900 return; 2901 2902 lwkt_gettoken(&vm_token); 2903 pte = pmap_pte(pmap, va); 2904 2905 if (wired && !pmap_pte_w(pte)) 2906 pmap->pm_stats.wired_count++; 2907 else if (!wired && pmap_pte_w(pte)) 2908 pmap->pm_stats.wired_count--; 2909 2910 /* 2911 * Wiring is not a hardware characteristic so there is no need to 2912 * invalidate TLB. However, in an SMP environment we must use 2913 * a locked bus cycle to update the pte (if we are not using 2914 * the pmap_inval_*() API that is)... it's ok to do this for simple 2915 * wiring changes. 2916 */ 2917 #ifdef SMP 2918 if (wired) 2919 atomic_set_long(pte, PG_W); 2920 else 2921 atomic_clear_long(pte, PG_W); 2922 #else 2923 if (wired) 2924 atomic_set_long_nonlocked(pte, PG_W); 2925 else 2926 atomic_clear_long_nonlocked(pte, PG_W); 2927 #endif 2928 lwkt_reltoken(&vm_token); 2929 } 2930 2931 2932 2933 /* 2934 * Copy the range specified by src_addr/len 2935 * from the source map to the range dst_addr/len 2936 * in the destination map. 2937 * 2938 * This routine is only advisory and need not do anything. 2939 */ 2940 void 2941 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 2942 vm_size_t len, vm_offset_t src_addr) 2943 { 2944 return; 2945 #if 0 2946 pmap_inval_info info; 2947 vm_offset_t addr; 2948 vm_offset_t end_addr = src_addr + len; 2949 vm_offset_t pdnxt; 2950 pd_entry_t src_frame, dst_frame; 2951 vm_page_t m; 2952 2953 if (dst_addr != src_addr) 2954 return; 2955 #if JGPMAP32 2956 src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME; 2957 if (src_frame != (PTDpde & PG_FRAME)) { 2958 return; 2959 } 2960 2961 dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME; 2962 if (dst_frame != (APTDpde & PG_FRAME)) { 2963 APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); 2964 /* The page directory is not shared between CPUs */ 2965 cpu_invltlb(); 2966 } 2967 #endif 2968 pmap_inval_init(&info); 2969 pmap_inval_add(&info, dst_pmap, -1); 2970 pmap_inval_add(&info, src_pmap, -1); 2971 2972 /* 2973 * critical section protection is required to maintain the page/object 2974 * association, interrupts can free pages and remove them from 2975 * their objects. 2976 */ 2977 crit_enter(); 2978 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 2979 pt_entry_t *src_pte, *dst_pte; 2980 vm_page_t dstmpte, srcmpte; 2981 vm_offset_t srcptepaddr; 2982 vm_pindex_t ptepindex; 2983 2984 if (addr >= UPT_MIN_ADDRESS) 2985 panic("pmap_copy: invalid to pmap_copy page tables\n"); 2986 2987 /* 2988 * Don't let optional prefaulting of pages make us go 2989 * way below the low water mark of free pages or way 2990 * above high water mark of used pv entries. 
2991 */ 2992 if (vmstats.v_free_count < vmstats.v_free_reserved || 2993 pv_entry_count > pv_entry_high_water) 2994 break; 2995 2996 pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); 2997 ptepindex = addr >> PDRSHIFT; 2998 2999 #if JGPMAP32 3000 srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; 3001 #endif 3002 if (srcptepaddr == 0) 3003 continue; 3004 3005 if (srcptepaddr & PG_PS) { 3006 #if JGPMAP32 3007 if (dst_pmap->pm_pdir[ptepindex] == 0) { 3008 dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; 3009 dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3010 } 3011 #endif 3012 continue; 3013 } 3014 3015 srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); 3016 if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || 3017 (srcmpte->flags & PG_BUSY)) { 3018 continue; 3019 } 3020 3021 if (pdnxt > end_addr) 3022 pdnxt = end_addr; 3023 3024 src_pte = vtopte(addr); 3025 #if JGPMAP32 3026 dst_pte = avtopte(addr); 3027 #endif 3028 while (addr < pdnxt) { 3029 pt_entry_t ptetemp; 3030 3031 ptetemp = *src_pte; 3032 /* 3033 * we only virtual copy managed pages 3034 */ 3035 if ((ptetemp & PG_MANAGED) != 0) { 3036 /* 3037 * We have to check after allocpte for the 3038 * pte still being around... allocpte can 3039 * block. 3040 * 3041 * pmap_allocpte() can block. If we lose 3042 * our page directory mappings we stop. 3043 */ 3044 dstmpte = pmap_allocpte(dst_pmap, addr); 3045 3046 #if JGPMAP32 3047 if (src_frame != (PTDpde & PG_FRAME) || 3048 dst_frame != (APTDpde & PG_FRAME) 3049 ) { 3050 kprintf("WARNING: pmap_copy: detected and corrected race\n"); 3051 pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); 3052 goto failed; 3053 } else if ((*dst_pte == 0) && 3054 (ptetemp = *src_pte) != 0 && 3055 (ptetemp & PG_MANAGED)) { 3056 /* 3057 * Clear the modified and 3058 * accessed (referenced) bits 3059 * during the copy. 3060 */ 3061 m = PHYS_TO_VM_PAGE(ptetemp); 3062 *dst_pte = ptetemp & ~(PG_M | PG_A); 3063 ++dst_pmap->pm_stats.resident_count; 3064 pmap_insert_entry(dst_pmap, addr, 3065 dstmpte, m); 3066 KKASSERT(m->flags & PG_MAPPED); 3067 } else { 3068 kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n"); 3069 pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); 3070 goto failed; 3071 } 3072 #endif 3073 if (dstmpte->hold_count >= srcmpte->hold_count) 3074 break; 3075 } 3076 addr += PAGE_SIZE; 3077 src_pte++; 3078 dst_pte++; 3079 } 3080 } 3081 failed: 3082 crit_exit(); 3083 pmap_inval_done(&info); 3084 #endif 3085 } 3086 3087 /* 3088 * pmap_zero_page: 3089 * 3090 * Zero the specified physical page. 3091 * 3092 * This function may be called from an interrupt and no locking is 3093 * required. 3094 */ 3095 void 3096 pmap_zero_page(vm_paddr_t phys) 3097 { 3098 vm_offset_t va = PHYS_TO_DMAP(phys); 3099 3100 pagezero((void *)va); 3101 } 3102 3103 /* 3104 * pmap_page_assertzero: 3105 * 3106 * Assert that a page is empty, panic if it isn't. 3107 */ 3108 void 3109 pmap_page_assertzero(vm_paddr_t phys) 3110 { 3111 vm_offset_t virt = PHYS_TO_DMAP(phys); 3112 int i; 3113 3114 for (i = 0; i < PAGE_SIZE; i += sizeof(long)) { 3115 if (*(long *)((char *)virt + i) != 0) { 3116 panic("pmap_page_assertzero() @ %p not zero!\n", (void *)virt); 3117 } 3118 } 3119 } 3120 3121 /* 3122 * pmap_zero_page: 3123 * 3124 * Zero part of a physical page by mapping it into memory and clearing 3125 * its contents with bzero. 3126 * 3127 * off and size may not cover an area beyond a single hardware page. 
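 *
 * For example, pmap_zero_page_area(pa, 0, 512) clears only the first
 * 512 bytes of the page through its DMAP alias.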
3128 */ 3129 void 3130 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 3131 { 3132 vm_offset_t virt = PHYS_TO_DMAP(phys); 3133 3134 bzero((char *)virt + off, size); 3135 } 3136 3137 /* 3138 * pmap_copy_page: 3139 * 3140 * Copy the physical page from the source PA to the target PA. 3141 * This function may be called from an interrupt. No locking 3142 * is required. 3143 */ 3144 void 3145 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 3146 { 3147 vm_offset_t src_virt, dst_virt; 3148 3149 src_virt = PHYS_TO_DMAP(src); 3150 dst_virt = PHYS_TO_DMAP(dst); 3151 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 3152 } 3153 3154 /* 3155 * pmap_copy_page_frag: 3156 * 3157 * Copy the physical page from the source PA to the target PA. 3158 * This function may be called from an interrupt. No locking 3159 * is required. 3160 */ 3161 void 3162 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 3163 { 3164 vm_offset_t src_virt, dst_virt; 3165 3166 src_virt = PHYS_TO_DMAP(src); 3167 dst_virt = PHYS_TO_DMAP(dst); 3168 3169 bcopy((char *)src_virt + (src & PAGE_MASK), 3170 (char *)dst_virt + (dst & PAGE_MASK), 3171 bytes); 3172 } 3173 3174 /* 3175 * Returns true if the pmap's pv is one of the first 3176 * 16 pvs linked to from this page. This count may 3177 * be changed upwards or downwards in the future; it 3178 * is only necessary that true be returned for a small 3179 * subset of pmaps for proper page aging. 3180 */ 3181 boolean_t 3182 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3183 { 3184 pv_entry_t pv; 3185 int loops = 0; 3186 3187 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3188 return FALSE; 3189 3190 crit_enter(); 3191 lwkt_gettoken(&vm_token); 3192 3193 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3194 if (pv->pv_pmap == pmap) { 3195 lwkt_reltoken(&vm_token); 3196 crit_exit(); 3197 return TRUE; 3198 } 3199 loops++; 3200 if (loops >= 16) 3201 break; 3202 } 3203 lwkt_reltoken(&vm_token); 3204 crit_exit(); 3205 return (FALSE); 3206 } 3207 3208 /* 3209 * Remove all pages from specified address space 3210 * this aids process exit speeds. Also, this code 3211 * is special cased for current process only, but 3212 * can have the more generic (and slightly slower) 3213 * mode enabled. This is much faster than pmap_remove 3214 * in the case of running down an entire address space. 
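 *
 * A typical (illustrative) invocation tears down an entire user address
 * space at process exit:
 *
 *	pmap_remove_pages(vmspace_pmap(vm), sva, eva);
 *
 * where sva/eva span the whole user range.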
3215 */ 3216 void 3217 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3218 { 3219 struct lwp *lp; 3220 pt_entry_t *pte, tpte; 3221 pv_entry_t pv, npv; 3222 vm_page_t m; 3223 pmap_inval_info info; 3224 int iscurrentpmap; 3225 int save_generation; 3226 3227 lp = curthread->td_lwp; 3228 if (lp && pmap == vmspace_pmap(lp->lwp_vmspace)) 3229 iscurrentpmap = 1; 3230 else 3231 iscurrentpmap = 0; 3232 3233 lwkt_gettoken(&vm_token); 3234 pmap_inval_init(&info); 3235 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 3236 if (pv->pv_va >= eva || pv->pv_va < sva) { 3237 npv = TAILQ_NEXT(pv, pv_plist); 3238 continue; 3239 } 3240 3241 KKASSERT(pmap == pv->pv_pmap); 3242 3243 if (iscurrentpmap) 3244 pte = vtopte(pv->pv_va); 3245 else 3246 pte = pmap_pte_quick(pmap, pv->pv_va); 3247 pmap_inval_interlock(&info, pmap, pv->pv_va); 3248 3249 /* 3250 * We cannot remove wired pages from a process' mapping 3251 * at this time 3252 */ 3253 if (*pte & PG_W) { 3254 pmap_inval_deinterlock(&info, pmap); 3255 npv = TAILQ_NEXT(pv, pv_plist); 3256 continue; 3257 } 3258 tpte = pte_load_clear(pte); 3259 3260 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3261 3262 KASSERT(m < &vm_page_array[vm_page_array_size], 3263 ("pmap_remove_pages: bad tpte %lx", tpte)); 3264 3265 KKASSERT(pmap->pm_stats.resident_count > 0); 3266 --pmap->pm_stats.resident_count; 3267 pmap_inval_deinterlock(&info, pmap); 3268 3269 /* 3270 * Update the vm_page_t clean and reference bits. 3271 */ 3272 if (tpte & PG_M) { 3273 vm_page_dirty(m); 3274 } 3275 3276 npv = TAILQ_NEXT(pv, pv_plist); 3277 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 3278 save_generation = ++pmap->pm_generation; 3279 3280 m->md.pv_list_count--; 3281 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3282 if (TAILQ_EMPTY(&m->md.pv_list)) 3283 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3284 3285 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info); 3286 free_pv_entry(pv); 3287 3288 /* 3289 * Restart the scan if we blocked during the unuse or free 3290 * calls and other removals were made. 3291 */ 3292 if (save_generation != pmap->pm_generation) { 3293 kprintf("Warning: pmap_remove_pages race-A avoided\n"); 3294 npv = TAILQ_FIRST(&pmap->pm_pvlist); 3295 } 3296 } 3297 pmap_inval_done(&info); 3298 lwkt_reltoken(&vm_token); 3299 } 3300 3301 /* 3302 * pmap_testbit tests bits in pte's 3303 * note that the testbit/clearbit routines are inline, 3304 * and a lot of things compile-time evaluate. 3305 */ 3306 static 3307 boolean_t 3308 pmap_testbit(vm_page_t m, int bit) 3309 { 3310 pv_entry_t pv; 3311 pt_entry_t *pte; 3312 3313 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3314 return FALSE; 3315 3316 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 3317 return FALSE; 3318 3319 crit_enter(); 3320 3321 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3322 /* 3323 * if the bit being tested is the modified bit, then 3324 * mark clean_map and ptes as never 3325 * modified. 
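		 * (In practice this means PTEs whose va fails
		 * pmap_track_modified(), e.g. addresses inside the clean
		 * map, are simply skipped rather than tested.)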
3326 */ 3327 if (bit & (PG_A|PG_M)) { 3328 if (!pmap_track_modified(pv->pv_va)) 3329 continue; 3330 } 3331 3332 #if defined(PMAP_DIAGNOSTIC) 3333 if (pv->pv_pmap == NULL) { 3334 kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); 3335 continue; 3336 } 3337 #endif 3338 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3339 if (*pte & bit) { 3340 crit_exit(); 3341 return TRUE; 3342 } 3343 } 3344 crit_exit(); 3345 return (FALSE); 3346 } 3347 3348 /* 3349 * this routine is used to modify bits in ptes 3350 */ 3351 static __inline 3352 void 3353 pmap_clearbit(vm_page_t m, int bit) 3354 { 3355 struct pmap_inval_info info; 3356 pv_entry_t pv; 3357 pt_entry_t *pte; 3358 pt_entry_t pbits; 3359 3360 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3361 return; 3362 3363 pmap_inval_init(&info); 3364 3365 /* 3366 * Loop over all current mappings setting/clearing as appropos If 3367 * setting RO do we need to clear the VAC? 3368 */ 3369 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3370 /* 3371 * don't write protect pager mappings 3372 */ 3373 if (bit == PG_RW) { 3374 if (!pmap_track_modified(pv->pv_va)) 3375 continue; 3376 } 3377 3378 #if defined(PMAP_DIAGNOSTIC) 3379 if (pv->pv_pmap == NULL) { 3380 kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); 3381 continue; 3382 } 3383 #endif 3384 3385 /* 3386 * Careful here. We can use a locked bus instruction to 3387 * clear PG_A or PG_M safely but we need to synchronize 3388 * with the target cpus when we mess with PG_RW. 3389 * 3390 * We do not have to force synchronization when clearing 3391 * PG_M even for PTEs generated via virtual memory maps, 3392 * because the virtual kernel will invalidate the pmap 3393 * entry when/if it needs to resynchronize the Modify bit. 3394 */ 3395 if (bit & PG_RW) 3396 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); 3397 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3398 again: 3399 pbits = *pte; 3400 if (pbits & bit) { 3401 if (bit == PG_RW) { 3402 if (pbits & PG_M) { 3403 vm_page_dirty(m); 3404 atomic_clear_long(pte, PG_M|PG_RW); 3405 } else { 3406 /* 3407 * The cpu may be trying to set PG_M 3408 * simultaniously with our clearing 3409 * of PG_RW. 3410 */ 3411 if (!atomic_cmpset_long(pte, pbits, 3412 pbits & ~PG_RW)) 3413 goto again; 3414 } 3415 } else if (bit == PG_M) { 3416 /* 3417 * We could also clear PG_RW here to force 3418 * a fault on write to redetect PG_M for 3419 * virtual kernels, but it isn't necessary 3420 * since virtual kernels invalidate the pte 3421 * when they clear the VPTE_M bit in their 3422 * virtual page tables. 3423 */ 3424 atomic_clear_long(pte, PG_M); 3425 } else { 3426 atomic_clear_long(pte, bit); 3427 } 3428 } 3429 if (bit & PG_RW) 3430 pmap_inval_deinterlock(&info, pv->pv_pmap); 3431 } 3432 pmap_inval_done(&info); 3433 } 3434 3435 /* 3436 * pmap_page_protect: 3437 * 3438 * Lower the permission for all mappings to a given page. 3439 */ 3440 void 3441 pmap_page_protect(vm_page_t m, vm_prot_t prot) 3442 { 3443 /* JG NX support? */ 3444 if ((prot & VM_PROT_WRITE) == 0) { 3445 lwkt_gettoken(&vm_token); 3446 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 3447 pmap_clearbit(m, PG_RW); 3448 vm_page_flag_clear(m, PG_WRITEABLE); 3449 } else { 3450 pmap_remove_all(m); 3451 } 3452 lwkt_reltoken(&vm_token); 3453 } 3454 } 3455 3456 vm_paddr_t 3457 pmap_phys_address(vm_pindex_t ppn) 3458 { 3459 return (x86_64_ptob(ppn)); 3460 } 3461 3462 /* 3463 * pmap_ts_referenced: 3464 * 3465 * Return a count of reference bits for a page, clearing those bits. 
3466 * It is not necessary for every reference bit to be cleared, but it 3467 * is necessary that 0 only be returned when there are truly no 3468 * reference bits set. 3469 * 3470 * XXX: The exact number of bits to check and clear is a matter that 3471 * should be tested and standardized at some point in the future for 3472 * optimal aging of shared pages. 3473 */ 3474 int 3475 pmap_ts_referenced(vm_page_t m) 3476 { 3477 pv_entry_t pv, pvf, pvn; 3478 pt_entry_t *pte; 3479 int rtval = 0; 3480 3481 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3482 return (rtval); 3483 3484 crit_enter(); 3485 lwkt_gettoken(&vm_token); 3486 3487 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3488 3489 pvf = pv; 3490 3491 do { 3492 pvn = TAILQ_NEXT(pv, pv_list); 3493 3494 crit_enter(); 3495 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3496 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3497 crit_exit(); 3498 3499 if (!pmap_track_modified(pv->pv_va)) 3500 continue; 3501 3502 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3503 3504 if (pte && (*pte & PG_A)) { 3505 #ifdef SMP 3506 atomic_clear_long(pte, PG_A); 3507 #else 3508 atomic_clear_long_nonlocked(pte, PG_A); 3509 #endif 3510 rtval++; 3511 if (rtval > 4) { 3512 break; 3513 } 3514 } 3515 } while ((pv = pvn) != NULL && pv != pvf); 3516 } 3517 lwkt_reltoken(&vm_token); 3518 crit_exit(); 3519 3520 return (rtval); 3521 } 3522 3523 /* 3524 * pmap_is_modified: 3525 * 3526 * Return whether or not the specified physical page was modified 3527 * in any physical maps. 3528 */ 3529 boolean_t 3530 pmap_is_modified(vm_page_t m) 3531 { 3532 boolean_t res; 3533 3534 lwkt_gettoken(&vm_token); 3535 res = pmap_testbit(m, PG_M); 3536 lwkt_reltoken(&vm_token); 3537 return (res); 3538 } 3539 3540 /* 3541 * Clear the modify bits on the specified physical page. 3542 */ 3543 void 3544 pmap_clear_modify(vm_page_t m) 3545 { 3546 lwkt_gettoken(&vm_token); 3547 pmap_clearbit(m, PG_M); 3548 lwkt_reltoken(&vm_token); 3549 } 3550 3551 /* 3552 * pmap_clear_reference: 3553 * 3554 * Clear the reference bit on the specified physical page. 3555 */ 3556 void 3557 pmap_clear_reference(vm_page_t m) 3558 { 3559 lwkt_gettoken(&vm_token); 3560 pmap_clearbit(m, PG_A); 3561 lwkt_reltoken(&vm_token); 3562 } 3563 3564 /* 3565 * Miscellaneous support routines follow 3566 */ 3567 3568 static 3569 void 3570 i386_protection_init(void) 3571 { 3572 int *kp, prot; 3573 3574 /* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */ 3575 kp = protection_codes; 3576 for (prot = 0; prot < 8; prot++) { 3577 switch (prot) { 3578 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 3579 /* 3580 * Read access is also 0. There isn't any execute bit, 3581 * so just make it readable. 3582 */ 3583 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 3584 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 3585 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 3586 *kp++ = 0; 3587 break; 3588 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 3589 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 3590 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 3591 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 3592 *kp++ = PG_RW; 3593 break; 3594 } 3595 } 3596 } 3597 3598 /* 3599 * Map a set of physical memory pages into the kernel virtual 3600 * address space. Return a pointer to where it is mapped. This 3601 * routine is intended to be used for mapping device memory, 3602 * NOT real memory. 3603 * 3604 * NOTE: we can't use pgeflag unless we invalidate the pages one at 3605 * a time. 
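 *
 * Illustrative driver-style usage (bar_pa and bar_size are hypothetical):
 *
 *	void *regs = pmap_mapdev(bar_pa, bar_size);
 *	...use the registers...
 *	pmap_unmapdev((vm_offset_t)regs, bar_size);
 *
 * Uncacheable mappings are obtained with pmap_mapdev_uncacheable() instead.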
3606 */ 3607 void * 3608 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 3609 { 3610 vm_offset_t va, tmpva, offset; 3611 pt_entry_t *pte; 3612 3613 offset = pa & PAGE_MASK; 3614 size = roundup(offset + size, PAGE_SIZE); 3615 3616 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3617 if (va == 0) 3618 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3619 3620 pa = pa & ~PAGE_MASK; 3621 for (tmpva = va; size > 0;) { 3622 pte = vtopte(tmpva); 3623 *pte = pa | PG_RW | PG_V; /* | pgeflag; */ 3624 size -= PAGE_SIZE; 3625 tmpva += PAGE_SIZE; 3626 pa += PAGE_SIZE; 3627 } 3628 cpu_invltlb(); 3629 smp_invltlb(); 3630 3631 return ((void *)(va + offset)); 3632 } 3633 3634 void * 3635 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 3636 { 3637 vm_offset_t va, tmpva, offset; 3638 pt_entry_t *pte; 3639 3640 offset = pa & PAGE_MASK; 3641 size = roundup(offset + size, PAGE_SIZE); 3642 3643 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3644 if (va == 0) 3645 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3646 3647 pa = pa & ~PAGE_MASK; 3648 for (tmpva = va; size > 0;) { 3649 pte = vtopte(tmpva); 3650 *pte = pa | PG_RW | PG_V | PG_N; /* | pgeflag; */ 3651 size -= PAGE_SIZE; 3652 tmpva += PAGE_SIZE; 3653 pa += PAGE_SIZE; 3654 } 3655 cpu_invltlb(); 3656 smp_invltlb(); 3657 3658 return ((void *)(va + offset)); 3659 } 3660 3661 void 3662 pmap_unmapdev(vm_offset_t va, vm_size_t size) 3663 { 3664 vm_offset_t base, offset; 3665 3666 base = va & ~PAGE_MASK; 3667 offset = va & PAGE_MASK; 3668 size = roundup(offset + size, PAGE_SIZE); 3669 pmap_qremove(va, size >> PAGE_SHIFT); 3670 kmem_free(&kernel_map, base, size); 3671 } 3672 3673 /* 3674 * perform the pmap work for mincore 3675 */ 3676 int 3677 pmap_mincore(pmap_t pmap, vm_offset_t addr) 3678 { 3679 pt_entry_t *ptep, pte; 3680 vm_page_t m; 3681 int val = 0; 3682 3683 lwkt_gettoken(&vm_token); 3684 ptep = pmap_pte(pmap, addr); 3685 3686 if (ptep && (pte = *ptep) != 0) { 3687 vm_offset_t pa; 3688 3689 val = MINCORE_INCORE; 3690 if ((pte & PG_MANAGED) == 0) 3691 goto done; 3692 3693 pa = pte & PG_FRAME; 3694 3695 m = PHYS_TO_VM_PAGE(pa); 3696 3697 /* 3698 * Modified by us 3699 */ 3700 if (pte & PG_M) 3701 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 3702 /* 3703 * Modified by someone 3704 */ 3705 else if (m->dirty || pmap_is_modified(m)) 3706 val |= MINCORE_MODIFIED_OTHER; 3707 /* 3708 * Referenced by us 3709 */ 3710 if (pte & PG_A) 3711 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 3712 3713 /* 3714 * Referenced by someone 3715 */ 3716 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { 3717 val |= MINCORE_REFERENCED_OTHER; 3718 vm_page_flag_set(m, PG_REFERENCED); 3719 } 3720 } 3721 done: 3722 lwkt_reltoken(&vm_token); 3723 return val; 3724 } 3725 3726 /* 3727 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 3728 * vmspace will be ref'd and the old one will be deref'd. 3729 * 3730 * The vmspace for all lwps associated with the process will be adjusted 3731 * and cr3 will be reloaded if any lwp is the current lwp. 
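 *
 * For example, pmap_replacevm(p, newvm, 1) switches the process to newvm,
 * gaining a reference on newvm and dropping one on the old vmspace.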
3732 */ 3733 void 3734 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 3735 { 3736 struct vmspace *oldvm; 3737 struct lwp *lp; 3738 3739 crit_enter(); 3740 oldvm = p->p_vmspace; 3741 if (oldvm != newvm) { 3742 p->p_vmspace = newvm; 3743 KKASSERT(p->p_nthreads == 1); 3744 lp = RB_ROOT(&p->p_lwp_tree); 3745 pmap_setlwpvm(lp, newvm); 3746 if (adjrefs) { 3747 sysref_get(&newvm->vm_sysref); 3748 sysref_put(&oldvm->vm_sysref); 3749 } 3750 } 3751 crit_exit(); 3752 } 3753 3754 /* 3755 * Set the vmspace for a LWP. The vmspace is almost universally set the 3756 * same as the process vmspace, but virtual kernels need to swap out contexts 3757 * on a per-lwp basis. 3758 */ 3759 void 3760 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 3761 { 3762 struct vmspace *oldvm; 3763 struct pmap *pmap; 3764 3765 crit_enter(); 3766 oldvm = lp->lwp_vmspace; 3767 3768 if (oldvm != newvm) { 3769 lp->lwp_vmspace = newvm; 3770 if (curthread->td_lwp == lp) { 3771 pmap = vmspace_pmap(newvm); 3772 #if defined(SMP) 3773 atomic_set_int(&pmap->pm_active, mycpu->gd_cpumask); 3774 if (pmap->pm_active & CPUMASK_LOCK) 3775 pmap_interlock_wait(newvm); 3776 #else 3777 pmap->pm_active |= 1; 3778 #endif 3779 #if defined(SWTCH_OPTIM_STATS) 3780 tlb_flush_count++; 3781 #endif 3782 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 3783 curthread->td_pcb->pcb_cr3 |= PG_RW | PG_U | PG_V; 3784 load_cr3(curthread->td_pcb->pcb_cr3); 3785 pmap = vmspace_pmap(oldvm); 3786 #if defined(SMP) 3787 atomic_clear_int(&pmap->pm_active, mycpu->gd_cpumask); 3788 #else 3789 pmap->pm_active &= ~1; 3790 #endif 3791 } 3792 } 3793 crit_exit(); 3794 } 3795 3796 #ifdef SMP 3797 3798 /* 3799 * Called when switching to a locked pmap 3800 */ 3801 void 3802 pmap_interlock_wait(struct vmspace *vm) 3803 { 3804 struct pmap *pmap = &vm->vm_pmap; 3805 3806 if (pmap->pm_active & CPUMASK_LOCK) { 3807 while (pmap->pm_active & CPUMASK_LOCK) { 3808 cpu_pause(); 3809 cpu_ccfence(); 3810 lwkt_process_ipiq(); 3811 } 3812 } 3813 } 3814 3815 #endif 3816 3817 vm_offset_t 3818 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 3819 { 3820 3821 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 3822 return addr; 3823 } 3824 3825 addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); 3826 return addr; 3827 } 3828
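
/*
 * Note on pmap_addr_hint() above: for OBJT_DEVICE objects of at least
 * NBPDR (2MB) the hint is rounded up to the next 2MB boundary, presumably
 * so the resulting mapping can be backed by superpages.  For example, a
 * hint of 0x00201000 becomes 0x00400000.
 */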