/*
 * (MPSAFE)
 *
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2003 Peter Wemm
 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 2008, 2009 The DragonFly Project.
 * Copyright (c) 2008, 2009 Jordan Gordeev.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from:   @(#)pmap.c	7.7 (Berkeley)	5/12/91
 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
 */

/*
 * Manages physical address maps.
 *
 * In addition to hardware address maps, this module is called upon to
 * provide software-use-only maps which may or may not be stored in the
 * same form as hardware maps.  These pseudo-maps are used to store
 * intermediate results from copy operations to and from address spaces.
 *
 * Since the information managed by this module is also stored by the
 * logical address mapping module, this module may throw away valid
 * virtual-to-physical mappings at almost any time.  However,
 * invalidations of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which make
 * virtual-to-physical map invalidates expensive, this module may delay
 * invalidate or reduce-protection operations until such time as they
 * are actually necessary.  This module is given full information as to
 * which processors are currently using which maps, and as to when
 * physical maps must be made correct.
 */

#if JG
#include "opt_disable_pse.h"
#include "opt_pmap.h"
#endif
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>

#include <sys/user.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine_base/apic/apicreg.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <ddb/ddb.h>

#define PMAP_KEEP_PDIRS
#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

/*
 * Get PDEs and PTEs for user/kernel address space
 */
static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va);
#define pdir_pde(m, v)	(m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pte)		((*(pd_entry_t *)pte & PG_V) != 0)
#define pmap_pte_w(pte)		((*(pt_entry_t *)pte & PG_W) != 0)
#define pmap_pte_m(pte)		((*(pt_entry_t *)pte & PG_M) != 0)
#define pmap_pte_u(pte)		((*(pt_entry_t *)pte & PG_A) != 0)
#define pmap_pte_v(pte)		((*(pt_entry_t *)pte & PG_V) != 0)

/*
 * Given a map and a machine independent protection code,
 * convert to a vax protection code.
 */
#define pte_prot(m, p)		\
	(protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
static int protection_codes[8];

struct pmap kernel_pmap;
static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);

vm_paddr_t avail_start;		/* PA of first available physical page */
vm_paddr_t avail_end;		/* PA of last available physical page */
vm_offset_t virtual2_start;	/* cutout free area prior to kernel start */
vm_offset_t virtual2_end;
vm_offset_t virtual_start;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t KvaStart;		/* VA start of KVA space */
vm_offset_t KvaEnd;		/* VA end of KVA space (non-inclusive) */
vm_offset_t KvaSize;		/* max size of kernel virtual address space */
static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
static int pgeflag;		/* PG_G or-in */
static int pseflag;		/* PG_PS or-in */

static vm_object_t kptobj;

static int ndmpdp;
static vm_paddr_t dmaplimit;
static int nkpt;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;

static uint64_t KPTbase;
static uint64_t KPTphys;
static uint64_t KPDphys;	/* phys addr of kernel level 2 */
static uint64_t KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
uint64_t KPDPphys;		/* phys addr of kernel level 3 */
uint64_t KPML4phys;		/* phys addr of kernel level 4 */

static uint64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static struct vm_object pvzone_obj;
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0, *ptmmap;
caddr_t CADDR1 = 0, ptvmmap = 0;
static pt_entry_t *msgbufmap;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

extern pt_entry_t *SMPpt;
extern uint64_t SMPptpa;

#define DISABLE_PSE

static pv_entry_t get_pv_entry (void);
static void i386_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
				vm_offset_t sva, pmap_inval_info_t info);
static void pmap_remove_page (struct pmap *pmap,
				vm_offset_t va, pmap_inval_info_t info);
static int pmap_remove_entry (struct pmap *pmap, vm_page_t m,
				vm_offset_t va, pmap_inval_info_t info);
static boolean_t pmap_testbit (vm_page_t m, int bit);
static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
				vm_page_t mpte, vm_page_t m);

static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);

static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va);
static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
				pmap_inval_info_t info);
static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

static unsigned pdir4mb;

/*
 * Move the kernel virtual free pointer to the next 2MB boundary.  This
 * is used to help improve performance by using a large (2MB) page for
 * much of the kernel (.text, .data, .bss).
 */
static
vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return newaddr;
}
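
/*
 * Illustrative example (assuming NBPDR is 2MB, i.e. 0x200000; not part
 * of the code): pmap_kmem_choose() rounds a VA up to the next 2MB
 * boundary so a large page can back it:
 *
 *	pmap_kmem_choose(0xffffffff80345000)
 *	    = (0xffffffff80345000 + 0x1fffff) & ~0x1fffff
 *	    = 0xffffffff80400000
 *
 * Addresses already on a 2MB boundary are returned unchanged.
 */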

/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many
 *	of the pv list scans are across different pmaps and it is very
 *	wasteful to do an entire invltlb when checking a single mapping.
 *
 *	Should only be called while in a critical section.
 */
static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);

static
pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	return pmap_pte(pmap, va);
}

/* Return a non-clipped PD index for a given VA */
static __inline
vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return va >> PDRSHIFT;
}

/* Return various clipped indexes for a given VA */
static __inline
vm_pindex_t
pmap_pte_index(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pde_index(vm_offset_t va)
{
	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{
	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{
	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline
pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline
pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline
pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;

	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return NULL;
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline
pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline
pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return NULL;
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline
pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return NULL;
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

static __inline
pt_entry_t *
vtopte(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}
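
/*
 * Orientation note (illustrative, not part of the code): with 4K pages
 * and 512-entry (9-bit) tables at each level, the clipped index
 * helpers above decompose a VA as
 *
 *	pmap_pml4e_index(va) == (va >> 39) & 0x1ff	(PML4 slot)
 *	pmap_pdpe_index(va)  == (va >> 30) & 0x1ff	(PDP slot)
 *	pmap_pde_index(va)   == (va >> 21) & 0x1ff	(PD slot)
 *	pmap_pte_index(va)   == (va >> 12) & 0x1ff	(PT slot)
 *
 * vtopte() above and vtopde() below take the other route: because the
 * PML4 maps itself at slot PML4PML4I, the PTE/PDE for any VA can be
 * loaded directly through the PTmap/PDmap windows without walking the
 * tree.
 */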

static __inline
pd_entry_t *
vtopde(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
				  NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

static uint64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	uint64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

static
void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i;

	/*
	 * We are running (mostly) V=P at this point
	 *
	 * Calculate NKPT - the number of kernel page tables.  We have to
	 * accommodate preallocation of the vm_page_array, the dump bitmap,
	 * MSGBUF_SIZE, and other stuff.  Be generous.
	 *
	 * Maxmem is in pages.
	 */
	nkpt = (Maxmem * (sizeof(struct vm_page) * 2) + MSGBUF_SIZE) / NBPDR;

	/*
	 * Allocate pages
	 */
	KPTbase = allocpages(firstaddr, nkpt);
	KPTphys = allocpages(firstaddr, nkpt);
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);

	/*
	 * Calculate the page directory base for KERNBASE,
	 * that is where we start populating the page table pages.
	 * Basically this is the end - 2.
	 */
	KPDphys = allocpages(firstaddr, NKPDPE);
	KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);

	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	DMPDPphys = allocpages(firstaddr, NDMPML4E);
	if ((amd_feature & AMDID_PAGE1GB) == 0)
		DMPDphys = allocpages(firstaddr, ndmpdp);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/*
	 * Fill in the underlying page table pages for the area around
	 * KERNBASE.  This remaps low physical memory to KERNBASE.
	 *
	 * Read-only from zero to physfree
	 * XXX not fully used, underneath 2M pages
	 */
	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
		((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTbase)[i] |= PG_RW | PG_V | PG_G;
	}

	/*
	 * Now map the initial kernel page tables.  One block of page
	 * tables is placed at the beginning of kernel virtual memory,
	 * and another block is placed at KERNBASE to map the kernel binary,
	 * data, bss, and initial pre-allocations.
	 */
	for (i = 0; i < nkpt; i++) {
		((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V;
	}
	for (i = 0; i < nkpt; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
	}

	/*
	 * Map from zero to the end of allocations using 2M pages as an
	 * optimization.  This will bypass some of the KPTbase pages
	 * above in the KERNBASE area.
	 */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
		((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V | PG_PS | PG_G;
	}

	/*
	 * And connect up the PD to the PDP.  The kernel pmap is expected
	 * to pre-populate all of its PDs.  See NKPDPE in vmparam.h.
	 */
	for (i = 0; i < NKPDPE; i++) {
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
		    KPDphys + (i << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |=
		    PG_RW | PG_V | PG_U;
	}

	/*
	 * Now set up the direct map space using either 2MB or 1GB pages.
	 * Preset PG_M and PG_A because demotion expects it.
	 */
	if ((amd_feature & AMDID_PAGE1GB) == 0) {
		for (i = 0; i < NPDEPG * ndmpdp; i++) {
			((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
			((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
			    PG_G | PG_M | PG_A;
		}
		/* And the direct map space's PDP */
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
			    (i << PAGE_SHIFT);
			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
		}
	} else {
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] =
			    (vm_paddr_t)i << PDPSHIFT;
			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
			    PG_G | PG_M | PG_A;
		}
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;

	/* Connect the Direct Map slot up to the PML4 */
	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;

	/* Connect the KVA slot up to the PML4 */
	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
}
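
/*
 * Illustrative note (not part of the code): once the direct map set up
 * above is active, any physical address below dmaplimit can be
 * dereferenced through the DMAP window, e.g.
 *
 *	uint64_t *p = (uint64_t *)PHYS_TO_DMAP(pa);
 *
 * The pmap_pml4e_to_pdpe()/pmap_pdpe_to_pde()/pmap_pde_to_pte()
 * helpers rely on exactly this to walk page table pages without
 * creating temporary mappings.
 */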

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On the i386 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	struct mdglobaldata *gd;
	int pg;

	KvaStart = VM_MIN_KERNEL_ADDRESS;
	KvaEnd = VM_MAX_KERNEL_ADDRESS;
	KvaSize = KvaEnd - KvaStart;

	avail_start = *firstaddr;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	virtual2_start = KvaStart;
	virtual2_end = PTOV_OFFSET;

	virtual_start = (vm_offset_t)PTOV_OFFSET + *firstaddr;
	virtual_start = pmap_kmem_choose(virtual_start);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	load_cr3(KPML4phys);

	/*
	 * Initialize protection array.
	 */
	i386_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence (XXX and which no longer exists).
	 */
	kernel_pmap.pm_pml4 = (pdp_entry_t *)(PTOV_OFFSET + KPML4phys);
	kernel_pmap.pm_count = 1;
	kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
	TAILQ_INIT(&kernel_pmap.pm_pvlist);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_start;
	pte = vtopte(va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);

	/*
	 * ptvmmap is used for reading arbitrary physical pages via
	 * /dev/mem.
	 */
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 * XXX msgbufmap is not used.
	 */
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	       atop(round_page(MSGBUF_SIZE)))

	virtual_start = va;

	*CMAP1 = 0;

	/*
	 * PG_G is terribly broken on SMP because we IPI invltlb's in some
	 * cases rather than invlpg.  Actually, I don't even know why it
	 * works under UP because self-referential page table mappings
	 */
#ifdef SMP
	pgeflag = 0;
#else
	if (cpu_feature & CPUID_PGE)
		pgeflag = PG_G;
#endif

	/*
	 * Initialize the 4MB page size flag
	 */
	pseflag = 0;
	/*
	 * The 4MB page version of the initial
	 * kernel page mapping.
	 */
	pdir4mb = 0;

#if !defined(DISABLE_PSE)
	if (cpu_feature & CPUID_PSE) {
		pt_entry_t ptditmp;
		/*
		 * Note that we have enabled PSE mode
		 */
		pseflag = PG_PS;
		ptditmp = *(PTmap + x86_64_btop(KERNBASE));
		ptditmp &= ~(NBPDR - 1);
		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
		pdir4mb = ptditmp;

#ifndef SMP
		/*
		 * Enable the PSE mode.  If we are SMP we can't do this
		 * now because the APs will not be able to use it when
		 * they boot up.
		 */
		load_cr4(rcr4() | CR4_PSE);

		/*
		 * We can do the mapping here for the single processor
		 * case.  We simply ignore the old page table page from
		 * now on.
		 *
		 * For SMP, we still need 4K pages to bootstrap APs,
		 * PSE will be enabled as soon as all APs are up.
		 */
		PTD[KPTDI] = (pd_entry_t)ptditmp;
		cpu_invltlb();
#endif
	}
#endif
#ifdef SMP
	if (cpu_apic_address == 0)
		panic("pmap_bootstrap: no local apic!");
#endif

	/*
	 * We need to finish setting up the globaldata page for the BSP.
	 * locore has already populated the page table for the mdglobaldata
	 * portion.
	 */
	pg = MDGLOBALDATA_BASEALLOC_PAGES;
	gd = &CPU_prvspace[0].mdglobaldata;
	gd->gd_CMAP1 = &SMPpt[pg + 0];
	gd->gd_CMAP2 = &SMPpt[pg + 1];
	gd->gd_CMAP3 = &SMPpt[pg + 2];
	gd->gd_PMAP1 = &SMPpt[pg + 3];
	gd->gd_CADDR1 = CPU_prvspace[0].CPAGE1;
	gd->gd_CADDR2 = CPU_prvspace[0].CPAGE2;
	gd->gd_CADDR3 = CPU_prvspace[0].CPAGE3;
	gd->gd_PADDR1 = (pt_entry_t *)CPU_prvspace[0].PPAGE1;

	cpu_invltlb();
}
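
/*
 * For reference (illustrative hand-expansion, not part of the code),
 * the SYSMAP() invocations above expand as in
 *
 *	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 * ->
 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE);
 *	CMAP1 = pte; pte += (1);
 *
 * i.e. each invocation carves n pages out of the bootstrap VA cursor
 * and records the matching kernel PTE pointer for later stores.
 */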

#ifdef SMP
/*
 * Set 4mb pdir for mp startup
 */
void
pmap_set_opt(void)
{
	if (pseflag && (cpu_feature & CPUID_PSE)) {
		load_cr4(rcr4() | CR4_PSE);
		if (pdir4mb && mycpu->gd_cpuid == 0) {	/* only on BSP */
			cpu_invltlb();
		}
	}
}
#endif

/*
 * Initialize the pmap module, called by vm_init() to initialize any
 * structures that the pmap system needs to map virtual memory.
 * pmap_init has been enhanced to support discontiguous physical
 * memory in a fairly consistent way.
 */
void
pmap_init(void)
{
	int i;
	int initial_pvs;

	/*
	 * object for kernel page table pages
	 */
	/* JG I think the number can be arbitrary */
	kptobj = vm_object_allocate(OBJT_DEFAULT, 5);

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */
	for (i = 0; i < vm_page_array_size; i++) {
		vm_page_t m;

		m = &vm_page_array[i];
		TAILQ_INIT(&m->md.pv_list);
		m->md.pv_list_count = 0;
	}

	/*
	 * init the pv free list
	 */
	initial_pvs = vm_page_array_size;
	if (initial_pvs < MINPV)
		initial_pvs = MINPV;
	pvzone = &pvzone_store;
	pvinit = (struct pv_entry *)kmem_alloc(&kernel_map,
			initial_pvs * sizeof(struct pv_entry));
	zbootinit(pvzone, "PV ENTRY", sizeof(struct pv_entry), pvinit,
		  initial_pvs);

	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
#ifdef SMP
	lapic = pmap_mapdev_uncacheable(cpu_apic_address, sizeof(struct LAPIC));
#endif
}

/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 */
void
pmap_init2(void)
{
	int shpgperproc = PMAP_SHPGPERPROC;

	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);
	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
}


/***************************************************
 * Low level helper routines.....
 ***************************************************/

#if defined(PMAP_DIAGNOSTIC)

/*
 * This code checks for non-writeable/modified pages.
 * This should be an invalid condition.
 */
static
int
pmap_nw_modified(pt_entry_t pte)
{
	if ((pte & (PG_M|PG_RW)) == PG_M)
		return 1;
	else
		return 0;
}
#endif


/*
 * This routine defines the region(s) of memory that should
 * not be tested for the modified bit.
 */
static __inline
int
pmap_track_modified(vm_offset_t va)
{
	if ((va < clean_sva) || (va >= clean_eva))
		return 1;
	else
		return 0;
}

/*
 * Extract the physical page address associated with the map/VA pair.
 *
 * The caller must hold vm_token if non-blocking operation is desired.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde, *pdep;

	lwkt_gettoken(&vm_token);
	rtval = 0;
	pdep = pmap_pde(pmap, va);
	if (pdep != NULL) {
		pde = *pdep;
		if (pde) {
			if ((pde & PG_PS) != 0) {
				rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
			} else {
				pte = pmap_pde_to_pte(pdep, va);
				rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
			}
		}
	}
	lwkt_reltoken(&vm_token);
	return rtval;
}
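
/*
 * Typical use (illustrative sketch, not part of the code):
 *
 *	vm_paddr_t pa = pmap_extract(pmap, va);
 *	if (pa == 0)
 *		(va was not mapped)
 *
 * Note the 0 return is ambiguous if physical page 0 could ever be
 * mapped at va; callers are expected to only probe addresses where
 * that cannot happen.
 */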
879 */ 880 vm_paddr_t 881 pmap_kextract(vm_offset_t va) 882 { 883 pd_entry_t pde; 884 vm_paddr_t pa; 885 886 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 887 pa = DMAP_TO_PHYS(va); 888 } else { 889 pde = *vtopde(va); 890 if (pde & PG_PS) { 891 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 892 } else { 893 /* 894 * Beware of a concurrent promotion that changes the 895 * PDE at this point! For example, vtopte() must not 896 * be used to access the PTE because it would use the 897 * new PDE. It is, however, safe to use the old PDE 898 * because the page table page is preserved by the 899 * promotion. 900 */ 901 pa = *pmap_pde_to_pte(&pde, va); 902 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 903 } 904 } 905 return pa; 906 } 907 908 /*************************************************** 909 * Low level mapping routines..... 910 ***************************************************/ 911 912 /* 913 * Routine: pmap_kenter 914 * Function: 915 * Add a wired page to the KVA 916 * NOTE! note that in order for the mapping to take effect -- you 917 * should do an invltlb after doing the pmap_kenter(). 918 */ 919 void 920 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 921 { 922 pt_entry_t *pte; 923 pt_entry_t npte; 924 pmap_inval_info info; 925 926 pmap_inval_init(&info); 927 npte = pa | PG_RW | PG_V | pgeflag; 928 pte = vtopte(va); 929 pmap_inval_interlock(&info, &kernel_pmap, va); 930 *pte = npte; 931 pmap_inval_deinterlock(&info, &kernel_pmap); 932 pmap_inval_done(&info); 933 } 934 935 /* 936 * Routine: pmap_kenter_quick 937 * Function: 938 * Similar to pmap_kenter(), except we only invalidate the 939 * mapping on the current CPU. 940 */ 941 void 942 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 943 { 944 pt_entry_t *pte; 945 pt_entry_t npte; 946 947 npte = pa | PG_RW | PG_V | pgeflag; 948 pte = vtopte(va); 949 *pte = npte; 950 cpu_invlpg((void *)va); 951 } 952 953 void 954 pmap_kenter_sync(vm_offset_t va) 955 { 956 pmap_inval_info info; 957 958 pmap_inval_init(&info); 959 pmap_inval_interlock(&info, &kernel_pmap, va); 960 pmap_inval_deinterlock(&info, &kernel_pmap); 961 pmap_inval_done(&info); 962 } 963 964 void 965 pmap_kenter_sync_quick(vm_offset_t va) 966 { 967 cpu_invlpg((void *)va); 968 } 969 970 /* 971 * remove a page from the kernel pagetables 972 */ 973 void 974 pmap_kremove(vm_offset_t va) 975 { 976 pt_entry_t *pte; 977 pmap_inval_info info; 978 979 pmap_inval_init(&info); 980 pte = vtopte(va); 981 pmap_inval_interlock(&info, &kernel_pmap, va); 982 *pte = 0; 983 pmap_inval_deinterlock(&info, &kernel_pmap); 984 pmap_inval_done(&info); 985 } 986 987 void 988 pmap_kremove_quick(vm_offset_t va) 989 { 990 pt_entry_t *pte; 991 pte = vtopte(va); 992 *pte = 0; 993 cpu_invlpg((void *)va); 994 } 995 996 /* 997 * XXX these need to be recoded. They are not used in any critical path. 998 */ 999 void 1000 pmap_kmodify_rw(vm_offset_t va) 1001 { 1002 *vtopte(va) |= PG_RW; 1003 cpu_invlpg((void *)va); 1004 } 1005 1006 void 1007 pmap_kmodify_nc(vm_offset_t va) 1008 { 1009 *vtopte(va) |= PG_N; 1010 cpu_invlpg((void *)va); 1011 } 1012 1013 /* 1014 * Used to map a range of physical addresses into kernel virtual 1015 * address space during the low level boot, typically to map the 1016 * dump bitmap, message buffer, and vm_page_array. 1017 * 1018 * These mappings are typically made at some pointer after the end of the 1019 * kernel text+data. 

/*
 * Used to map a range of physical addresses into kernel virtual
 * address space during the low level boot, typically to map the
 * dump bitmap, message buffer, and vm_page_array.
 *
 * These mappings are typically made at some pointer after the end of the
 * kernel text+data.
 *
 * We could return PHYS_TO_DMAP(start) here and not allocate any
 * via (*virtp), but then kmem from userland and kernel dumps won't
 * have access to the related pointers.
 */
vm_offset_t
pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va;
	vm_offset_t va_start;

	/*return PHYS_TO_DMAP(start);*/

	va_start = *virtp;
	va = va_start;

	while (start < end) {
		pmap_kenter_quick(va, start);
		va += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	*virtp = va;
	return va_start;
}


/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The pages *must* be wired.
 */
void
pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
{
	vm_offset_t end_va;

	end_va = va + count * PAGE_SIZE;

	while (va < end_va) {
		pt_entry_t *pte;

		pte = vtopte(va);
		*pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
		cpu_invlpg((void *)va);
		va += PAGE_SIZE;
		m++;
	}
#ifdef SMP
	smp_invltlb();	/* XXX */
#endif
}

/*
 * This routine jerks page mappings from the
 * kernel -- it is meant only for temporary mappings.
 *
 * MPSAFE, INTERRUPT SAFE (cluster callback)
 */
void
pmap_qremove(vm_offset_t va, int count)
{
	vm_offset_t end_va;

	end_va = va + count * PAGE_SIZE;

	while (va < end_va) {
		pt_entry_t *pte;

		pte = vtopte(va);
		*pte = 0;
		cpu_invlpg((void *)va);
		va += PAGE_SIZE;
	}
#ifdef SMP
	smp_invltlb();
#endif
}
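
/*
 * Illustrative pairing (sketch, not part of the code): the qenter/
 * qremove routines above are the bulk, unmanaged analogues of
 * pmap_kenter():
 *
 *	pmap_qenter(va, mv, npages);	(map npages wired pages at va)
 *	(perform I/O or copying through the mapping)
 *	pmap_qremove(va, npages);	(tear the mappings down)
 *
 * No pv entries are created, so the paging system never sees these
 * mappings; the pages must remain wired for the duration.
 */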
1155 ***************************************************/ 1156 1157 /* 1158 * This routine unholds page table pages, and if the hold count 1159 * drops to zero, then it decrements the wire count. 1160 */ 1161 static __inline 1162 int 1163 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 1164 pmap_inval_info_t info) 1165 { 1166 KKASSERT(m->hold_count > 0); 1167 if (m->hold_count > 1) { 1168 vm_page_unhold(m); 1169 return 0; 1170 } else { 1171 return _pmap_unwire_pte_hold(pmap, va, m, info); 1172 } 1173 } 1174 1175 static 1176 int 1177 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 1178 pmap_inval_info_t info) 1179 { 1180 /* 1181 * Wait until we can busy the page ourselves. We cannot have 1182 * any active flushes if we block. We own one hold count on the 1183 * page so it cannot be freed out from under us. 1184 */ 1185 if (m->flags & PG_BUSY) { 1186 pmap_inval_flush(info); 1187 while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) 1188 ; 1189 } 1190 KASSERT(m->queue == PQ_NONE, 1191 ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m)); 1192 1193 /* 1194 * This case can occur if new references were acquired while 1195 * we were blocked. 1196 */ 1197 if (m->hold_count > 1) { 1198 KKASSERT(m->hold_count > 1); 1199 vm_page_unhold(m); 1200 return 0; 1201 } 1202 1203 /* 1204 * Unmap the page table page 1205 */ 1206 KKASSERT(m->hold_count == 1); 1207 vm_page_busy(m); 1208 pmap_inval_interlock(info, pmap, -1); 1209 1210 if (m->pindex >= (NUPDE + NUPDPE)) { 1211 /* PDP page */ 1212 pml4_entry_t *pml4; 1213 pml4 = pmap_pml4e(pmap, va); 1214 *pml4 = 0; 1215 } else if (m->pindex >= NUPDE) { 1216 /* PD page */ 1217 pdp_entry_t *pdp; 1218 pdp = pmap_pdpe(pmap, va); 1219 *pdp = 0; 1220 } else { 1221 /* PT page */ 1222 pd_entry_t *pd; 1223 pd = pmap_pde(pmap, va); 1224 *pd = 0; 1225 } 1226 1227 KKASSERT(pmap->pm_stats.resident_count > 0); 1228 --pmap->pm_stats.resident_count; 1229 1230 if (pmap->pm_ptphint == m) 1231 pmap->pm_ptphint = NULL; 1232 pmap_inval_deinterlock(info, pmap); 1233 1234 if (m->pindex < NUPDE) { 1235 /* We just released a PT, unhold the matching PD */ 1236 vm_page_t pdpg; 1237 1238 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 1239 pmap_unwire_pte_hold(pmap, va, pdpg, info); 1240 } 1241 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 1242 /* We just released a PD, unhold the matching PDP */ 1243 vm_page_t pdppg; 1244 1245 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 1246 pmap_unwire_pte_hold(pmap, va, pdppg, info); 1247 } 1248 1249 /* 1250 * This was our last hold, the page had better be unwired 1251 * after we decrement wire_count. 1252 * 1253 * FUTURE NOTE: shared page directory page could result in 1254 * multiple wire counts. 1255 */ 1256 vm_page_unhold(m); 1257 --m->wire_count; 1258 KKASSERT(m->wire_count == 0); 1259 --vmstats.v_wire_count; 1260 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1261 vm_page_flash(m); 1262 vm_page_free_zero(m); 1263 1264 return 1; 1265 } 1266 1267 /* 1268 * After removing a page table entry, this routine is used to 1269 * conditionally free the page, and manage the hold/wire counts. 
1270 */ 1271 static 1272 int 1273 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte, 1274 pmap_inval_info_t info) 1275 { 1276 vm_pindex_t ptepindex; 1277 1278 if (va >= VM_MAX_USER_ADDRESS) 1279 return 0; 1280 1281 if (mpte == NULL) { 1282 ptepindex = pmap_pde_pindex(va); 1283 #if JGHINT 1284 if (pmap->pm_ptphint && 1285 (pmap->pm_ptphint->pindex == ptepindex)) { 1286 mpte = pmap->pm_ptphint; 1287 } else { 1288 #endif 1289 pmap_inval_flush(info); 1290 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1291 pmap->pm_ptphint = mpte; 1292 #if JGHINT 1293 } 1294 #endif 1295 } 1296 return pmap_unwire_pte_hold(pmap, va, mpte, info); 1297 } 1298 1299 /* 1300 * Initialize pmap0/vmspace0. This pmap is not added to pmap_list because 1301 * it, and IdlePTD, represents the template used to update all other pmaps. 1302 * 1303 * On architectures where the kernel pmap is not integrated into the user 1304 * process pmap, this pmap represents the process pmap, not the kernel pmap. 1305 * kernel_pmap should be used to directly access the kernel_pmap. 1306 */ 1307 void 1308 pmap_pinit0(struct pmap *pmap) 1309 { 1310 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 1311 pmap->pm_count = 1; 1312 pmap->pm_active = 0; 1313 pmap->pm_ptphint = NULL; 1314 TAILQ_INIT(&pmap->pm_pvlist); 1315 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1316 } 1317 1318 /* 1319 * Initialize a preallocated and zeroed pmap structure, 1320 * such as one in a vmspace structure. 1321 */ 1322 void 1323 pmap_pinit(struct pmap *pmap) 1324 { 1325 vm_page_t ptdpg; 1326 1327 /* 1328 * No need to allocate page table space yet but we do need a valid 1329 * page directory table. 1330 */ 1331 if (pmap->pm_pml4 == NULL) { 1332 pmap->pm_pml4 = 1333 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE); 1334 } 1335 1336 /* 1337 * Allocate an object for the ptes 1338 */ 1339 if (pmap->pm_pteobj == NULL) 1340 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + PML4PML4I + 1); 1341 1342 /* 1343 * Allocate the page directory page, unless we already have 1344 * one cached. If we used the cached page the wire_count will 1345 * already be set appropriately. 1346 */ 1347 if ((ptdpg = pmap->pm_pdirm) == NULL) { 1348 ptdpg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I, 1349 VM_ALLOC_NORMAL | VM_ALLOC_RETRY); 1350 pmap->pm_pdirm = ptdpg; 1351 vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); 1352 ptdpg->valid = VM_PAGE_BITS_ALL; 1353 if (ptdpg->wire_count == 0) 1354 ++vmstats.v_wire_count; 1355 ptdpg->wire_count = 1; 1356 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); 1357 } 1358 if ((ptdpg->flags & PG_ZERO) == 0) 1359 bzero(pmap->pm_pml4, PAGE_SIZE); 1360 1361 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; 1362 pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; 1363 1364 /* install self-referential address mapping entry */ 1365 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M; 1366 1367 pmap->pm_count = 1; 1368 pmap->pm_active = 0; 1369 pmap->pm_ptphint = NULL; 1370 TAILQ_INIT(&pmap->pm_pvlist); 1371 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1372 pmap->pm_stats.resident_count = 1; 1373 } 1374 1375 /* 1376 * Clean up a pmap structure so it can be physically freed. This routine 1377 * is called by the vmspace dtor function. A great deal of pmap data is 1378 * left passively mapped to improve vmspace management so we have a bit 1379 * of cleanup work to do here. 
1380 */ 1381 void 1382 pmap_puninit(pmap_t pmap) 1383 { 1384 vm_page_t p; 1385 1386 KKASSERT(pmap->pm_active == 0); 1387 lwkt_gettoken(&vm_token); 1388 if ((p = pmap->pm_pdirm) != NULL) { 1389 KKASSERT(pmap->pm_pml4 != NULL); 1390 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1391 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1392 p->wire_count--; 1393 vmstats.v_wire_count--; 1394 KKASSERT((p->flags & PG_BUSY) == 0); 1395 vm_page_busy(p); 1396 vm_page_free_zero(p); 1397 pmap->pm_pdirm = NULL; 1398 } 1399 if (pmap->pm_pml4) { 1400 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1401 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1402 pmap->pm_pml4 = NULL; 1403 } 1404 if (pmap->pm_pteobj) { 1405 vm_object_deallocate(pmap->pm_pteobj); 1406 pmap->pm_pteobj = NULL; 1407 } 1408 lwkt_reltoken(&vm_token); 1409 } 1410 1411 /* 1412 * Wire in kernel global address entries. To avoid a race condition 1413 * between pmap initialization and pmap_growkernel, this procedure 1414 * adds the pmap to the master list (which growkernel scans to update), 1415 * then copies the template. 1416 */ 1417 void 1418 pmap_pinit2(struct pmap *pmap) 1419 { 1420 crit_enter(); 1421 lwkt_gettoken(&vm_token); 1422 TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode); 1423 /* XXX copies current process, does not fill in MPPTDI */ 1424 lwkt_reltoken(&vm_token); 1425 crit_exit(); 1426 } 1427 1428 /* 1429 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1430 * 0 on failure (if the procedure had to sleep). 1431 * 1432 * When asked to remove the page directory page itself, we actually just 1433 * leave it cached so we do not have to incur the SMP inval overhead of 1434 * removing the kernel mapping. pmap_puninit() will take care of it. 1435 */ 1436 static 1437 int 1438 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1439 { 1440 /* 1441 * This code optimizes the case of freeing non-busy 1442 * page-table pages. Those pages are zero now, and 1443 * might as well be placed directly into the zero queue. 1444 */ 1445 if (vm_page_sleep_busy(p, FALSE, "pmaprl")) 1446 return 0; 1447 1448 vm_page_busy(p); 1449 1450 /* 1451 * Remove the page table page from the processes address space. 1452 */ 1453 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1454 /* 1455 * We are the pml4 table itself. 1456 */ 1457 /* XXX anything to do here? */ 1458 } else if (p->pindex >= (NUPDE + NUPDPE)) { 1459 /* 1460 * Remove a PDP page from the PML4. We do not maintain 1461 * hold counts on the PML4 page. 1462 */ 1463 pml4_entry_t *pml4; 1464 vm_page_t m4; 1465 int idx; 1466 1467 m4 = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I); 1468 KKASSERT(m4 != NULL); 1469 pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1470 idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG; 1471 KKASSERT(pml4[idx] != 0); 1472 pml4[idx] = 0; 1473 } else if (p->pindex >= NUPDE) { 1474 /* 1475 * Remove a PD page from the PDP and drop the hold count 1476 * on the PDP. The PDP is left cached in the pmap if 1477 * the hold count drops to 0 so the wire count remains 1478 * intact. 
1479 */ 1480 vm_page_t m3; 1481 pdp_entry_t *pdp; 1482 int idx; 1483 1484 m3 = vm_page_lookup(pmap->pm_pteobj, 1485 NUPDE + NUPDPE + (p->pindex - NUPDE) / NPDPEPG); 1486 KKASSERT(m3 != NULL); 1487 pdp = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); 1488 idx = (p->pindex - NUPDE) % NPDPEPG; 1489 KKASSERT(pdp[idx] != 0); 1490 pdp[idx] = 0; 1491 m3->hold_count--; 1492 } else { 1493 /* 1494 * Remove a PT page from the PD and drop the hold count 1495 * on the PD. The PD is left cached in the pmap if 1496 * the hold count drops to 0 so the wire count remains 1497 * intact. 1498 */ 1499 vm_page_t m2; 1500 pd_entry_t *pd; 1501 int idx; 1502 1503 m2 = vm_page_lookup(pmap->pm_pteobj, 1504 NUPDE + p->pindex / NPDEPG); 1505 KKASSERT(m2 != NULL); 1506 pd = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); 1507 idx = p->pindex % NPDEPG; 1508 pd[idx] = 0; 1509 m2->hold_count--; 1510 } 1511 1512 /* 1513 * One fewer mappings in the pmap. p's hold count had better 1514 * be zero. 1515 */ 1516 KKASSERT(pmap->pm_stats.resident_count > 0); 1517 --pmap->pm_stats.resident_count; 1518 if (p->hold_count) 1519 panic("pmap_release: freeing held page table page"); 1520 if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) 1521 pmap->pm_ptphint = NULL; 1522 1523 /* 1524 * We leave the top-level page table page cached, wired, and mapped in 1525 * the pmap until the dtor function (pmap_puninit()) gets called. 1526 * However, still clean it up so we can set PG_ZERO. 1527 */ 1528 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1529 bzero(pmap->pm_pml4, PAGE_SIZE); 1530 vm_page_flag_set(p, PG_ZERO); 1531 vm_page_wakeup(p); 1532 } else { 1533 p->wire_count--; 1534 KKASSERT(p->wire_count == 0); 1535 vmstats.v_wire_count--; 1536 /* JG eventually revert to using vm_page_free_zero() */ 1537 vm_page_free(p); 1538 } 1539 return 1; 1540 } 1541 1542 /* 1543 * This routine is called when various levels in the page table need to 1544 * be populated. This routine cannot fail. 1545 */ 1546 static 1547 vm_page_t 1548 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) 1549 { 1550 vm_page_t m; 1551 1552 /* 1553 * Find or fabricate a new pagetable page. This will busy the page. 1554 */ 1555 m = vm_page_grab(pmap->pm_pteobj, ptepindex, 1556 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); 1557 if ((m->flags & PG_ZERO) == 0) { 1558 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 1559 } 1560 1561 KASSERT(m->queue == PQ_NONE, 1562 ("_pmap_allocpte: %p->queue != PQ_NONE", m)); 1563 1564 /* 1565 * Increment the hold count for the page we will be returning to 1566 * the caller. 1567 */ 1568 m->hold_count++; 1569 if (m->wire_count++ == 0) 1570 vmstats.v_wire_count++; 1571 1572 /* 1573 * Map the pagetable page into the process address space, if 1574 * it isn't already there. 1575 * 1576 * It is possible that someone else got in and mapped the page 1577 * directory page while we were blocked, if so just unbusy and 1578 * return the held page. 
1579 */ 1580 if (ptepindex >= (NUPDE + NUPDPE)) { 1581 /* 1582 * Wire up a new PDP page in the PML4 1583 */ 1584 vm_pindex_t pml4index; 1585 pml4_entry_t *pml4; 1586 1587 pml4index = ptepindex - (NUPDE + NUPDPE); 1588 pml4 = &pmap->pm_pml4[pml4index]; 1589 if (*pml4 & PG_V) { 1590 if (--m->wire_count == 0) 1591 --vmstats.v_wire_count; 1592 vm_page_wakeup(m); 1593 return(m); 1594 } 1595 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1596 } else if (ptepindex >= NUPDE) { 1597 /* 1598 * Wire up a new PD page in the PDP 1599 */ 1600 vm_pindex_t pml4index; 1601 vm_pindex_t pdpindex; 1602 vm_page_t pdppg; 1603 pml4_entry_t *pml4; 1604 pdp_entry_t *pdp; 1605 1606 pdpindex = ptepindex - NUPDE; 1607 pml4index = pdpindex >> NPML4EPGSHIFT; 1608 1609 pml4 = &pmap->pm_pml4[pml4index]; 1610 if ((*pml4 & PG_V) == 0) { 1611 /* 1612 * Have to allocate a new PDP page, recurse. 1613 * This always succeeds. Returned page will 1614 * be held. 1615 */ 1616 pdppg = _pmap_allocpte(pmap, 1617 NUPDE + NUPDPE + pml4index); 1618 } else { 1619 /* 1620 * Add a held reference to the PDP page. 1621 */ 1622 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 1623 pdppg->hold_count++; 1624 } 1625 1626 /* 1627 * Now find the pdp_entry and map the PDP. If the PDP 1628 * has already been mapped unwind and return the 1629 * already-mapped PDP held. 1630 * 1631 * pdppg is left held (hold_count is incremented for 1632 * each PD in the PDP). 1633 */ 1634 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1635 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1636 if (*pdp & PG_V) { 1637 vm_page_unhold(pdppg); 1638 if (--m->wire_count == 0) 1639 --vmstats.v_wire_count; 1640 vm_page_wakeup(m); 1641 return(m); 1642 } 1643 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1644 } else { 1645 /* 1646 * Wire up the new PT page in the PD 1647 */ 1648 vm_pindex_t pml4index; 1649 vm_pindex_t pdpindex; 1650 pml4_entry_t *pml4; 1651 pdp_entry_t *pdp; 1652 pd_entry_t *pd; 1653 vm_page_t pdpg; 1654 1655 pdpindex = ptepindex >> NPDPEPGSHIFT; 1656 pml4index = pdpindex >> NPML4EPGSHIFT; 1657 1658 /* 1659 * Locate the PDP page in the PML4, then the PD page in 1660 * the PDP. If either does not exist we simply recurse 1661 * to allocate them. 1662 * 1663 * We can just recurse on the PD page as it will recurse 1664 * on the PDP if necessary. 1665 */ 1666 pml4 = &pmap->pm_pml4[pml4index]; 1667 if ((*pml4 & PG_V) == 0) { 1668 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex); 1669 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1670 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1671 } else { 1672 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1673 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1674 if ((*pdp & PG_V) == 0) { 1675 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex); 1676 } else { 1677 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 1678 pdpg->hold_count++; 1679 } 1680 } 1681 1682 /* 1683 * Now fill in the pte in the PD. If the pte already exists 1684 * (again, if we raced the grab), unhold pdpg and unwire 1685 * m, returning a held m. 1686 * 1687 * pdpg is left held (hold_count is incremented for 1688 * each PT in the PD). 
1689 */ 1690 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 1691 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 1692 if (*pd != 0) { 1693 vm_page_unhold(pdpg); 1694 if (--m->wire_count == 0) 1695 --vmstats.v_wire_count; 1696 vm_page_wakeup(m); 1697 return(m); 1698 } 1699 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1700 } 1701 1702 /* 1703 * We successfully loaded a PDP, PD, or PTE. Set the page table hint, 1704 * valid bits, mapped flag, unbusy, and we're done. 1705 */ 1706 pmap->pm_ptphint = m; 1707 ++pmap->pm_stats.resident_count; 1708 1709 m->valid = VM_PAGE_BITS_ALL; 1710 vm_page_flag_clear(m, PG_ZERO); 1711 vm_page_flag_set(m, PG_MAPPED); 1712 vm_page_wakeup(m); 1713 1714 return (m); 1715 } 1716 1717 static 1718 vm_page_t 1719 pmap_allocpte(pmap_t pmap, vm_offset_t va) 1720 { 1721 vm_pindex_t ptepindex; 1722 pd_entry_t *pd; 1723 vm_page_t m; 1724 1725 /* 1726 * Calculate pagetable page index 1727 */ 1728 ptepindex = pmap_pde_pindex(va); 1729 1730 /* 1731 * Get the page directory entry 1732 */ 1733 pd = pmap_pde(pmap, va); 1734 1735 /* 1736 * This supports switching from a 2MB page to a 1737 * normal 4K page. 1738 */ 1739 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 1740 panic("no promotion/demotion yet"); 1741 *pd = 0; 1742 pd = NULL; 1743 cpu_invltlb(); 1744 smp_invltlb(); 1745 } 1746 1747 /* 1748 * If the page table page is mapped, we just increment the 1749 * hold count, and activate it. 1750 */ 1751 if (pd != NULL && (*pd & PG_V) != 0) { 1752 /* YYY hint is used here on i386 */ 1753 m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); 1754 pmap->pm_ptphint = m; 1755 m->hold_count++; 1756 return m; 1757 } 1758 /* 1759 * Here if the pte page isn't mapped, or if it has been deallocated. 1760 */ 1761 return _pmap_allocpte(pmap, ptepindex); 1762 } 1763 1764 1765 /*************************************************** 1766 * Pmap allocation/deallocation routines. 1767 ***************************************************/ 1768 1769 /* 1770 * Release any resources held by the given physical map. 1771 * Called when a pmap initialized by pmap_pinit is being released. 1772 * Should only be called if the map contains no valid mappings. 1773 */ 1774 static int pmap_release_callback(struct vm_page *p, void *data); 1775 1776 void 1777 pmap_release(struct pmap *pmap) 1778 { 1779 vm_object_t object = pmap->pm_pteobj; 1780 struct rb_vm_page_scan_info info; 1781 1782 KASSERT(pmap->pm_active == 0, ("pmap still active! 

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
static int pmap_release_callback(struct vm_page *p, void *data);

void
pmap_release(struct pmap *pmap)
{
	vm_object_t object = pmap->pm_pteobj;
	struct rb_vm_page_scan_info info;

	KASSERT(pmap->pm_active == 0,
		("pmap still active! %08x", pmap->pm_active));
#if defined(DIAGNOSTIC)
	if (object->ref_count != 1)
		panic("pmap_release: pteobj reference count != 1");
#endif

	info.pmap = pmap;
	info.object = object;
	crit_enter();
	lwkt_gettoken(&vm_token);
	TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
	crit_exit();

	do {
		crit_enter();
		info.error = 0;
		info.mpte = NULL;
		info.limit = object->generation;

		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					pmap_release_callback, &info);
		if (info.error == 0 && info.mpte) {
			if (!pmap_release_free_page(pmap, info.mpte))
				info.error = 1;
		}
		crit_exit();
	} while (info.error);
	lwkt_reltoken(&vm_token);
}

static
int
pmap_release_callback(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
		info->mpte = p;
		return(0);
	}
	if (!pmap_release_free_page(info->pmap, p)) {
		info->error = 1;
		return(-1);
	}
	if (info->object->generation != info->limit) {
		info->error = 1;
		return(-1);
	}
	return(0);
}

/*
 * Grow the number of kernel page table entries, if needed.
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_offset_t ptppaddr;
	vm_page_t nkpg;
	pd_entry_t *pde, newpdir;
	pdp_entry_t newpdp;

	crit_enter();
	lwkt_gettoken(&vm_token);
	if (kernel_vm_end == 0) {
		kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
		nkpt = 0;
		while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & PG_V) != 0) {
			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
					~(PAGE_SIZE * NPTEPG - 1);
			nkpt++;
			if (kernel_vm_end - 1 >= kernel_map.max_offset) {
				kernel_vm_end = kernel_map.max_offset;
				break;
			}
		}
	}
	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
	if (addr - 1 >= kernel_map.max_offset)
		addr = kernel_map.max_offset;
	while (kernel_vm_end < addr) {
		pde = pmap_pde(&kernel_pmap, kernel_vm_end);
		if (pde == NULL) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc(kptobj, nkpt,
					     VM_ALLOC_NORMAL |
					     VM_ALLOC_SYSTEM |
					     VM_ALLOC_INTERRUPT);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			paddr = VM_PAGE_TO_PHYS(nkpg);
			if ((nkpg->flags & PG_ZERO) == 0)
				pmap_zero_page(paddr);
			vm_page_flag_clear(nkpg, PG_ZERO);
			newpdp = (pdp_entry_t)
				 (paddr | PG_V | PG_RW | PG_A | PG_M);
			*pmap_pdpe(&kernel_pmap, kernel_vm_end) = newpdp;
			nkpt++;
			continue; /* try again */
		}
		if ((*pde & PG_V) != 0) {
			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
					~(PAGE_SIZE * NPTEPG - 1);
			if (kernel_vm_end - 1 >= kernel_map.max_offset) {
				kernel_vm_end = kernel_map.max_offset;
				break;
			}
			continue;
		}

		/*
		 * This index is bogus, but out of the way
		 */
		nkpg = vm_page_alloc(kptobj, nkpt,
				     VM_ALLOC_NORMAL |
				     VM_ALLOC_SYSTEM |
				     VM_ALLOC_INTERRUPT);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");

		vm_page_wire(nkpg);
		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
		pmap_zero_page(ptppaddr);
		vm_page_flag_clear(nkpg, PG_ZERO);
		newpdir = (pd_entry_t)(ptppaddr | PG_V | PG_RW | PG_A | PG_M);
		*pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir;
		nkpt++;

		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
				~(PAGE_SIZE * NPTEPG - 1);
		if (kernel_vm_end - 1 >= kernel_map.max_offset) {
			kernel_vm_end = kernel_map.max_offset;
			break;
		}
	}
	lwkt_reltoken(&vm_token);
	crit_exit();
}

/*
 * Retire the given physical map from service.
 * Should only be called if the map contains
 * no valid mappings.
 */
void
pmap_destroy(pmap_t pmap)
{
	int count;

	if (pmap == NULL)
		return;

	lwkt_gettoken(&vm_token);
	count = --pmap->pm_count;
	if (count == 0) {
		pmap_release(pmap);
		panic("destroying a pmap is not yet implemented");
	}
	lwkt_reltoken(&vm_token);
}

/*
 * Add a reference to the specified pmap.
 */
void
pmap_reference(pmap_t pmap)
{
	if (pmap != NULL) {
		lwkt_gettoken(&vm_token);
		pmap->pm_count++;
		lwkt_reltoken(&vm_token);
	}
}

/***************************************************
 * page management routines.
 ***************************************************/

/*
 * Free the pv_entry back to the free list.  This function may be
 * called from an interrupt.
 */
static __inline
void
free_pv_entry(pv_entry_t pv)
{
	pv_entry_count--;
	KKASSERT(pv_entry_count >= 0);
	zfree(pvzone, pv);
}

/*
 * Get a new pv_entry, allocating a block from the system
 * when needed.  This function may be called from an interrupt.
 */
static
pv_entry_t
get_pv_entry(void)
{
	pv_entry_count++;
	if (pv_entry_high_water &&
	    (pv_entry_count > pv_entry_high_water) &&
	    (pmap_pagedaemon_waken == 0)) {
		pmap_pagedaemon_waken = 1;
		wakeup(&vm_pages_needed);
	}
	return zalloc(pvzone);
}

/*
 * This routine is very drastic, but can save the system
 * in a pinch.
 */
void
pmap_collect(void)
{
	int i;
	vm_page_t m;
	static int warningdone = 0;

	if (pmap_pagedaemon_waken == 0)
		return;
	lwkt_gettoken(&vm_token);
	if (warningdone < 5) {
		kprintf("pmap_collect: collecting pv entries -- "
			"suggest increasing PMAP_SHPGPERPROC\n");
		warningdone++;
	}

	for (i = 0; i < vm_page_array_size; i++) {
		m = &vm_page_array[i];
		if (m->wire_count || m->hold_count || m->busy ||
		    (m->flags & PG_BUSY))
			continue;
		pmap_remove_all(m);
	}
	pmap_pagedaemon_waken = 0;
	lwkt_reltoken(&vm_token);
}

/*
 * If it is the first entry on the list, it is actually
 * in the header and we must copy the following entry up
 * to the header.  Otherwise we must search the list for
 * the entry.  In either case we free the now unused entry.
 */
static
int
pmap_remove_entry(struct pmap *pmap, vm_page_t m,
		  vm_offset_t va, pmap_inval_info_t info)
{
	pv_entry_t pv;
	int rtval;

	crit_enter();
	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
			if (pmap == pv->pv_pmap && va == pv->pv_va)
				break;
		}
	} else {
		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
			if (va == pv->pv_va)
				break;
		}
	}

	rtval = 0;
	KKASSERT(pv);

	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	m->md.pv_list_count--;
	KKASSERT(m->md.pv_list_count >= 0);
	if (TAILQ_EMPTY(&m->md.pv_list))
		vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
	++pmap->pm_generation;
	rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info);
	free_pv_entry(pv);

	crit_exit();
	return rtval;
}

/*
 * Create a pv entry for page at pa for (pmap, va).
 */
static
void
pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
{
	pv_entry_t pv;

	crit_enter();
	pv = get_pv_entry();
	pv->pv_va = va;
	pv->pv_pmap = pmap;
	pv->pv_ptem = mpte;

	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	++pmap->pm_generation;
	m->md.pv_list_count++;

	crit_exit();
}

/*
 * pmap_remove_pte: do the things to unmap a page in a process
 */
static
int
pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
		pmap_inval_info_t info)
{
	pt_entry_t oldpte;
	vm_page_t m;

	pmap_inval_interlock(info, pmap, va);
	oldpte = pte_load_clear(ptq);
	pmap_inval_deinterlock(info, pmap);
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;

	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.  XXX PG_G is disabled for SMP so don't worry about
	 * the SMP case.
	 */
	if (oldpte & PG_G)
		cpu_invlpg((void *)va);
	KKASSERT(pmap->pm_stats.resident_count > 0);
	--pmap->pm_stats.resident_count;
	if (oldpte & PG_MANAGED) {
		m = PHYS_TO_VM_PAGE(oldpte);
		if (oldpte & PG_M) {
#if defined(PMAP_DIAGNOSTIC)
			if (pmap_nw_modified((pt_entry_t)oldpte)) {
				kprintf("pmap_remove: modified page not "
					"writable: va: 0x%lx, pte: 0x%lx\n",
					va, oldpte);
			}
#endif
			if (pmap_track_modified(va))
				vm_page_dirty(m);
		}
		if (oldpte & PG_A)
			vm_page_flag_set(m, PG_REFERENCED);
		return pmap_remove_entry(pmap, m, va, info);
	}
	return pmap_unuse_pt(pmap, va, NULL, info);
}

/*
 * pmap_remove_page:
 *
 *	Remove a single page from a process address space.
 *
 *	This function may not be called from an interrupt if the pmap is
 *	not kernel_pmap.
 */
static
void
pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
{
	pt_entry_t *pte;

	pte = pmap_pte(pmap, va);
	if (pte == NULL)
		return;
	if ((*pte & PG_V) == 0)
		return;
	pmap_remove_pte(pmap, pte, va, info);
}
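
/*
 * Illustrative note (an editorial assumption, not from the original
 * source): a single-page unmap ultimately funnels through
 * pmap_remove_pte(), which is where the hardware PG_M/PG_A bits are
 * folded back into the vm_page_t before the pv entry is freed:
 *
 *	pmap_remove(pmap, va, va + PAGE_SIZE)
 *	  -> pmap_remove_page() -> pmap_remove_pte()
 *	       -> vm_page_dirty(m) if PG_M was set and the va is tracked
 */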
2162 * 2163 * This function may not be called from an interrupt if the pmap is 2164 * not kernel_pmap. 2165 */ 2166 void 2167 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 2168 { 2169 vm_offset_t va_next; 2170 pml4_entry_t *pml4e; 2171 pdp_entry_t *pdpe; 2172 pd_entry_t ptpaddr, *pde; 2173 pt_entry_t *pte; 2174 struct pmap_inval_info info; 2175 2176 if (pmap == NULL) 2177 return; 2178 2179 lwkt_gettoken(&vm_token); 2180 if (pmap->pm_stats.resident_count == 0) { 2181 lwkt_reltoken(&vm_token); 2182 return; 2183 } 2184 2185 pmap_inval_init(&info); 2186 2187 /* 2188 * special handling of removing one page. a very 2189 * common operation and easy to short circuit some 2190 * code. 2191 */ 2192 if (sva + PAGE_SIZE == eva) { 2193 pde = pmap_pde(pmap, sva); 2194 if (pde && (*pde & PG_PS) == 0) { 2195 pmap_remove_page(pmap, sva, &info); 2196 pmap_inval_done(&info); 2197 lwkt_reltoken(&vm_token); 2198 return; 2199 } 2200 } 2201 2202 for (; sva < eva; sva = va_next) { 2203 pml4e = pmap_pml4e(pmap, sva); 2204 if ((*pml4e & PG_V) == 0) { 2205 va_next = (sva + NBPML4) & ~PML4MASK; 2206 if (va_next < sva) 2207 va_next = eva; 2208 continue; 2209 } 2210 2211 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2212 if ((*pdpe & PG_V) == 0) { 2213 va_next = (sva + NBPDP) & ~PDPMASK; 2214 if (va_next < sva) 2215 va_next = eva; 2216 continue; 2217 } 2218 2219 /* 2220 * Calculate index for next page table. 2221 */ 2222 va_next = (sva + NBPDR) & ~PDRMASK; 2223 if (va_next < sva) 2224 va_next = eva; 2225 2226 pde = pmap_pdpe_to_pde(pdpe, sva); 2227 ptpaddr = *pde; 2228 2229 /* 2230 * Weed out invalid mappings. 2231 */ 2232 if (ptpaddr == 0) 2233 continue; 2234 2235 /* 2236 * Check for large page. 2237 */ 2238 if ((ptpaddr & PG_PS) != 0) { 2239 /* JG FreeBSD has more complex treatment here */ 2240 pmap_inval_interlock(&info, pmap, -1); 2241 *pde = 0; 2242 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2243 pmap_inval_deinterlock(&info, pmap); 2244 continue; 2245 } 2246 2247 /* 2248 * Limit our scan to either the end of the va represented 2249 * by the current page table page, or to the end of the 2250 * range being removed. 2251 */ 2252 if (va_next > eva) 2253 va_next = eva; 2254 2255 /* 2256 * NOTE: pmap_remove_pte() can block. 2257 */ 2258 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2259 sva += PAGE_SIZE) { 2260 if (*pte == 0) 2261 continue; 2262 if (pmap_remove_pte(pmap, pte, sva, &info)) 2263 break; 2264 } 2265 } 2266 pmap_inval_done(&info); 2267 lwkt_reltoken(&vm_token); 2268 } 2269 2270 /* 2271 * pmap_remove_all: 2272 * 2273 * Removes this physical page from all physical maps in which it resides. 2274 * Reflects back modify bits to the pager. 2275 * 2276 * This routine may not be called from an interrupt. 
2277 */ 2278 2279 static 2280 void 2281 pmap_remove_all(vm_page_t m) 2282 { 2283 struct pmap_inval_info info; 2284 pt_entry_t *pte, tpte; 2285 pv_entry_t pv; 2286 2287 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2288 return; 2289 2290 lwkt_gettoken(&vm_token); 2291 pmap_inval_init(&info); 2292 crit_enter(); 2293 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2294 KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0); 2295 --pv->pv_pmap->pm_stats.resident_count; 2296 2297 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 2298 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); 2299 tpte = pte_load_clear(pte); 2300 if (tpte & PG_W) 2301 pv->pv_pmap->pm_stats.wired_count--; 2302 pmap_inval_deinterlock(&info, pv->pv_pmap); 2303 if (tpte & PG_A) 2304 vm_page_flag_set(m, PG_REFERENCED); 2305 2306 /* 2307 * Update the vm_page_t clean and reference bits. 2308 */ 2309 if (tpte & PG_M) { 2310 #if defined(PMAP_DIAGNOSTIC) 2311 if (pmap_nw_modified(tpte)) { 2312 kprintf( 2313 "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2314 pv->pv_va, tpte); 2315 } 2316 #endif 2317 if (pmap_track_modified(pv->pv_va)) 2318 vm_page_dirty(m); 2319 } 2320 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2321 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); 2322 ++pv->pv_pmap->pm_generation; 2323 m->md.pv_list_count--; 2324 KKASSERT(m->md.pv_list_count >= 0); 2325 if (TAILQ_EMPTY(&m->md.pv_list)) 2326 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2327 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info); 2328 free_pv_entry(pv); 2329 } 2330 crit_exit(); 2331 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 2332 pmap_inval_done(&info); 2333 lwkt_reltoken(&vm_token); 2334 } 2335 2336 /* 2337 * pmap_protect: 2338 * 2339 * Set the physical protection on the specified range of this map 2340 * as requested. 2341 * 2342 * This function may not be called from an interrupt if the map is 2343 * not the kernel_pmap. 2344 */ 2345 void 2346 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2347 { 2348 vm_offset_t va_next; 2349 pml4_entry_t *pml4e; 2350 pdp_entry_t *pdpe; 2351 pd_entry_t ptpaddr, *pde; 2352 pt_entry_t *pte; 2353 pmap_inval_info info; 2354 2355 /* JG review for NX */ 2356 2357 if (pmap == NULL) 2358 return; 2359 2360 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2361 pmap_remove(pmap, sva, eva); 2362 return; 2363 } 2364 2365 if (prot & VM_PROT_WRITE) 2366 return; 2367 2368 lwkt_gettoken(&vm_token); 2369 pmap_inval_init(&info); 2370 2371 for (; sva < eva; sva = va_next) { 2372 2373 pml4e = pmap_pml4e(pmap, sva); 2374 if ((*pml4e & PG_V) == 0) { 2375 va_next = (sva + NBPML4) & ~PML4MASK; 2376 if (va_next < sva) 2377 va_next = eva; 2378 continue; 2379 } 2380 2381 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2382 if ((*pdpe & PG_V) == 0) { 2383 va_next = (sva + NBPDP) & ~PDPMASK; 2384 if (va_next < sva) 2385 va_next = eva; 2386 continue; 2387 } 2388 2389 va_next = (sva + NBPDR) & ~PDRMASK; 2390 if (va_next < sva) 2391 va_next = eva; 2392 2393 pde = pmap_pdpe_to_pde(pdpe, sva); 2394 ptpaddr = *pde; 2395 2396 /* 2397 * Check for large page. 2398 */ 2399 if ((ptpaddr & PG_PS) != 0) { 2400 pmap_inval_interlock(&info, pmap, -1); 2401 *pde &= ~(PG_M|PG_RW); 2402 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2403 pmap_inval_deinterlock(&info, pmap); 2404 continue; 2405 } 2406 2407 /* 2408 * Weed out invalid mappings. Note: we assume that the page 2409 * directory table is always allocated, and in kernel virtual. 
2410 */ 2411 if (ptpaddr == 0) 2412 continue; 2413 2414 if (va_next > eva) 2415 va_next = eva; 2416 2417 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2418 sva += PAGE_SIZE) { 2419 pt_entry_t pbits; 2420 pt_entry_t cbits; 2421 vm_page_t m; 2422 2423 /* 2424 * XXX non-optimal. Note also that there can be 2425 * no pmap_inval_flush() calls until after we modify 2426 * ptbase[sindex] (or otherwise we have to do another 2427 * pmap_inval_add() call). 2428 */ 2429 pmap_inval_interlock(&info, pmap, sva); 2430 again: 2431 pbits = *pte; 2432 cbits = pbits; 2433 if ((pbits & PG_V) == 0) { 2434 pmap_inval_deinterlock(&info, pmap); 2435 continue; 2436 } 2437 if (pbits & PG_MANAGED) { 2438 m = NULL; 2439 if (pbits & PG_A) { 2440 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2441 vm_page_flag_set(m, PG_REFERENCED); 2442 cbits &= ~PG_A; 2443 } 2444 if (pbits & PG_M) { 2445 if (pmap_track_modified(sva)) { 2446 if (m == NULL) 2447 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2448 vm_page_dirty(m); 2449 cbits &= ~PG_M; 2450 } 2451 } 2452 } 2453 cbits &= ~PG_RW; 2454 if (pbits != cbits && 2455 !atomic_cmpset_long(pte, pbits, cbits)) { 2456 goto again; 2457 } 2458 pmap_inval_deinterlock(&info, pmap); 2459 } 2460 } 2461 pmap_inval_done(&info); 2462 lwkt_reltoken(&vm_token); 2463 } 2464 2465 /* 2466 * Insert the given physical page (p) at 2467 * the specified virtual address (v) in the 2468 * target physical map with the protection requested. 2469 * 2470 * If specified, the page will be wired down, meaning 2471 * that the related pte can not be reclaimed. 2472 * 2473 * NB: This is the only routine which MAY NOT lazy-evaluate 2474 * or lose information. That is, this routine must actually 2475 * insert this page into the given map NOW. 2476 */ 2477 void 2478 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2479 boolean_t wired) 2480 { 2481 vm_paddr_t pa; 2482 pd_entry_t *pde; 2483 pt_entry_t *pte; 2484 vm_paddr_t opa; 2485 pt_entry_t origpte, newpte; 2486 vm_page_t mpte; 2487 pmap_inval_info info; 2488 2489 if (pmap == NULL) 2490 return; 2491 2492 va = trunc_page(va); 2493 #ifdef PMAP_DIAGNOSTIC 2494 if (va >= KvaEnd) 2495 panic("pmap_enter: toobig"); 2496 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 2497 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); 2498 #endif 2499 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 2500 kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n"); 2501 #ifdef DDB 2502 db_print_backtrace(); 2503 #endif 2504 } 2505 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 2506 kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n"); 2507 #ifdef DDB 2508 db_print_backtrace(); 2509 #endif 2510 } 2511 2512 lwkt_gettoken(&vm_token); 2513 2514 /* 2515 * In the case that a page table page is not 2516 * resident, we are creating it here. 2517 */ 2518 if (va < VM_MAX_USER_ADDRESS) 2519 mpte = pmap_allocpte(pmap, va); 2520 else 2521 mpte = NULL; 2522 2523 pmap_inval_init(&info); 2524 pde = pmap_pde(pmap, va); 2525 if (pde != NULL && (*pde & PG_V) != 0) { 2526 if ((*pde & PG_PS) != 0) 2527 panic("pmap_enter: attempted pmap_enter on 2MB page"); 2528 pte = pmap_pde_to_pte(pde, va); 2529 } else 2530 panic("pmap_enter: invalid page directory va=%#lx", va); 2531 2532 KKASSERT(pte != NULL); 2533 pa = VM_PAGE_TO_PHYS(m); 2534 origpte = *pte; 2535 opa = origpte & PG_FRAME; 2536 2537 /* 2538 * Mapping has not changed, must be protection or wiring change. 
2539 */ 2540 if (origpte && (opa == pa)) { 2541 /* 2542 * Wiring change, just update stats. We don't worry about 2543 * wiring PT pages as they remain resident as long as there 2544 * are valid mappings in them. Hence, if a user page is wired, 2545 * the PT page will be also. 2546 */ 2547 if (wired && ((origpte & PG_W) == 0)) 2548 pmap->pm_stats.wired_count++; 2549 else if (!wired && (origpte & PG_W)) 2550 pmap->pm_stats.wired_count--; 2551 2552 #if defined(PMAP_DIAGNOSTIC) 2553 if (pmap_nw_modified(origpte)) { 2554 kprintf( 2555 "pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2556 va, origpte); 2557 } 2558 #endif 2559 2560 /* 2561 * Remove the extra pte reference. Note that we cannot 2562 * optimize the RO->RW case because we have adjusted the 2563 * wiring count above and may need to adjust the wiring 2564 * bits below. 2565 */ 2566 if (mpte) 2567 mpte->hold_count--; 2568 2569 /* 2570 * We might be turning off write access to the page, 2571 * so we go ahead and sense modify status. 2572 */ 2573 if (origpte & PG_MANAGED) { 2574 if ((origpte & PG_M) && pmap_track_modified(va)) { 2575 vm_page_t om; 2576 om = PHYS_TO_VM_PAGE(opa); 2577 vm_page_dirty(om); 2578 } 2579 pa |= PG_MANAGED; 2580 KKASSERT(m->flags & PG_MAPPED); 2581 } 2582 goto validate; 2583 } 2584 /* 2585 * Mapping has changed, invalidate old range and fall through to 2586 * handle validating new mapping. 2587 */ 2588 while (opa) { 2589 int err; 2590 err = pmap_remove_pte(pmap, pte, va, &info); 2591 if (err) 2592 panic("pmap_enter: pte vanished, va: 0x%lx", va); 2593 origpte = *pte; 2594 opa = origpte & PG_FRAME; 2595 if (opa) { 2596 kprintf("pmap_enter: Warning, raced pmap %p va %p\n", 2597 pmap, (void *)va); 2598 } 2599 } 2600 2601 /* 2602 * Enter on the PV list if part of our managed memory. Note that we 2603 * raise IPL while manipulating pv_table since pmap_enter can be 2604 * called at interrupt time. 2605 */ 2606 if (pmap_initialized && 2607 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 2608 pmap_insert_entry(pmap, va, mpte, m); 2609 pa |= PG_MANAGED; 2610 vm_page_flag_set(m, PG_MAPPED); 2611 } 2612 2613 /* 2614 * Increment counters 2615 */ 2616 ++pmap->pm_stats.resident_count; 2617 if (wired) 2618 pmap->pm_stats.wired_count++; 2619 2620 validate: 2621 /* 2622 * Now validate mapping with desired protection/wiring. 2623 */ 2624 newpte = (pt_entry_t) (pa | pte_prot(pmap, prot) | PG_V); 2625 2626 if (wired) 2627 newpte |= PG_W; 2628 if (va < VM_MAX_USER_ADDRESS) 2629 newpte |= PG_U; 2630 if (pmap == &kernel_pmap) 2631 newpte |= pgeflag; 2632 2633 /* 2634 * if the mapping or permission bits are different, we need 2635 * to update the pte. 2636 */ 2637 if ((origpte & ~(PG_M|PG_A)) != newpte) { 2638 pmap_inval_interlock(&info, pmap, va); 2639 *pte = newpte | PG_A; 2640 pmap_inval_deinterlock(&info, pmap); 2641 if (newpte & PG_RW) 2642 vm_page_flag_set(m, PG_WRITEABLE); 2643 } 2644 KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED)); 2645 pmap_inval_done(&info); 2646 lwkt_reltoken(&vm_token); 2647 } 2648 2649 /* 2650 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 2651 * This code also assumes that the pmap has no pre-existing entry for this 2652 * VA. 2653 * 2654 * This code currently may only be used on user pmaps, not kernel_pmap. 
2655 */ 2656 void 2657 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 2658 { 2659 pt_entry_t *pte; 2660 vm_paddr_t pa; 2661 vm_page_t mpte; 2662 vm_pindex_t ptepindex; 2663 pd_entry_t *ptepa; 2664 pmap_inval_info info; 2665 2666 lwkt_gettoken(&vm_token); 2667 pmap_inval_init(&info); 2668 2669 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 2670 kprintf("Warning: pmap_enter_quick called on UVA with kernel_pmap\n"); 2671 #ifdef DDB 2672 db_print_backtrace(); 2673 #endif 2674 } 2675 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 2676 kprintf("Warning: pmap_enter_quick called on KVA without kernel_pmap\n"); 2677 #ifdef DDB 2678 db_print_backtrace(); 2679 #endif 2680 } 2681 2682 KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */ 2683 2684 /* 2685 * Calculate the page table page (mpte), allocating it if necessary. 2686 * 2687 * A held page table page (mpte), or NULL, is passed onto the 2688 * section following. 2689 */ 2690 if (va < VM_MAX_USER_ADDRESS) { 2691 /* 2692 * Calculate pagetable page index 2693 */ 2694 ptepindex = pmap_pde_pindex(va); 2695 2696 do { 2697 /* 2698 * Get the page directory entry 2699 */ 2700 ptepa = pmap_pde(pmap, va); 2701 2702 /* 2703 * If the page table page is mapped, we just increment 2704 * the hold count, and activate it. 2705 */ 2706 if (ptepa && (*ptepa & PG_V) != 0) { 2707 if (*ptepa & PG_PS) 2708 panic("pmap_enter_quick: unexpected mapping into 2MB page"); 2709 // if (pmap->pm_ptphint && 2710 // (pmap->pm_ptphint->pindex == ptepindex)) { 2711 // mpte = pmap->pm_ptphint; 2712 // } else { 2713 mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); 2714 pmap->pm_ptphint = mpte; 2715 // } 2716 if (mpte) 2717 mpte->hold_count++; 2718 } else { 2719 mpte = _pmap_allocpte(pmap, ptepindex); 2720 } 2721 } while (mpte == NULL); 2722 } else { 2723 mpte = NULL; 2724 /* this code path is not yet used */ 2725 } 2726 2727 /* 2728 * With a valid (and held) page directory page, we can just use 2729 * vtopte() to get to the pte. If the pte is already present 2730 * we do not disturb it. 2731 */ 2732 pte = vtopte(va); 2733 if (*pte & PG_V) { 2734 if (mpte) 2735 pmap_unwire_pte_hold(pmap, va, mpte, &info); 2736 pa = VM_PAGE_TO_PHYS(m); 2737 KKASSERT(((*pte ^ pa) & PG_FRAME) == 0); 2738 pmap_inval_done(&info); 2739 lwkt_reltoken(&vm_token); 2740 return; 2741 } 2742 2743 /* 2744 * Enter on the PV list if part of our managed memory 2745 */ 2746 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 2747 pmap_insert_entry(pmap, va, mpte, m); 2748 vm_page_flag_set(m, PG_MAPPED); 2749 } 2750 2751 /* 2752 * Increment counters 2753 */ 2754 ++pmap->pm_stats.resident_count; 2755 2756 pa = VM_PAGE_TO_PHYS(m); 2757 2758 /* 2759 * Now validate mapping with RO protection 2760 */ 2761 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 2762 *pte = pa | PG_V | PG_U; 2763 else 2764 *pte = pa | PG_V | PG_U | PG_MANAGED; 2765 /* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */ 2766 pmap_inval_done(&info); 2767 lwkt_reltoken(&vm_token); 2768 } 2769 2770 /* 2771 * Make a temporary mapping for a physical address. This is only intended 2772 * to be used for panic dumps. 2773 */ 2774 /* JG Needed on x86_64? */ 2775 void * 2776 pmap_kenter_temporary(vm_paddr_t pa, int i) 2777 { 2778 pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 2779 return ((void *)crashdumpmap); 2780 } 2781 2782 #define MAX_INIT_PT (96) 2783 2784 /* 2785 * This routine preloads the ptes for a given object into the specified pmap. 

/*
 * This routine preloads the ptes for a given object into the specified
 * pmap.  This eliminates the blast of soft faults on process startup
 * and immediately after an mmap.
 */
static int pmap_object_init_pt_callback(vm_page_t p, void *data);

void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
		    vm_object_t object, vm_pindex_t pindex,
		    vm_size_t size, int limit)
{
	struct rb_vm_page_scan_info info;
	struct lwp *lp;
	vm_size_t psize;

	/*
	 * We can't preinit if read access isn't set or there is no pmap
	 * or object.
	 */
	if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
		return;

	/*
	 * We can't preinit if the pmap is not the current pmap
	 */
	lp = curthread->td_lwp;
	if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
		return;

	psize = x86_64_btop(size);

	if ((object->type != OBJT_VNODE) ||
	    ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
	     (object->resident_page_count > MAX_INIT_PT))) {
		return;
	}

	if (psize + pindex > object->size) {
		if (object->size < pindex)
			return;
		psize = object->size - pindex;
	}

	if (psize == 0)
		return;

	/*
	 * Use a red-black scan to traverse the requested range and load
	 * any valid pages found into the pmap.
	 *
	 * We cannot safely scan the object's memq unless we are in a
	 * critical section since interrupts can remove pages from
	 * objects.
	 */
	info.start_pindex = pindex;
	info.end_pindex = pindex + psize - 1;
	info.limit = limit;
	info.mpte = NULL;
	info.addr = addr;
	info.pmap = pmap;

	crit_enter();
	lwkt_gettoken(&vm_token);
	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
				pmap_object_init_pt_callback, &info);
	lwkt_reltoken(&vm_token);
	crit_exit();
}

static
int
pmap_object_init_pt_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	vm_pindex_t rel_index;

	/*
	 * Don't allow madvise to blow away our really free pages by
	 * allocating pv entries.
	 */
	if ((info->limit & MAP_PREFAULT_MADVISE) &&
	    vmstats.v_free_count < vmstats.v_free_reserved) {
		return(-1);
	}
	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
	    (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
		if ((p->queue - p->pc) == PQ_CACHE)
			vm_page_deactivate(p);
		vm_page_busy(p);
		rel_index = p->pindex - info->start_pindex;
		pmap_enter_quick(info->pmap,
				 info->addr + x86_64_ptob(rel_index), p);
		vm_page_wakeup(p);
	}
	return(0);
}

/*
 * Return TRUE if the pmap is in shape to trivially pre-fault the
 * specified address.
 *
 * Returns FALSE if it would be non-trivial or if a pte is already
 * loaded into the slot.
 */
int
pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
{
	pt_entry_t *pte;
	pd_entry_t *pde;
	int ret;

	lwkt_gettoken(&vm_token);
	pde = pmap_pde(pmap, addr);
	if (pde == NULL || *pde == 0) {
		ret = 0;
	} else {
		pte = vtopte(addr);
		ret = (*pte) ? 0 : 1;
	}
	lwkt_reltoken(&vm_token);
	return(ret);
}
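
/*
 * Illustrative sketch (an editorial assumption, not from the original
 * source): the mmap path calls this after wiring up the map entry,
 * along the lines of
 *
 *	pmap_object_init_pt(map->pmap, entry->start, entry->protection,
 *			    object, OFF_TO_IDX(offset),
 *			    entry->end - entry->start,
 *			    MAP_PREFAULT_PARTIAL);
 *
 * so a freshly mapped, already-resident vnode can be touched without
 * taking a soft fault per page.
 */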
2912 */ 2913 void 2914 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 2915 { 2916 pt_entry_t *pte; 2917 2918 if (pmap == NULL) 2919 return; 2920 2921 lwkt_gettoken(&vm_token); 2922 pte = pmap_pte(pmap, va); 2923 2924 if (wired && !pmap_pte_w(pte)) 2925 pmap->pm_stats.wired_count++; 2926 else if (!wired && pmap_pte_w(pte)) 2927 pmap->pm_stats.wired_count--; 2928 2929 /* 2930 * Wiring is not a hardware characteristic so there is no need to 2931 * invalidate TLB. However, in an SMP environment we must use 2932 * a locked bus cycle to update the pte (if we are not using 2933 * the pmap_inval_*() API that is)... it's ok to do this for simple 2934 * wiring changes. 2935 */ 2936 #ifdef SMP 2937 if (wired) 2938 atomic_set_long(pte, PG_W); 2939 else 2940 atomic_clear_long(pte, PG_W); 2941 #else 2942 if (wired) 2943 atomic_set_long_nonlocked(pte, PG_W); 2944 else 2945 atomic_clear_long_nonlocked(pte, PG_W); 2946 #endif 2947 lwkt_reltoken(&vm_token); 2948 } 2949 2950 2951 2952 /* 2953 * Copy the range specified by src_addr/len 2954 * from the source map to the range dst_addr/len 2955 * in the destination map. 2956 * 2957 * This routine is only advisory and need not do anything. 2958 */ 2959 void 2960 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 2961 vm_size_t len, vm_offset_t src_addr) 2962 { 2963 return; 2964 #if 0 2965 pmap_inval_info info; 2966 vm_offset_t addr; 2967 vm_offset_t end_addr = src_addr + len; 2968 vm_offset_t pdnxt; 2969 pd_entry_t src_frame, dst_frame; 2970 vm_page_t m; 2971 2972 if (dst_addr != src_addr) 2973 return; 2974 #if JGPMAP32 2975 src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME; 2976 if (src_frame != (PTDpde & PG_FRAME)) { 2977 return; 2978 } 2979 2980 dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME; 2981 if (dst_frame != (APTDpde & PG_FRAME)) { 2982 APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); 2983 /* The page directory is not shared between CPUs */ 2984 cpu_invltlb(); 2985 } 2986 #endif 2987 pmap_inval_init(&info); 2988 pmap_inval_add(&info, dst_pmap, -1); 2989 pmap_inval_add(&info, src_pmap, -1); 2990 2991 /* 2992 * critical section protection is required to maintain the page/object 2993 * association, interrupts can free pages and remove them from 2994 * their objects. 2995 */ 2996 crit_enter(); 2997 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 2998 pt_entry_t *src_pte, *dst_pte; 2999 vm_page_t dstmpte, srcmpte; 3000 vm_offset_t srcptepaddr; 3001 vm_pindex_t ptepindex; 3002 3003 if (addr >= UPT_MIN_ADDRESS) 3004 panic("pmap_copy: invalid to pmap_copy page tables\n"); 3005 3006 /* 3007 * Don't let optional prefaulting of pages make us go 3008 * way below the low water mark of free pages or way 3009 * above high water mark of used pv entries. 
3010 */ 3011 if (vmstats.v_free_count < vmstats.v_free_reserved || 3012 pv_entry_count > pv_entry_high_water) 3013 break; 3014 3015 pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); 3016 ptepindex = addr >> PDRSHIFT; 3017 3018 #if JGPMAP32 3019 srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; 3020 #endif 3021 if (srcptepaddr == 0) 3022 continue; 3023 3024 if (srcptepaddr & PG_PS) { 3025 #if JGPMAP32 3026 if (dst_pmap->pm_pdir[ptepindex] == 0) { 3027 dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; 3028 dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3029 } 3030 #endif 3031 continue; 3032 } 3033 3034 srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); 3035 if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || 3036 (srcmpte->flags & PG_BUSY)) { 3037 continue; 3038 } 3039 3040 if (pdnxt > end_addr) 3041 pdnxt = end_addr; 3042 3043 src_pte = vtopte(addr); 3044 #if JGPMAP32 3045 dst_pte = avtopte(addr); 3046 #endif 3047 while (addr < pdnxt) { 3048 pt_entry_t ptetemp; 3049 3050 ptetemp = *src_pte; 3051 /* 3052 * we only virtual copy managed pages 3053 */ 3054 if ((ptetemp & PG_MANAGED) != 0) { 3055 /* 3056 * We have to check after allocpte for the 3057 * pte still being around... allocpte can 3058 * block. 3059 * 3060 * pmap_allocpte() can block. If we lose 3061 * our page directory mappings we stop. 3062 */ 3063 dstmpte = pmap_allocpte(dst_pmap, addr); 3064 3065 #if JGPMAP32 3066 if (src_frame != (PTDpde & PG_FRAME) || 3067 dst_frame != (APTDpde & PG_FRAME) 3068 ) { 3069 kprintf("WARNING: pmap_copy: detected and corrected race\n"); 3070 pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); 3071 goto failed; 3072 } else if ((*dst_pte == 0) && 3073 (ptetemp = *src_pte) != 0 && 3074 (ptetemp & PG_MANAGED)) { 3075 /* 3076 * Clear the modified and 3077 * accessed (referenced) bits 3078 * during the copy. 3079 */ 3080 m = PHYS_TO_VM_PAGE(ptetemp); 3081 *dst_pte = ptetemp & ~(PG_M | PG_A); 3082 ++dst_pmap->pm_stats.resident_count; 3083 pmap_insert_entry(dst_pmap, addr, 3084 dstmpte, m); 3085 KKASSERT(m->flags & PG_MAPPED); 3086 } else { 3087 kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n"); 3088 pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); 3089 goto failed; 3090 } 3091 #endif 3092 if (dstmpte->hold_count >= srcmpte->hold_count) 3093 break; 3094 } 3095 addr += PAGE_SIZE; 3096 src_pte++; 3097 dst_pte++; 3098 } 3099 } 3100 failed: 3101 crit_exit(); 3102 pmap_inval_done(&info); 3103 #endif 3104 } 3105 3106 /* 3107 * pmap_zero_page: 3108 * 3109 * Zero the specified physical page. 3110 * 3111 * This function may be called from an interrupt and no locking is 3112 * required. 3113 */ 3114 void 3115 pmap_zero_page(vm_paddr_t phys) 3116 { 3117 vm_offset_t va = PHYS_TO_DMAP(phys); 3118 3119 pagezero((void *)va); 3120 } 3121 3122 /* 3123 * pmap_page_assertzero: 3124 * 3125 * Assert that a page is empty, panic if it isn't. 3126 */ 3127 void 3128 pmap_page_assertzero(vm_paddr_t phys) 3129 { 3130 vm_offset_t virt = PHYS_TO_DMAP(phys); 3131 int i; 3132 3133 for (i = 0; i < PAGE_SIZE; i += sizeof(long)) { 3134 if (*(long *)((char *)virt + i) != 0) { 3135 panic("pmap_page_assertzero() @ %p not zero!\n", (void *)virt); 3136 } 3137 } 3138 } 3139 3140 /* 3141 * pmap_zero_page: 3142 * 3143 * Zero part of a physical page by mapping it into memory and clearing 3144 * its contents with bzero. 3145 * 3146 * off and size may not cover an area beyond a single hardware page. 
3147 */ 3148 void 3149 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 3150 { 3151 vm_offset_t virt = PHYS_TO_DMAP(phys); 3152 3153 bzero((char *)virt + off, size); 3154 } 3155 3156 /* 3157 * pmap_copy_page: 3158 * 3159 * Copy the physical page from the source PA to the target PA. 3160 * This function may be called from an interrupt. No locking 3161 * is required. 3162 */ 3163 void 3164 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 3165 { 3166 vm_offset_t src_virt, dst_virt; 3167 3168 src_virt = PHYS_TO_DMAP(src); 3169 dst_virt = PHYS_TO_DMAP(dst); 3170 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 3171 } 3172 3173 /* 3174 * pmap_copy_page_frag: 3175 * 3176 * Copy the physical page from the source PA to the target PA. 3177 * This function may be called from an interrupt. No locking 3178 * is required. 3179 */ 3180 void 3181 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 3182 { 3183 vm_offset_t src_virt, dst_virt; 3184 3185 src_virt = PHYS_TO_DMAP(src); 3186 dst_virt = PHYS_TO_DMAP(dst); 3187 3188 bcopy((char *)src_virt + (src & PAGE_MASK), 3189 (char *)dst_virt + (dst & PAGE_MASK), 3190 bytes); 3191 } 3192 3193 /* 3194 * Returns true if the pmap's pv is one of the first 3195 * 16 pvs linked to from this page. This count may 3196 * be changed upwards or downwards in the future; it 3197 * is only necessary that true be returned for a small 3198 * subset of pmaps for proper page aging. 3199 */ 3200 boolean_t 3201 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3202 { 3203 pv_entry_t pv; 3204 int loops = 0; 3205 3206 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3207 return FALSE; 3208 3209 crit_enter(); 3210 lwkt_gettoken(&vm_token); 3211 3212 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3213 if (pv->pv_pmap == pmap) { 3214 lwkt_reltoken(&vm_token); 3215 crit_exit(); 3216 return TRUE; 3217 } 3218 loops++; 3219 if (loops >= 16) 3220 break; 3221 } 3222 lwkt_reltoken(&vm_token); 3223 crit_exit(); 3224 return (FALSE); 3225 } 3226 3227 /* 3228 * Remove all pages from specified address space 3229 * this aids process exit speeds. Also, this code 3230 * is special cased for current process only, but 3231 * can have the more generic (and slightly slower) 3232 * mode enabled. This is much faster than pmap_remove 3233 * in the case of running down an entire address space. 
3234 */ 3235 void 3236 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3237 { 3238 struct lwp *lp; 3239 pt_entry_t *pte, tpte; 3240 pv_entry_t pv, npv; 3241 vm_page_t m; 3242 pmap_inval_info info; 3243 int iscurrentpmap; 3244 int save_generation; 3245 3246 lp = curthread->td_lwp; 3247 if (lp && pmap == vmspace_pmap(lp->lwp_vmspace)) 3248 iscurrentpmap = 1; 3249 else 3250 iscurrentpmap = 0; 3251 3252 lwkt_gettoken(&vm_token); 3253 pmap_inval_init(&info); 3254 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 3255 if (pv->pv_va >= eva || pv->pv_va < sva) { 3256 npv = TAILQ_NEXT(pv, pv_plist); 3257 continue; 3258 } 3259 3260 KKASSERT(pmap == pv->pv_pmap); 3261 3262 if (iscurrentpmap) 3263 pte = vtopte(pv->pv_va); 3264 else 3265 pte = pmap_pte_quick(pmap, pv->pv_va); 3266 pmap_inval_interlock(&info, pmap, pv->pv_va); 3267 3268 /* 3269 * We cannot remove wired pages from a process' mapping 3270 * at this time 3271 */ 3272 if (*pte & PG_W) { 3273 pmap_inval_deinterlock(&info, pmap); 3274 npv = TAILQ_NEXT(pv, pv_plist); 3275 continue; 3276 } 3277 tpte = pte_load_clear(pte); 3278 3279 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3280 3281 KASSERT(m < &vm_page_array[vm_page_array_size], 3282 ("pmap_remove_pages: bad tpte %lx", tpte)); 3283 3284 KKASSERT(pmap->pm_stats.resident_count > 0); 3285 --pmap->pm_stats.resident_count; 3286 pmap_inval_deinterlock(&info, pmap); 3287 3288 /* 3289 * Update the vm_page_t clean and reference bits. 3290 */ 3291 if (tpte & PG_M) { 3292 vm_page_dirty(m); 3293 } 3294 3295 npv = TAILQ_NEXT(pv, pv_plist); 3296 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 3297 save_generation = ++pmap->pm_generation; 3298 3299 m->md.pv_list_count--; 3300 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3301 if (TAILQ_EMPTY(&m->md.pv_list)) 3302 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3303 3304 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info); 3305 free_pv_entry(pv); 3306 3307 /* 3308 * Restart the scan if we blocked during the unuse or free 3309 * calls and other removals were made. 3310 */ 3311 if (save_generation != pmap->pm_generation) { 3312 kprintf("Warning: pmap_remove_pages race-A avoided\n"); 3313 npv = TAILQ_FIRST(&pmap->pm_pvlist); 3314 } 3315 } 3316 pmap_inval_done(&info); 3317 lwkt_reltoken(&vm_token); 3318 } 3319 3320 /* 3321 * pmap_testbit tests bits in pte's 3322 * note that the testbit/clearbit routines are inline, 3323 * and a lot of things compile-time evaluate. 3324 */ 3325 static 3326 boolean_t 3327 pmap_testbit(vm_page_t m, int bit) 3328 { 3329 pv_entry_t pv; 3330 pt_entry_t *pte; 3331 3332 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3333 return FALSE; 3334 3335 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 3336 return FALSE; 3337 3338 crit_enter(); 3339 3340 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3341 /* 3342 * if the bit being tested is the modified bit, then 3343 * mark clean_map and ptes as never 3344 * modified. 
3345 */ 3346 if (bit & (PG_A|PG_M)) { 3347 if (!pmap_track_modified(pv->pv_va)) 3348 continue; 3349 } 3350 3351 #if defined(PMAP_DIAGNOSTIC) 3352 if (pv->pv_pmap == NULL) { 3353 kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); 3354 continue; 3355 } 3356 #endif 3357 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3358 if (*pte & bit) { 3359 crit_exit(); 3360 return TRUE; 3361 } 3362 } 3363 crit_exit(); 3364 return (FALSE); 3365 } 3366 3367 /* 3368 * this routine is used to modify bits in ptes 3369 */ 3370 static __inline 3371 void 3372 pmap_clearbit(vm_page_t m, int bit) 3373 { 3374 struct pmap_inval_info info; 3375 pv_entry_t pv; 3376 pt_entry_t *pte; 3377 pt_entry_t pbits; 3378 3379 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3380 return; 3381 3382 pmap_inval_init(&info); 3383 3384 /* 3385 * Loop over all current mappings setting/clearing as appropos If 3386 * setting RO do we need to clear the VAC? 3387 */ 3388 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3389 /* 3390 * don't write protect pager mappings 3391 */ 3392 if (bit == PG_RW) { 3393 if (!pmap_track_modified(pv->pv_va)) 3394 continue; 3395 } 3396 3397 #if defined(PMAP_DIAGNOSTIC) 3398 if (pv->pv_pmap == NULL) { 3399 kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); 3400 continue; 3401 } 3402 #endif 3403 3404 /* 3405 * Careful here. We can use a locked bus instruction to 3406 * clear PG_A or PG_M safely but we need to synchronize 3407 * with the target cpus when we mess with PG_RW. 3408 * 3409 * We do not have to force synchronization when clearing 3410 * PG_M even for PTEs generated via virtual memory maps, 3411 * because the virtual kernel will invalidate the pmap 3412 * entry when/if it needs to resynchronize the Modify bit. 3413 */ 3414 if (bit & PG_RW) 3415 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); 3416 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3417 again: 3418 pbits = *pte; 3419 if (pbits & bit) { 3420 if (bit == PG_RW) { 3421 if (pbits & PG_M) { 3422 vm_page_dirty(m); 3423 atomic_clear_long(pte, PG_M|PG_RW); 3424 } else { 3425 /* 3426 * The cpu may be trying to set PG_M 3427 * simultaniously with our clearing 3428 * of PG_RW. 3429 */ 3430 if (!atomic_cmpset_long(pte, pbits, 3431 pbits & ~PG_RW)) 3432 goto again; 3433 } 3434 } else if (bit == PG_M) { 3435 /* 3436 * We could also clear PG_RW here to force 3437 * a fault on write to redetect PG_M for 3438 * virtual kernels, but it isn't necessary 3439 * since virtual kernels invalidate the pte 3440 * when they clear the VPTE_M bit in their 3441 * virtual page tables. 3442 */ 3443 atomic_clear_long(pte, PG_M); 3444 } else { 3445 atomic_clear_long(pte, bit); 3446 } 3447 } 3448 if (bit & PG_RW) 3449 pmap_inval_deinterlock(&info, pv->pv_pmap); 3450 } 3451 pmap_inval_done(&info); 3452 } 3453 3454 /* 3455 * pmap_page_protect: 3456 * 3457 * Lower the permission for all mappings to a given page. 3458 */ 3459 void 3460 pmap_page_protect(vm_page_t m, vm_prot_t prot) 3461 { 3462 /* JG NX support? */ 3463 if ((prot & VM_PROT_WRITE) == 0) { 3464 lwkt_gettoken(&vm_token); 3465 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 3466 pmap_clearbit(m, PG_RW); 3467 vm_page_flag_clear(m, PG_WRITEABLE); 3468 } else { 3469 pmap_remove_all(m); 3470 } 3471 lwkt_reltoken(&vm_token); 3472 } 3473 } 3474 3475 vm_paddr_t 3476 pmap_phys_address(vm_pindex_t ppn) 3477 { 3478 return (x86_64_ptob(ppn)); 3479 } 3480 3481 /* 3482 * pmap_ts_referenced: 3483 * 3484 * Return a count of reference bits for a page, clearing those bits. 

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those
 *	bits.  It is not necessary for every reference bit to be
 *	cleared, but it is necessary that 0 only be returned when there
 *	are truly no reference bits set.
 *
 *	XXX: The exact number of bits to check and clear is a matter
 *	that should be tested and standardized at some point in the
 *	future for optimal aging of shared pages.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	pv_entry_t pv, pvf, pvn;
	pt_entry_t *pte;
	int rtval = 0;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return (rtval);

	crit_enter();
	lwkt_gettoken(&vm_token);

	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pvf = pv;

		do {
			pvn = TAILQ_NEXT(pv, pv_list);

			crit_enter();
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
			crit_exit();

			if (!pmap_track_modified(pv->pv_va))
				continue;

			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);

			if (pte && (*pte & PG_A)) {
#ifdef SMP
				atomic_clear_long(pte, PG_A);
#else
				atomic_clear_long_nonlocked(pte, PG_A);
#endif
				rtval++;
				if (rtval > 4)
					break;
			}
		} while ((pv = pvn) != NULL && pv != pvf);
	}
	lwkt_reltoken(&vm_token);
	crit_exit();

	return (rtval);
}

/*
 * pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
	boolean_t res;

	lwkt_gettoken(&vm_token);
	res = pmap_testbit(m, PG_M);
	lwkt_reltoken(&vm_token);
	return (res);
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	lwkt_gettoken(&vm_token);
	pmap_clearbit(m, PG_M);
	lwkt_reltoken(&vm_token);
}

/*
 * pmap_clear_reference:
 *
 *	Clear the reference bit on the specified physical page.
 */
void
pmap_clear_reference(vm_page_t m)
{
	lwkt_gettoken(&vm_token);
	pmap_clearbit(m, PG_A);
	lwkt_reltoken(&vm_token);
}

/*
 * Miscellaneous support routines follow
 */

static
void
i386_protection_init(void)
{
	int *kp, prot;

	/* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */
	kp = protection_codes;
	for (prot = 0; prot < 8; prot++) {
		switch (prot) {
		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
			/*
			 * Read access is also 0.  There isn't any execute
			 * bit, so just make it readable.
			 */
		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
			*kp++ = 0;
			break;
		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
			*kp++ = PG_RW;
			break;
		}
	}
}
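
/*
 * Illustrative note (an editorial assumption, not from the original
 * source): the table built above backs the pte_prot() macro, so a
 * request such as
 *
 *	pte_prot(pmap, VM_PROT_READ | VM_PROT_WRITE)
 *
 * simply indexes protection_codes[] and yields PG_RW, while any
 * read-only combination yields 0; x86 has no separate read or execute
 * PTE bits here, NX support aside.
 */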
3625 */ 3626 void * 3627 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 3628 { 3629 vm_offset_t va, tmpva, offset; 3630 pt_entry_t *pte; 3631 3632 offset = pa & PAGE_MASK; 3633 size = roundup(offset + size, PAGE_SIZE); 3634 3635 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3636 if (va == 0) 3637 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3638 3639 pa = pa & ~PAGE_MASK; 3640 for (tmpva = va; size > 0;) { 3641 pte = vtopte(tmpva); 3642 *pte = pa | PG_RW | PG_V; /* | pgeflag; */ 3643 size -= PAGE_SIZE; 3644 tmpva += PAGE_SIZE; 3645 pa += PAGE_SIZE; 3646 } 3647 cpu_invltlb(); 3648 smp_invltlb(); 3649 3650 return ((void *)(va + offset)); 3651 } 3652 3653 void * 3654 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 3655 { 3656 vm_offset_t va, tmpva, offset; 3657 pt_entry_t *pte; 3658 3659 offset = pa & PAGE_MASK; 3660 size = roundup(offset + size, PAGE_SIZE); 3661 3662 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3663 if (va == 0) 3664 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3665 3666 pa = pa & ~PAGE_MASK; 3667 for (tmpva = va; size > 0;) { 3668 pte = vtopte(tmpva); 3669 *pte = pa | PG_RW | PG_V | PG_N; /* | pgeflag; */ 3670 size -= PAGE_SIZE; 3671 tmpva += PAGE_SIZE; 3672 pa += PAGE_SIZE; 3673 } 3674 cpu_invltlb(); 3675 smp_invltlb(); 3676 3677 return ((void *)(va + offset)); 3678 } 3679 3680 void 3681 pmap_unmapdev(vm_offset_t va, vm_size_t size) 3682 { 3683 vm_offset_t base, offset; 3684 3685 base = va & ~PAGE_MASK; 3686 offset = va & PAGE_MASK; 3687 size = roundup(offset + size, PAGE_SIZE); 3688 pmap_qremove(va, size >> PAGE_SHIFT); 3689 kmem_free(&kernel_map, base, size); 3690 } 3691 3692 /* 3693 * perform the pmap work for mincore 3694 */ 3695 int 3696 pmap_mincore(pmap_t pmap, vm_offset_t addr) 3697 { 3698 pt_entry_t *ptep, pte; 3699 vm_page_t m; 3700 int val = 0; 3701 3702 lwkt_gettoken(&vm_token); 3703 ptep = pmap_pte(pmap, addr); 3704 3705 if (ptep && (pte = *ptep) != 0) { 3706 vm_offset_t pa; 3707 3708 val = MINCORE_INCORE; 3709 if ((pte & PG_MANAGED) == 0) 3710 goto done; 3711 3712 pa = pte & PG_FRAME; 3713 3714 m = PHYS_TO_VM_PAGE(pa); 3715 3716 /* 3717 * Modified by us 3718 */ 3719 if (pte & PG_M) 3720 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 3721 /* 3722 * Modified by someone 3723 */ 3724 else if (m->dirty || pmap_is_modified(m)) 3725 val |= MINCORE_MODIFIED_OTHER; 3726 /* 3727 * Referenced by us 3728 */ 3729 if (pte & PG_A) 3730 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 3731 3732 /* 3733 * Referenced by someone 3734 */ 3735 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { 3736 val |= MINCORE_REFERENCED_OTHER; 3737 vm_page_flag_set(m, PG_REFERENCED); 3738 } 3739 } 3740 done: 3741 lwkt_reltoken(&vm_token); 3742 return val; 3743 } 3744 3745 /* 3746 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 3747 * vmspace will be ref'd and the old one will be deref'd. 3748 * 3749 * The vmspace for all lwps associated with the process will be adjusted 3750 * and cr3 will be reloaded if any lwp is the current lwp. 
3751 */ 3752 void 3753 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 3754 { 3755 struct vmspace *oldvm; 3756 struct lwp *lp; 3757 3758 crit_enter(); 3759 oldvm = p->p_vmspace; 3760 if (oldvm != newvm) { 3761 p->p_vmspace = newvm; 3762 KKASSERT(p->p_nthreads == 1); 3763 lp = RB_ROOT(&p->p_lwp_tree); 3764 pmap_setlwpvm(lp, newvm); 3765 if (adjrefs) { 3766 sysref_get(&newvm->vm_sysref); 3767 sysref_put(&oldvm->vm_sysref); 3768 } 3769 } 3770 crit_exit(); 3771 } 3772 3773 /* 3774 * Set the vmspace for a LWP. The vmspace is almost universally set the 3775 * same as the process vmspace, but virtual kernels need to swap out contexts 3776 * on a per-lwp basis. 3777 */ 3778 void 3779 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 3780 { 3781 struct vmspace *oldvm; 3782 struct pmap *pmap; 3783 3784 crit_enter(); 3785 oldvm = lp->lwp_vmspace; 3786 3787 if (oldvm != newvm) { 3788 lp->lwp_vmspace = newvm; 3789 if (curthread->td_lwp == lp) { 3790 pmap = vmspace_pmap(newvm); 3791 #if defined(SMP) 3792 atomic_set_int(&pmap->pm_active, mycpu->gd_cpumask); 3793 if (pmap->pm_active & CPUMASK_LOCK) 3794 pmap_interlock_wait(newvm); 3795 #else 3796 pmap->pm_active |= 1; 3797 #endif 3798 #if defined(SWTCH_OPTIM_STATS) 3799 tlb_flush_count++; 3800 #endif 3801 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 3802 curthread->td_pcb->pcb_cr3 |= PG_RW | PG_U | PG_V; 3803 load_cr3(curthread->td_pcb->pcb_cr3); 3804 pmap = vmspace_pmap(oldvm); 3805 #if defined(SMP) 3806 atomic_clear_int(&pmap->pm_active, mycpu->gd_cpumask); 3807 #else 3808 pmap->pm_active &= ~1; 3809 #endif 3810 } 3811 } 3812 crit_exit(); 3813 } 3814 3815 #ifdef SMP 3816 3817 /* 3818 * Called when switching to a locked pmap 3819 */ 3820 void 3821 pmap_interlock_wait(struct vmspace *vm) 3822 { 3823 struct pmap *pmap = &vm->vm_pmap; 3824 3825 if (pmap->pm_active & CPUMASK_LOCK) { 3826 while (pmap->pm_active & CPUMASK_LOCK) { 3827 cpu_pause(); 3828 cpu_ccfence(); 3829 lwkt_process_ipiq(); 3830 } 3831 } 3832 } 3833 3834 #endif 3835 3836 vm_offset_t 3837 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 3838 { 3839 3840 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 3841 return addr; 3842 } 3843 3844 addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); 3845 return addr; 3846 } 3847