1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * Copyright (c) 1994 John S. Dyson 6 * Copyright (c) 1994 David Greenman 7 * Copyright (c) 2003 Peter Wemm 8 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 9 * Copyright (c) 2008, 2009 The DragonFly Project. 10 * Copyright (c) 2008, 2009 Jordan Gordeev. 11 * All rights reserved. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * the Systems Programming Group of the University of Utah Computer 15 * Science Department and William Jolitz of UUNET Technologies Inc. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. All advertising materials mentioning features or use of this software 26 * must display the following acknowledgement: 27 * This product includes software developed by the University of 28 * California, Berkeley and its contributors. 29 * 4. Neither the name of the University nor the names of its contributors 30 * may be used to endorse or promote products derived from this software 31 * without specific prior written permission. 32 * 33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 43 * SUCH DAMAGE. 44 * 45 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 46 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $ 47 */ 48 49 /* 50 * Manages physical address maps. 
51 */ 52 53 #if JG 54 #include "opt_pmap.h" 55 #endif 56 #include "opt_msgbuf.h" 57 58 #include <sys/param.h> 59 #include <sys/systm.h> 60 #include <sys/kernel.h> 61 #include <sys/proc.h> 62 #include <sys/msgbuf.h> 63 #include <sys/vmmeter.h> 64 #include <sys/mman.h> 65 #include <sys/vmspace.h> 66 67 #include <vm/vm.h> 68 #include <vm/vm_param.h> 69 #include <sys/sysctl.h> 70 #include <sys/lock.h> 71 #include <vm/vm_kern.h> 72 #include <vm/vm_page.h> 73 #include <vm/vm_map.h> 74 #include <vm/vm_object.h> 75 #include <vm/vm_extern.h> 76 #include <vm/vm_pageout.h> 77 #include <vm/vm_pager.h> 78 #include <vm/vm_zone.h> 79 80 #include <sys/user.h> 81 #include <sys/thread2.h> 82 #include <sys/sysref2.h> 83 #include <sys/spinlock2.h> 84 85 #include <machine/cputypes.h> 86 #include <machine/md_var.h> 87 #include <machine/specialreg.h> 88 #include <machine/smp.h> 89 #include <machine/globaldata.h> 90 #include <machine/pmap.h> 91 #include <machine/pmap_inval.h> 92 93 #include <ddb/ddb.h> 94 95 #include <stdio.h> 96 #include <assert.h> 97 #include <stdlib.h> 98 99 #define PMAP_KEEP_PDIRS 100 #ifndef PMAP_SHPGPERPROC 101 #define PMAP_SHPGPERPROC 1000 102 #endif 103 104 #if defined(DIAGNOSTIC) 105 #define PMAP_DIAGNOSTIC 106 #endif 107 108 #define MINPV 2048 109 110 #if !defined(PMAP_DIAGNOSTIC) 111 #define PMAP_INLINE __inline 112 #else 113 #define PMAP_INLINE 114 #endif 115 116 /* 117 * Get PDEs and PTEs for user/kernel address space 118 */ 119 static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va); 120 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 121 122 #define pmap_pde_v(pte) ((*(pd_entry_t *)pte & VPTE_V) != 0) 123 #define pmap_pte_w(pte) ((*(pt_entry_t *)pte & VPTE_WIRED) != 0) 124 #define pmap_pte_m(pte) ((*(pt_entry_t *)pte & VPTE_M) != 0) 125 #define pmap_pte_u(pte) ((*(pt_entry_t *)pte & VPTE_A) != 0) 126 #define pmap_pte_v(pte) ((*(pt_entry_t *)pte & VPTE_V) != 0) 127 128 /* 129 * Given a map and a machine independent protection code, 130 * convert to a vax protection code. 131 */ 132 #define pte_prot(m, p) \ 133 (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 134 static int protection_codes[8]; 135 136 struct pmap kernel_pmap; 137 static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list); 138 139 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
 */

static vm_object_t kptobj;

static int nkpt;

static uint64_t KPDphys;	/* phys addr of kernel level 2 */
uint64_t KPDPphys;		/* phys addr of kernel level 3 */
uint64_t KPML4phys;		/* phys addr of kernel level 4 */


/*
 * Data for the pv entry allocation mechanism
 */
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static struct vm_object pvzone_obj;
static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL, *ptmmap;
caddr_t CADDR1 = 0;
static pt_entry_t *msgbufmap;

uint64_t KPTphys;

static PMAP_INLINE void free_pv_entry (pv_entry_t pv);
static pv_entry_t get_pv_entry (void);
static void i386_protection_init (void);
static __inline void pmap_clearbit (vm_page_t m, int bit);

static void pmap_remove_all (vm_page_t m);
static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
				vm_offset_t sva);
static void pmap_remove_page (struct pmap *pmap, vm_offset_t va);
static int pmap_remove_entry (struct pmap *pmap, vm_page_t m,
				vm_offset_t va);
static boolean_t pmap_testbit (vm_page_t m, int bit);
static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
				vm_page_t mpte, vm_page_t m);

static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);

static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
#if JGPMAP32
static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
#endif
static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t);

/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many
 *	of the pv list scans are across different pmaps and it is very
 *	wasteful to do an entire invltlb when checking a single mapping.
 *
 *	Should only be called while in a critical section.
203 */ 204 #if JGPMAP32 205 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va); 206 207 static pt_entry_t * 208 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 209 { 210 return pmap_pte(pmap, va); 211 } 212 #endif 213 214 /* Return a non-clipped PD index for a given VA */ 215 static __inline vm_pindex_t 216 pmap_pde_pindex(vm_offset_t va) 217 { 218 return va >> PDRSHIFT; 219 } 220 221 /* Return various clipped indexes for a given VA */ 222 static __inline vm_pindex_t 223 pmap_pte_index(vm_offset_t va) 224 { 225 226 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); 227 } 228 229 static __inline vm_pindex_t 230 pmap_pde_index(vm_offset_t va) 231 { 232 233 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 234 } 235 236 static __inline vm_pindex_t 237 pmap_pdpe_index(vm_offset_t va) 238 { 239 240 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 241 } 242 243 static __inline vm_pindex_t 244 pmap_pml4e_index(vm_offset_t va) 245 { 246 247 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 248 } 249 250 /* Return a pointer to the PML4 slot that corresponds to a VA */ 251 static __inline pml4_entry_t * 252 pmap_pml4e(pmap_t pmap, vm_offset_t va) 253 { 254 255 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 256 } 257 258 /* Return a pointer to the PDP slot that corresponds to a VA */ 259 static __inline pdp_entry_t * 260 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 261 { 262 pdp_entry_t *pdpe; 263 264 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & VPTE_FRAME); 265 return (&pdpe[pmap_pdpe_index(va)]); 266 } 267 268 /* Return a pointer to the PDP slot that corresponds to a VA */ 269 static __inline pdp_entry_t * 270 pmap_pdpe(pmap_t pmap, vm_offset_t va) 271 { 272 pml4_entry_t *pml4e; 273 274 pml4e = pmap_pml4e(pmap, va); 275 if ((*pml4e & VPTE_V) == 0) 276 return NULL; 277 return (pmap_pml4e_to_pdpe(pml4e, va)); 278 } 279 280 /* Return a pointer to the PD slot that corresponds to a VA */ 281 static __inline pd_entry_t * 282 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 283 { 284 pd_entry_t *pde; 285 286 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & VPTE_FRAME); 287 return (&pde[pmap_pde_index(va)]); 288 } 289 290 /* Return a pointer to the PD slot that corresponds to a VA */ 291 static __inline pd_entry_t * 292 pmap_pde(pmap_t pmap, vm_offset_t va) 293 { 294 pdp_entry_t *pdpe; 295 296 pdpe = pmap_pdpe(pmap, va); 297 if (pdpe == NULL || (*pdpe & VPTE_V) == 0) 298 return NULL; 299 return (pmap_pdpe_to_pde(pdpe, va)); 300 } 301 302 /* Return a pointer to the PT slot that corresponds to a VA */ 303 static __inline pt_entry_t * 304 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 305 { 306 pt_entry_t *pte; 307 308 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & VPTE_FRAME); 309 return (&pte[pmap_pte_index(va)]); 310 } 311 312 /* Return a pointer to the PT slot that corresponds to a VA */ 313 static __inline pt_entry_t * 314 pmap_pte(pmap_t pmap, vm_offset_t va) 315 { 316 pd_entry_t *pde; 317 318 pde = pmap_pde(pmap, va); 319 if (pde == NULL || (*pde & VPTE_V) == 0) 320 return NULL; 321 if ((*pde & VPTE_PS) != 0) /* compat with i386 pmap_pte() */ 322 return ((pt_entry_t *)pde); 323 return (pmap_pde_to_pte(pde, va)); 324 } 325 326 327 #if JGV 328 PMAP_INLINE pt_entry_t * 329 vtopte(vm_offset_t va) 330 { 331 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 332 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 333 334 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 335 } 336 337 static __inline pd_entry_t * 338 vtopde(vm_offset_t va) 339 { 340 uint64_t mask = ((1ul << 
(NPDEPGSHIFT + NPDPEPGSHIFT +
				    NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}
#else
static PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	pt_entry_t *x;
	x = pmap_pte(&kernel_pmap, va);
	assert(x != NULL);
	return x;
}

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	pd_entry_t *x;
	x = pmap_pde(&kernel_pmap, va);
	assert(x != NULL);
	return x;
}
#endif

static uint64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
	uint64_t ret;

	ret = *firstaddr;
#if JGV
	bzero((void *)ret, n * PAGE_SIZE);
#endif
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

static void
create_pagetables(vm_paddr_t *firstaddr, int64_t ptov_offset)
{
	int i;
	pml4_entry_t *KPML4virt;
	pdp_entry_t *KPDPvirt;
	pd_entry_t *KPDvirt;
	pt_entry_t *KPTvirt;
	int kpml4i = pmap_pml4e_index(ptov_offset);
	int kpdpi = pmap_pdpe_index(ptov_offset);

	/*
	 * Calculate NKPT - number of kernel page tables.  We have to
	 * accommodate preallocation of the vm_page_array, dump bitmap,
	 * MSGBUF_SIZE, and other stuff.  Be generous.
	 *
	 * Maxmem is in pages.
	 */
	nkpt = (Maxmem * (sizeof(struct vm_page) * 2) + MSGBUF_SIZE) / NBPDR;

	/*
	 * Allocate pages
	 */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
	KPDphys = allocpages(firstaddr, NKPDPE);
	KPTphys = allocpages(firstaddr, nkpt);

	KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
	KPDPvirt = (pdp_entry_t *)PHYS_TO_DMAP(KPDPphys);
	KPDvirt = (pd_entry_t *)PHYS_TO_DMAP(KPDphys);
	KPTvirt = (pt_entry_t *)PHYS_TO_DMAP(KPTphys);

	bzero(KPML4virt, 1 * PAGE_SIZE);
	bzero(KPDPvirt, NKPML4E * PAGE_SIZE);
	bzero(KPDvirt, NKPDPE * PAGE_SIZE);
	bzero(KPTvirt, nkpt * PAGE_SIZE);

	/* Now map the page tables at their location within PTmap */
	for (i = 0; i < nkpt; i++) {
		KPDvirt[i] = KPTphys + (i << PAGE_SHIFT);
		KPDvirt[i] |= VPTE_R | VPTE_W | VPTE_V;
	}

	/* And connect up the PD to the PDP */
	for (i = 0; i < NKPDPE; i++) {
		KPDPvirt[i + kpdpi] = KPDphys + (i << PAGE_SHIFT);
		KPDPvirt[i + kpdpi] |= VPTE_R | VPTE_W | VPTE_V;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	KPML4virt[PML4PML4I] = KPML4phys;
	KPML4virt[PML4PML4I] |= VPTE_R | VPTE_W | VPTE_V;

	/* Connect the KVA slot up to the PML4 */
	KPML4virt[kpml4i] = KPDPphys;
	KPML4virt[kpml4i] |= VPTE_R | VPTE_W | VPTE_V;
}

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On the i386 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr, int64_t ptov_offset)
{
	vm_offset_t va;
	pt_entry_t *pte;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr, ptov_offset);

	virtual_start = KvaStart + *firstaddr;
	virtual_end = KvaEnd;

	/*
	 * Initialize protection array.
	 */
	i386_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence (XXX and which no longer exists).
	 *
	 * The kernel_pmap's pm_pteobj is used only for locking and not
	 * for mmu pages.
	 */
	kernel_pmap.pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
	kernel_pmap.pm_count = 1;
	kernel_pmap.pm_active = (cpumask_t)-1;	/* don't allow deactivation */
	kernel_pmap.pm_pteobj = &kernel_object;
	TAILQ_INIT(&kernel_pmap.pm_pvlist);
	TAILQ_INIT(&kernel_pmap.pm_pvlist_free);
	lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
	spin_init(&kernel_pmap.pm_spin);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_start;
	pte = pmap_pte(&kernel_pmap, va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)

#if JGV
	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
#endif

	/*
	 * ptvmmap is used for reading arbitrary physical pages via
	 * /dev/mem.
	 */
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 * XXX msgbufmap is not used.
	 */
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	       atop(round_page(MSGBUF_SIZE)))

	virtual_start = va;

	*CMAP1 = 0;

	cpu_invltlb();
}

/*
 * Initialize the pmap module, called by vm_init() to initialize any
 * structures that the pmap system needs to map virtual memory.
 * pmap_init has been enhanced to support discontiguous physical
 * memory in a fairly consistent way.
 */
void
pmap_init(void)
{
	int i;
	int initial_pvs;

	/*
	 * object for kernel page table pages
	 */
	/* JG I think the number can be arbitrary */
	kptobj = vm_object_allocate(OBJT_DEFAULT, 5);

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */
	for (i = 0; i < vm_page_array_size; i++) {
		vm_page_t m;

		m = &vm_page_array[i];
		TAILQ_INIT(&m->md.pv_list);
		m->md.pv_list_count = 0;
	}

	/*
	 * init the pv free list
	 */
	initial_pvs = vm_page_array_size;
	if (initial_pvs < MINPV)
		initial_pvs = MINPV;
	pvzone = &pvzone_store;
	pvinit = (struct pv_entry *) kmem_alloc(&kernel_map,
		initial_pvs * sizeof (struct pv_entry));
	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
		initial_pvs);

	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
}

/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 */
void
pmap_init2(void)
{
	int shpgperproc = PMAP_SHPGPERPROC;

	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);
	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
}


/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * The modification bit is not tracked for any pages in this range.  XXX
 * such pages in this map should always use pmap_k*() functions and not
 * be managed anyhow.
 *
 * XXX User and kernel address spaces are independent for virtual kernels,
 * this function only applies to the kernel pmap.
 */
static int
pmap_track_modified(pmap_t pmap, vm_offset_t va)
{
	if (pmap != &kernel_pmap)
		return 1;
	if ((va < clean_sva) || (va >= clean_eva))
		return 1;
	else
		return 0;
}

/*
 * Extract the physical page address associated with the map/VA pair.
 *
 * No requirements.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde, *pdep;

	lwkt_gettoken(&vm_token);
	rtval = 0;
	pdep = pmap_pde(pmap, va);
	if (pdep != NULL) {
		pde = *pdep;
		if (pde) {
			if ((pde & VPTE_PS) != 0) {
				/* JGV */
				rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
			} else {
				pte = pmap_pde_to_pte(pdep, va);
				rtval = (*pte & VPTE_FRAME) | (va & PAGE_MASK);
			}
		}
	}
	lwkt_reltoken(&vm_token);
	return rtval;
}

/*
 * Routine:	pmap_kextract
 * Function:
 *	Extract the physical page address associated with the given
 *	kernel virtual address.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t pde;
	vm_paddr_t pa;

	KKASSERT(va >= KvaStart && va < KvaEnd);

	/*
	 * The DMAP region is not included in [KvaStart, KvaEnd)
	 */
#if 0
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
#endif
		pde = *vtopde(va);
		if (pde & VPTE_PS) {
			/* JGV */
			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  For example, vtopte() must not
			 * be used to access the PTE because it would use the
			 * new PDE.  It is, however, safe to use the old PDE
			 * because the page table page is preserved by the
			 * promotion.
			 */
			pa = *pmap_pde_to_pte(&pde, va);
			pa = (pa & VPTE_FRAME) | (va & PAGE_MASK);
		}
#if 0
	}
#endif
	return pa;
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Enter a mapping into kernel_pmap.  Mappings created in this fashion
 * are not managed.  Mappings must be immediately accessible on all cpus.
 *
 * Call pmap_inval_pte() to invalidate the virtual pte and clean out the
 * real pmap and handle related races before storing the new vpte.
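 *
 * A minimal usage sketch, assuming a caller-supplied kernel virtual
 * address 'va' and physical address 'pa' (hypothetical names, not
 * defined in this file):
 *
 *	pmap_kenter(va, pa);	the page at 'pa' becomes visible at 'va'
 *	...
 *	pmap_kremove(va);	tear the unmanaged mapping down again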
 */
void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;
	pt_entry_t npte;

	KKASSERT(va >= KvaStart && va < KvaEnd);
	npte = pa | VPTE_R | VPTE_W | VPTE_V;
	pte = vtopte(va);
	if (*pte & VPTE_V)
		pmap_inval_pte(pte, &kernel_pmap, va);
	*pte = npte;
}

/*
 * Enter an unmanaged KVA mapping for the private use of the current
 * cpu only.  pmap_kenter_sync() may be called to make the mapping usable
 * by other cpus.
 *
 * It is illegal for the mapping to be accessed by other cpus unless
 * pmap_kenter_sync*() is called.
 */
void
pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;
	pt_entry_t npte;

	KKASSERT(va >= KvaStart && va < KvaEnd);

	npte = (vpte_t)pa | VPTE_R | VPTE_W | VPTE_V;
	pte = vtopte(va);
	if (*pte & VPTE_V)
		pmap_inval_pte_quick(pte, &kernel_pmap, va);
	*pte = npte;
	//cpu_invlpg((void *)va);
}

/*
 * Synchronize a kvm mapping originally made for the private use of
 * some other cpu so it can be used on all cpus.
 *
 * XXX add MADV_RESYNC to improve performance.
 */
void
pmap_kenter_sync(vm_offset_t va)
{
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}

/*
 * Synchronize a kvm mapping originally made for the private use of
 * some other cpu so it can be used on our cpu.  Turns out to be the
 * same madvise() call, because we have to sync the real pmaps anyway.
 *
 * XXX add MADV_RESYNC to improve performance.
 */
void
pmap_kenter_sync_quick(vm_offset_t va)
{
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}

/*
 * Remove an unmanaged mapping created with pmap_kenter*().
 */
void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	KKASSERT(va >= KvaStart && va < KvaEnd);

	pte = vtopte(va);
	if (*pte & VPTE_V)
		pmap_inval_pte(pte, &kernel_pmap, va);
	*pte = 0;
}

/*
 * Remove an unmanaged mapping created with pmap_kenter*() but synchronize
 * only with this cpu.
 *
 * Unfortunately because we optimize new entries by testing VPTE_V later
 * on, we actually still have to synchronize with all the cpus.  XXX maybe
 * store a junk value and test against 0 in the other places instead?
 */
void
pmap_kremove_quick(vm_offset_t va)
{
	pt_entry_t *pte;

	KKASSERT(va >= KvaStart && va < KvaEnd);

	pte = vtopte(va);
	if (*pte & VPTE_V)
		pmap_inval_pte(pte, &kernel_pmap, va); /* NOT _quick */
	*pte = 0;
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * For now, VM is already on, we only need to map the
 * specified memory.
 */
vm_offset_t
pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
{
	return PHYS_TO_DMAP(start);
}


/*
 * Map a set of unmanaged VM pages into KVM.
 */
void
pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
{
	vm_offset_t end_va;

	end_va = va + count * PAGE_SIZE;
	KKASSERT(va >= KvaStart && end_va < KvaEnd);

	while (va < end_va) {
		pt_entry_t *pte;

		pte = vtopte(va);
		if (*pte & VPTE_V)
			pmap_inval_pte(pte, &kernel_pmap, va);
		*pte = VM_PAGE_TO_PHYS(*m) | VPTE_R | VPTE_W | VPTE_V;
		va += PAGE_SIZE;
		m++;
	}
}

/*
 * Undo the effects of pmap_qenter*().
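 *
 * A typical caller pairs this with pmap_qenter(); sketch only, using a
 * hypothetical vm_page_t array 'mlist' of 'npages' pages:
 *
 *	pmap_qenter(va, mlist, npages);
 *	... access the pages through the KVA window starting at 'va' ...
 *	pmap_qremove(va, npages);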
841 */ 842 void 843 pmap_qremove(vm_offset_t va, int count) 844 { 845 vm_offset_t end_va; 846 847 end_va = va + count * PAGE_SIZE; 848 KKASSERT(va >= KvaStart && end_va < KvaEnd); 849 850 while (va < end_va) { 851 pt_entry_t *pte; 852 853 pte = vtopte(va); 854 if (*pte & VPTE_V) 855 pmap_inval_pte(pte, &kernel_pmap, va); 856 *pte = 0; 857 va += PAGE_SIZE; 858 } 859 } 860 861 /* 862 * This routine works like vm_page_lookup() but also blocks as long as the 863 * page is busy. This routine does not busy the page it returns. 864 * 865 * Unless the caller is managing objects whos pages are in a known state, 866 * the call should be made with a critical section held so the page's object 867 * association remains valid on return. 868 */ 869 static vm_page_t 870 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) 871 { 872 vm_page_t m; 873 874 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 875 m = vm_page_lookup_busy_wait(object, pindex, FALSE, "pplookp"); 876 877 return(m); 878 } 879 880 /* 881 * Create a new thread and optionally associate it with a (new) process. 882 * NOTE! the new thread's cpu may not equal the current cpu. 883 */ 884 void 885 pmap_init_thread(thread_t td) 886 { 887 /* enforce pcb placement */ 888 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 889 td->td_savefpu = &td->td_pcb->pcb_save; 890 td->td_sp = (char *)td->td_pcb - 16; /* JG is -16 needed on x86_64? */ 891 } 892 893 /* 894 * This routine directly affects the fork perf for a process. 895 */ 896 void 897 pmap_init_proc(struct proc *p) 898 { 899 } 900 901 /*************************************************** 902 * Page table page management routines..... 903 ***************************************************/ 904 905 static __inline int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, 906 vm_page_t m); 907 908 /* 909 * This routine unholds page table pages, and if the hold count 910 * drops to zero, then it decrements the wire count. 911 * 912 * We must recheck that this is the last hold reference after busy-sleeping 913 * on the page. 914 */ 915 static int 916 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) 917 { 918 vm_page_busy_wait(m, FALSE, "pmuwpt"); 919 KASSERT(m->queue == PQ_NONE, 920 ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m)); 921 922 if (m->hold_count == 1) { 923 /* 924 * Unmap the page table page. 925 */ 926 //abort(); /* JG */ 927 /* pmap_inval_add(info, pmap, -1); */ 928 929 if (m->pindex >= (NUPDE + NUPDPE)) { 930 /* PDP page */ 931 pml4_entry_t *pml4; 932 pml4 = pmap_pml4e(pmap, va); 933 *pml4 = 0; 934 } else if (m->pindex >= NUPDE) { 935 /* PD page */ 936 pdp_entry_t *pdp; 937 pdp = pmap_pdpe(pmap, va); 938 *pdp = 0; 939 } else { 940 /* PT page */ 941 pd_entry_t *pd; 942 pd = pmap_pde(pmap, va); 943 *pd = 0; 944 } 945 946 KKASSERT(pmap->pm_stats.resident_count > 0); 947 --pmap->pm_stats.resident_count; 948 949 if (pmap->pm_ptphint == m) 950 pmap->pm_ptphint = NULL; 951 952 if (m->pindex < NUPDE) { 953 /* We just released a PT, unhold the matching PD */ 954 vm_page_t pdpg; 955 956 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & VPTE_FRAME); 957 pmap_unwire_pte_hold(pmap, va, pdpg); 958 } 959 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 960 /* We just released a PD, unhold the matching PDP */ 961 vm_page_t pdppg; 962 963 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & VPTE_FRAME); 964 pmap_unwire_pte_hold(pmap, va, pdppg); 965 } 966 967 /* 968 * This was our last hold, the page had better be unwired 969 * after we decrement wire_count. 
970 * 971 * FUTURE NOTE: shared page directory page could result in 972 * multiple wire counts. 973 */ 974 vm_page_unhold(m); 975 --m->wire_count; 976 KKASSERT(m->wire_count == 0); 977 atomic_add_int(&vmstats.v_wire_count, -1); 978 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 979 vm_page_flash(m); 980 vm_page_free_zero(m); 981 return 1; 982 } else { 983 KKASSERT(m->hold_count > 1); 984 vm_page_unhold(m); 985 vm_page_wakeup(m); 986 return 0; 987 } 988 } 989 990 static __inline int 991 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) 992 { 993 KKASSERT(m->hold_count > 0); 994 if (m->hold_count > 1) { 995 vm_page_unhold(m); 996 return 0; 997 } else { 998 return _pmap_unwire_pte_hold(pmap, va, m); 999 } 1000 } 1001 1002 /* 1003 * After removing a page table entry, this routine is used to 1004 * conditionally free the page, and manage the hold/wire counts. 1005 */ 1006 static int 1007 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 1008 { 1009 /* JG Use FreeBSD/amd64 or FreeBSD/i386 ptepde approaches? */ 1010 vm_pindex_t ptepindex; 1011 1012 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1013 1014 if (mpte == NULL) { 1015 /* 1016 * page table pages in the kernel_pmap are not managed. 1017 */ 1018 if (pmap == &kernel_pmap) 1019 return(0); 1020 ptepindex = pmap_pde_pindex(va); 1021 if (pmap->pm_ptphint && 1022 (pmap->pm_ptphint->pindex == ptepindex)) { 1023 mpte = pmap->pm_ptphint; 1024 } else { 1025 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1026 pmap->pm_ptphint = mpte; 1027 vm_page_wakeup(mpte); 1028 } 1029 } 1030 1031 return pmap_unwire_pte_hold(pmap, va, mpte); 1032 } 1033 1034 /* 1035 * Initialize pmap0/vmspace0 . Since process 0 never enters user mode we 1036 * just dummy it up so it works well enough for fork(). 1037 * 1038 * In DragonFly, process pmaps may only be used to manipulate user address 1039 * space, never kernel address space. 1040 */ 1041 void 1042 pmap_pinit0(struct pmap *pmap) 1043 { 1044 pmap_pinit(pmap); 1045 } 1046 1047 /* 1048 * Initialize a preallocated and zeroed pmap structure, 1049 * such as one in a vmspace structure. 1050 */ 1051 void 1052 pmap_pinit(struct pmap *pmap) 1053 { 1054 vm_page_t ptdpg; 1055 1056 /* 1057 * No need to allocate page table space yet but we do need a valid 1058 * page directory table. 1059 */ 1060 if (pmap->pm_pml4 == NULL) { 1061 pmap->pm_pml4 = 1062 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE); 1063 } 1064 1065 /* 1066 * Allocate an object for the ptes 1067 */ 1068 if (pmap->pm_pteobj == NULL) 1069 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + PML4PML4I + 1); 1070 1071 /* 1072 * Allocate the page directory page, unless we already have 1073 * one cached. If we used the cached page the wire_count will 1074 * already be set appropriately. 
1075 */ 1076 if ((ptdpg = pmap->pm_pdirm) == NULL) { 1077 ptdpg = vm_page_grab(pmap->pm_pteobj, 1078 NUPDE + NUPDPE + PML4PML4I, 1079 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | 1080 VM_ALLOC_ZERO); 1081 pmap->pm_pdirm = ptdpg; 1082 vm_page_flag_clear(ptdpg, PG_MAPPED); 1083 vm_page_wire(ptdpg); 1084 vm_page_wakeup(ptdpg); 1085 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); 1086 } 1087 pmap->pm_count = 1; 1088 pmap->pm_active = 0; 1089 pmap->pm_ptphint = NULL; 1090 TAILQ_INIT(&pmap->pm_pvlist); 1091 TAILQ_INIT(&pmap->pm_pvlist_free); 1092 spin_init(&pmap->pm_spin); 1093 lwkt_token_init(&pmap->pm_token, "pmap_tok"); 1094 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1095 pmap->pm_stats.resident_count = 1; 1096 } 1097 1098 /* 1099 * Clean up a pmap structure so it can be physically freed. This routine 1100 * is called by the vmspace dtor function. A great deal of pmap data is 1101 * left passively mapped to improve vmspace management so we have a bit 1102 * of cleanup work to do here. 1103 * 1104 * No requirements. 1105 */ 1106 void 1107 pmap_puninit(pmap_t pmap) 1108 { 1109 vm_page_t p; 1110 1111 KKASSERT(pmap->pm_active == 0); 1112 if ((p = pmap->pm_pdirm) != NULL) { 1113 KKASSERT(pmap->pm_pml4 != NULL); 1114 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1115 vm_page_busy_wait(p, FALSE, "pgpun"); 1116 p->wire_count--; 1117 atomic_add_int(&vmstats.v_wire_count, -1); 1118 vm_page_free_zero(p); 1119 pmap->pm_pdirm = NULL; 1120 } 1121 if (pmap->pm_pml4) { 1122 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1123 pmap->pm_pml4 = NULL; 1124 } 1125 if (pmap->pm_pteobj) { 1126 vm_object_deallocate(pmap->pm_pteobj); 1127 pmap->pm_pteobj = NULL; 1128 } 1129 } 1130 1131 /* 1132 * Wire in kernel global address entries. To avoid a race condition 1133 * between pmap initialization and pmap_growkernel, this procedure 1134 * adds the pmap to the master list (which growkernel scans to update), 1135 * then copies the template. 1136 * 1137 * In a virtual kernel there are no kernel global address entries. 1138 * 1139 * No requirements. 1140 */ 1141 void 1142 pmap_pinit2(struct pmap *pmap) 1143 { 1144 spin_lock(&pmap_spin); 1145 TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode); 1146 spin_unlock(&pmap_spin); 1147 } 1148 1149 /* 1150 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1151 * 0 on failure (if the procedure had to sleep). 1152 * 1153 * When asked to remove the page directory page itself, we actually just 1154 * leave it cached so we do not have to incur the SMP inval overhead of 1155 * removing the kernel mapping. pmap_puninit() will take care of it. 1156 */ 1157 static int 1158 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1159 { 1160 /* 1161 * This code optimizes the case of freeing non-busy 1162 * page-table pages. Those pages are zero now, and 1163 * might as well be placed directly into the zero queue. 1164 */ 1165 if (vm_page_busy_try(p, FALSE)) { 1166 vm_page_sleep_busy(p, FALSE, "pmaprl"); 1167 return 0; 1168 } 1169 1170 /* 1171 * Remove the page table page from the processes address space. 1172 */ 1173 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1174 /* 1175 * We are the pml4 table itself. 1176 */ 1177 /* XXX anything to do here? */ 1178 } else if (p->pindex >= (NUPDE + NUPDPE)) { 1179 /* 1180 * We are a PDP page. 1181 * We look for the PML4 entry that points to us. 
1182 */ 1183 vm_page_t m4 = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I); 1184 KKASSERT(m4 != NULL); 1185 pml4_entry_t *pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1186 int idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG; 1187 KKASSERT(pml4[idx] != 0); 1188 pml4[idx] = 0; 1189 m4->hold_count--; 1190 /* JG What about wire_count? */ 1191 } else if (p->pindex >= NUPDE) { 1192 /* 1193 * We are a PD page. 1194 * We look for the PDP entry that points to us. 1195 */ 1196 vm_page_t m3 = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + (p->pindex - NUPDE) / NPDPEPG); 1197 KKASSERT(m3 != NULL); 1198 pdp_entry_t *pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); 1199 int idx = (p->pindex - NUPDE) % NPDPEPG; 1200 KKASSERT(pdp[idx] != 0); 1201 pdp[idx] = 0; 1202 m3->hold_count--; 1203 /* JG What about wire_count? */ 1204 } else { 1205 /* We are a PT page. 1206 * We look for the PD entry that points to us. 1207 */ 1208 vm_page_t m2 = vm_page_lookup(pmap->pm_pteobj, NUPDE + p->pindex / NPDEPG); 1209 KKASSERT(m2 != NULL); 1210 pd_entry_t *pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); 1211 int idx = p->pindex % NPDEPG; 1212 pd[idx] = 0; 1213 m2->hold_count--; 1214 /* JG What about wire_count? */ 1215 } 1216 KKASSERT(pmap->pm_stats.resident_count > 0); 1217 --pmap->pm_stats.resident_count; 1218 1219 if (p->hold_count) { 1220 panic("pmap_release: freeing held page table page"); 1221 } 1222 if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) 1223 pmap->pm_ptphint = NULL; 1224 1225 /* 1226 * We leave the top-level page table page cached, wired, and mapped in 1227 * the pmap until the dtor function (pmap_puninit()) gets called. 1228 * However, still clean it up so we can set PG_ZERO. 1229 */ 1230 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1231 bzero(pmap->pm_pml4, PAGE_SIZE); 1232 vm_page_flag_set(p, PG_ZERO); 1233 vm_page_wakeup(p); 1234 } else { 1235 abort(); 1236 p->wire_count--; 1237 atomic_add_int(&vmstats.v_wire_count, -1); 1238 /* JG eventually revert to using vm_page_free_zero() */ 1239 vm_page_free(p); 1240 } 1241 return 1; 1242 } 1243 1244 /* 1245 * this routine is called if the page table page is not 1246 * mapped correctly. 1247 */ 1248 static vm_page_t 1249 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) 1250 { 1251 vm_page_t m, pdppg, pdpg; 1252 1253 /* 1254 * Find or fabricate a new pagetable page. Handle allocation 1255 * races by checking m->valid. 1256 */ 1257 m = vm_page_grab(pmap->pm_pteobj, ptepindex, 1258 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); 1259 1260 KASSERT(m->queue == PQ_NONE, 1261 ("_pmap_allocpte: %p->queue != PQ_NONE", m)); 1262 1263 /* 1264 * Increment the hold count for the page we will be returning to 1265 * the caller. 1266 */ 1267 m->hold_count++; 1268 vm_page_wire(m); 1269 1270 /* 1271 * Map the pagetable page into the process address space, if 1272 * it isn't already there. 
1273 */ 1274 ++pmap->pm_stats.resident_count; 1275 1276 if (ptepindex >= (NUPDE + NUPDPE)) { 1277 pml4_entry_t *pml4; 1278 vm_pindex_t pml4index; 1279 1280 /* Wire up a new PDP page */ 1281 pml4index = ptepindex - (NUPDE + NUPDPE); 1282 pml4 = &pmap->pm_pml4[pml4index]; 1283 *pml4 = VM_PAGE_TO_PHYS(m) | VPTE_R | VPTE_W | VPTE_V | 1284 VPTE_A | VPTE_M; 1285 } else if (ptepindex >= NUPDE) { 1286 vm_pindex_t pml4index; 1287 vm_pindex_t pdpindex; 1288 pml4_entry_t *pml4; 1289 pdp_entry_t *pdp; 1290 1291 /* Wire up a new PD page */ 1292 pdpindex = ptepindex - NUPDE; 1293 pml4index = pdpindex >> NPML4EPGSHIFT; 1294 1295 pml4 = &pmap->pm_pml4[pml4index]; 1296 if ((*pml4 & VPTE_V) == 0) { 1297 /* Have to allocate a new PDP page, recurse */ 1298 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index) 1299 == NULL) { 1300 --m->wire_count; 1301 vm_page_free(m); 1302 return (NULL); 1303 } 1304 } else { 1305 /* Add reference to the PDP page */ 1306 pdppg = PHYS_TO_VM_PAGE(*pml4 & VPTE_FRAME); 1307 pdppg->hold_count++; 1308 } 1309 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & VPTE_FRAME); 1310 1311 /* Now find the pdp page */ 1312 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1313 KKASSERT(*pdp == 0); /* JG DEBUG64 */ 1314 *pdp = VM_PAGE_TO_PHYS(m) | VPTE_R | VPTE_W | VPTE_V | 1315 VPTE_A | VPTE_M; 1316 } else { 1317 vm_pindex_t pml4index; 1318 vm_pindex_t pdpindex; 1319 pml4_entry_t *pml4; 1320 pdp_entry_t *pdp; 1321 pd_entry_t *pd; 1322 1323 /* Wire up a new PT page */ 1324 pdpindex = ptepindex >> NPDPEPGSHIFT; 1325 pml4index = pdpindex >> NPML4EPGSHIFT; 1326 1327 /* First, find the pdp and check that its valid. */ 1328 pml4 = &pmap->pm_pml4[pml4index]; 1329 if ((*pml4 & VPTE_V) == 0) { 1330 /* We miss a PDP page. We ultimately need a PD page. 1331 * Recursively allocating a PD page will allocate 1332 * the missing PDP page and will also allocate 1333 * the PD page we need. 1334 */ 1335 /* Have to allocate a new PD page, recurse */ 1336 if (_pmap_allocpte(pmap, NUPDE + pdpindex) 1337 == NULL) { 1338 --m->wire_count; 1339 vm_page_free(m); 1340 return (NULL); 1341 } 1342 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & VPTE_FRAME); 1343 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1344 } else { 1345 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & VPTE_FRAME); 1346 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1347 if ((*pdp & VPTE_V) == 0) { 1348 /* Have to allocate a new PD page, recurse */ 1349 if (_pmap_allocpte(pmap, NUPDE + pdpindex) 1350 == NULL) { 1351 --m->wire_count; 1352 vm_page_free(m); 1353 return (NULL); 1354 } 1355 } else { 1356 /* Add reference to the PD page */ 1357 pdpg = PHYS_TO_VM_PAGE(*pdp & VPTE_FRAME); 1358 pdpg->hold_count++; 1359 } 1360 } 1361 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & VPTE_FRAME); 1362 1363 /* Now we know where the page directory page is */ 1364 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 1365 KKASSERT(*pd == 0); /* JG DEBUG64 */ 1366 *pd = VM_PAGE_TO_PHYS(m) | VPTE_R | VPTE_W | VPTE_V | 1367 VPTE_A | VPTE_M; 1368 } 1369 1370 /* 1371 * Set the page table hint 1372 */ 1373 pmap->pm_ptphint = m; 1374 vm_page_flag_set(m, PG_MAPPED); 1375 vm_page_wakeup(m); 1376 1377 return m; 1378 } 1379 1380 /* 1381 * Determine the page table page required to access the VA in the pmap 1382 * and allocate it if necessary. Return a held vm_page_t for the page. 1383 * 1384 * Only used with user pmaps. 
1385 */ 1386 static vm_page_t 1387 pmap_allocpte(pmap_t pmap, vm_offset_t va) 1388 { 1389 vm_pindex_t ptepindex; 1390 pd_entry_t *pd; 1391 vm_page_t m; 1392 1393 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1394 1395 /* 1396 * Calculate pagetable page index 1397 */ 1398 ptepindex = pmap_pde_pindex(va); 1399 1400 /* 1401 * Get the page directory entry 1402 */ 1403 pd = pmap_pde(pmap, va); 1404 1405 /* 1406 * This supports switching from a 2MB page to a 1407 * normal 4K page. 1408 */ 1409 if (pd != NULL && (*pd & (VPTE_PS | VPTE_V)) == (VPTE_PS | VPTE_V)) { 1410 panic("no promotion/demotion yet"); 1411 *pd = 0; 1412 pd = NULL; 1413 /*cpu_invltlb();*/ 1414 /*smp_invltlb();*/ 1415 } 1416 1417 /* 1418 * If the page table page is mapped, we just increment the 1419 * hold count, and activate it. 1420 */ 1421 if (pd != NULL && (*pd & VPTE_V) != 0) { 1422 /* YYY hint is used here on i386 */ 1423 m = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1424 pmap->pm_ptphint = m; 1425 vm_page_hold(m); 1426 vm_page_wakeup(m); 1427 return m; 1428 } 1429 /* 1430 * Here if the pte page isn't mapped, or if it has been deallocated. 1431 */ 1432 return _pmap_allocpte(pmap, ptepindex); 1433 } 1434 1435 1436 /*************************************************** 1437 * Pmap allocation/deallocation routines. 1438 ***************************************************/ 1439 1440 /* 1441 * Release any resources held by the given physical map. 1442 * Called when a pmap initialized by pmap_pinit is being released. 1443 * Should only be called if the map contains no valid mappings. 1444 * 1445 * Caller must hold pmap->pm_token 1446 */ 1447 static int pmap_release_callback(struct vm_page *p, void *data); 1448 1449 void 1450 pmap_release(struct pmap *pmap) 1451 { 1452 vm_object_t object = pmap->pm_pteobj; 1453 struct rb_vm_page_scan_info info; 1454 1455 KKASSERT(pmap != &kernel_pmap); 1456 1457 #if defined(DIAGNOSTIC) 1458 if (object->ref_count != 1) 1459 panic("pmap_release: pteobj reference count != 1"); 1460 #endif 1461 1462 info.pmap = pmap; 1463 info.object = object; 1464 1465 spin_lock(&pmap_spin); 1466 TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode); 1467 spin_unlock(&pmap_spin); 1468 1469 vm_object_hold(object); 1470 do { 1471 info.error = 0; 1472 info.mpte = NULL; 1473 info.limit = object->generation; 1474 1475 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 1476 pmap_release_callback, &info); 1477 if (info.error == 0 && info.mpte) { 1478 if (!pmap_release_free_page(pmap, info.mpte)) 1479 info.error = 1; 1480 } 1481 } while (info.error); 1482 vm_object_drop(object); 1483 } 1484 1485 static int 1486 pmap_release_callback(struct vm_page *p, void *data) 1487 { 1488 struct rb_vm_page_scan_info *info = data; 1489 1490 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1491 info->mpte = p; 1492 return(0); 1493 } 1494 if (!pmap_release_free_page(info->pmap, p)) { 1495 info->error = 1; 1496 return(-1); 1497 } 1498 if (info->object->generation != info->limit) { 1499 info->error = 1; 1500 return(-1); 1501 } 1502 return(0); 1503 } 1504 1505 /* 1506 * Grow the number of kernel page table entries, if needed. 1507 * 1508 * No requirements. 
1509 */ 1510 void 1511 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 1512 { 1513 vm_offset_t addr; 1514 vm_paddr_t paddr; 1515 vm_offset_t ptppaddr; 1516 vm_page_t nkpg; 1517 pd_entry_t *pde, newpdir; 1518 pdp_entry_t newpdp; 1519 1520 addr = kend; 1521 1522 vm_object_hold(kptobj); 1523 if (kernel_vm_end == 0) { 1524 kernel_vm_end = KvaStart; 1525 nkpt = 0; 1526 while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & VPTE_V) != 0) { 1527 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1528 nkpt++; 1529 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1530 kernel_vm_end = kernel_map.max_offset; 1531 break; 1532 } 1533 } 1534 } 1535 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1536 if (addr - 1 >= kernel_map.max_offset) 1537 addr = kernel_map.max_offset; 1538 while (kernel_vm_end < addr) { 1539 pde = pmap_pde(&kernel_pmap, kernel_vm_end); 1540 if (pde == NULL) { 1541 /* We need a new PDP entry */ 1542 nkpg = vm_page_alloc(kptobj, nkpt, 1543 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM 1544 | VM_ALLOC_INTERRUPT); 1545 if (nkpg == NULL) { 1546 panic("pmap_growkernel: no memory to " 1547 "grow kernel"); 1548 } 1549 paddr = VM_PAGE_TO_PHYS(nkpg); 1550 if ((nkpg->flags & PG_ZERO) == 0) 1551 pmap_zero_page(paddr); 1552 vm_page_flag_clear(nkpg, PG_ZERO); 1553 newpdp = (pdp_entry_t)(paddr | VPTE_V | VPTE_R | 1554 VPTE_W | VPTE_A | VPTE_M); 1555 *pmap_pdpe(&kernel_pmap, kernel_vm_end) = newpdp; 1556 nkpt++; 1557 continue; /* try again */ 1558 } 1559 if ((*pde & VPTE_V) != 0) { 1560 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 1561 ~(PAGE_SIZE * NPTEPG - 1); 1562 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1563 kernel_vm_end = kernel_map.max_offset; 1564 break; 1565 } 1566 continue; 1567 } 1568 1569 /* 1570 * This index is bogus, but out of the way 1571 */ 1572 nkpg = vm_page_alloc(kptobj, nkpt, 1573 VM_ALLOC_NORMAL | 1574 VM_ALLOC_SYSTEM | 1575 VM_ALLOC_INTERRUPT); 1576 if (nkpg == NULL) 1577 panic("pmap_growkernel: no memory to grow kernel"); 1578 1579 vm_page_wire(nkpg); 1580 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1581 pmap_zero_page(ptppaddr); 1582 vm_page_flag_clear(nkpg, PG_ZERO); 1583 newpdir = (pd_entry_t)(ptppaddr | VPTE_V | VPTE_R | 1584 VPTE_W | VPTE_A | VPTE_M); 1585 *pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir; 1586 nkpt++; 1587 1588 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 1589 ~(PAGE_SIZE * NPTEPG - 1); 1590 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1591 kernel_vm_end = kernel_map.max_offset; 1592 break; 1593 } 1594 } 1595 vm_object_drop(kptobj); 1596 } 1597 1598 /* 1599 * Retire the given physical map from service. Should only be called 1600 * if the map contains no valid mappings. 1601 * 1602 * No requirements. 1603 */ 1604 void 1605 pmap_destroy(pmap_t pmap) 1606 { 1607 if (pmap == NULL) 1608 return; 1609 1610 lwkt_gettoken(&vm_token); 1611 if (--pmap->pm_count == 0) { 1612 pmap_release(pmap); 1613 panic("destroying a pmap is not yet implemented"); 1614 } 1615 lwkt_reltoken(&vm_token); 1616 } 1617 1618 /* 1619 * Add a reference to the specified pmap. 1620 * 1621 * No requirements. 
1622 */ 1623 void 1624 pmap_reference(pmap_t pmap) 1625 { 1626 if (pmap) { 1627 lwkt_gettoken(&vm_token); 1628 ++pmap->pm_count; 1629 lwkt_reltoken(&vm_token); 1630 } 1631 } 1632 1633 /************************************************************************ 1634 * VMSPACE MANAGEMENT * 1635 ************************************************************************ 1636 * 1637 * The VMSPACE management we do in our virtual kernel must be reflected 1638 * in the real kernel. This is accomplished by making vmspace system 1639 * calls to the real kernel. 1640 */ 1641 void 1642 cpu_vmspace_alloc(struct vmspace *vm) 1643 { 1644 int r; 1645 void *rp; 1646 vpte_t vpte; 1647 1648 #define USER_SIZE (VM_MAX_USER_ADDRESS - VM_MIN_USER_ADDRESS) 1649 1650 if (vmspace_create(&vm->vm_pmap, 0, NULL) < 0) 1651 panic("vmspace_create() failed"); 1652 1653 rp = vmspace_mmap(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1654 PROT_READ|PROT_WRITE, 1655 MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED, 1656 MemImageFd, 0); 1657 if (rp == MAP_FAILED) 1658 panic("vmspace_mmap: failed"); 1659 vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1660 MADV_NOSYNC, 0); 1661 vpte = VM_PAGE_TO_PHYS(vmspace_pmap(vm)->pm_pdirm) | VPTE_R | VPTE_W | VPTE_V; 1662 r = vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1663 MADV_SETMAP, vpte); 1664 if (r < 0) 1665 panic("vmspace_mcontrol: failed"); 1666 } 1667 1668 void 1669 cpu_vmspace_free(struct vmspace *vm) 1670 { 1671 if (vmspace_destroy(&vm->vm_pmap) < 0) 1672 panic("vmspace_destroy() failed"); 1673 } 1674 1675 /*************************************************** 1676 * page management routines. 1677 ***************************************************/ 1678 1679 /* 1680 * free the pv_entry back to the free list. This function may be 1681 * called from an interrupt. 1682 */ 1683 static __inline void 1684 free_pv_entry(pv_entry_t pv) 1685 { 1686 pv_entry_count--; 1687 KKASSERT(pv_entry_count >= 0); 1688 zfree(pvzone, pv); 1689 } 1690 1691 /* 1692 * get a new pv_entry, allocating a block from the system 1693 * when needed. This function may be called from an interrupt. 1694 */ 1695 static pv_entry_t 1696 get_pv_entry(void) 1697 { 1698 pv_entry_count++; 1699 if (pv_entry_high_water && 1700 (pv_entry_count > pv_entry_high_water) && 1701 (pmap_pagedaemon_waken == 0)) { 1702 pmap_pagedaemon_waken = 1; 1703 wakeup(&vm_pages_needed); 1704 } 1705 return zalloc(pvzone); 1706 } 1707 1708 /* 1709 * This routine is very drastic, but can save the system 1710 * in a pinch. 1711 * 1712 * No requirements. 1713 */ 1714 void 1715 pmap_collect(void) 1716 { 1717 int i; 1718 vm_page_t m; 1719 static int warningdone=0; 1720 1721 if (pmap_pagedaemon_waken == 0) 1722 return; 1723 lwkt_gettoken(&vm_token); 1724 pmap_pagedaemon_waken = 0; 1725 1726 if (warningdone < 5) { 1727 kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); 1728 warningdone++; 1729 } 1730 1731 for (i = 0; i < vm_page_array_size; i++) { 1732 m = &vm_page_array[i]; 1733 if (m->wire_count || m->hold_count) 1734 continue; 1735 if (vm_page_busy_try(m, TRUE) == 0) { 1736 if (m->wire_count == 0 && m->hold_count == 0) { 1737 pmap_remove_all(m); 1738 } 1739 vm_page_wakeup(m); 1740 } 1741 } 1742 lwkt_reltoken(&vm_token); 1743 } 1744 1745 1746 /* 1747 * If it is the first entry on the list, it is actually 1748 * in the header and we must copy the following entry up 1749 * to the header. Otherwise we must search the list for 1750 * the entry. In either case we free the now unused entry. 
1751 * 1752 * caller must hold vm_token. 1753 */ 1754 static int 1755 pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va) 1756 { 1757 pv_entry_t pv; 1758 int rtval; 1759 1760 if (m->md.pv_list_count < pmap->pm_stats.resident_count) { 1761 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 1762 if (pmap == pv->pv_pmap && va == pv->pv_va) 1763 break; 1764 } 1765 } else { 1766 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { 1767 if (va == pv->pv_va) 1768 break; 1769 } 1770 } 1771 1772 /* 1773 * Note that pv_ptem is NULL if the page table page itself is not 1774 * managed, even if the page being removed IS managed. 1775 */ 1776 rtval = 0; 1777 /* JGXXX When can 'pv' be NULL? */ 1778 if (pv) { 1779 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1780 m->md.pv_list_count--; 1781 atomic_add_int(&m->object->agg_pv_list_count, -1); 1782 KKASSERT(m->md.pv_list_count >= 0); 1783 if (TAILQ_EMPTY(&m->md.pv_list)) 1784 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1785 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 1786 ++pmap->pm_generation; 1787 KKASSERT(pmap->pm_pteobj != NULL); 1788 vm_object_hold(pmap->pm_pteobj); 1789 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); 1790 vm_object_drop(pmap->pm_pteobj); 1791 free_pv_entry(pv); 1792 } 1793 return rtval; 1794 } 1795 1796 /* 1797 * Create a pv entry for page at pa for (pmap, va). If the page table page 1798 * holding the VA is managed, mpte will be non-NULL. 1799 */ 1800 static void 1801 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) 1802 { 1803 pv_entry_t pv; 1804 1805 crit_enter(); 1806 pv = get_pv_entry(); 1807 pv->pv_va = va; 1808 pv->pv_pmap = pmap; 1809 pv->pv_ptem = mpte; 1810 1811 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); 1812 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1813 m->md.pv_list_count++; 1814 atomic_add_int(&m->object->agg_pv_list_count, 1); 1815 1816 crit_exit(); 1817 } 1818 1819 /* 1820 * pmap_remove_pte: do the things to unmap a page in a process 1821 */ 1822 static int 1823 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va) 1824 { 1825 pt_entry_t oldpte; 1826 vm_page_t m; 1827 1828 oldpte = pmap_inval_loadandclear(ptq, pmap, va); 1829 if (oldpte & VPTE_WIRED) 1830 --pmap->pm_stats.wired_count; 1831 KKASSERT(pmap->pm_stats.wired_count >= 0); 1832 1833 #if 0 1834 /* 1835 * Machines that don't support invlpg, also don't support 1836 * PG_G. XXX PG_G is disabled for SMP so don't worry about 1837 * the SMP case. 1838 */ 1839 if (oldpte & PG_G) 1840 cpu_invlpg((void *)va); 1841 #endif 1842 KKASSERT(pmap->pm_stats.resident_count > 0); 1843 --pmap->pm_stats.resident_count; 1844 if (oldpte & VPTE_MANAGED) { 1845 m = PHYS_TO_VM_PAGE(oldpte); 1846 if (oldpte & VPTE_M) { 1847 #if defined(PMAP_DIAGNOSTIC) 1848 if (pmap_nw_modified(oldpte)) { 1849 kprintf("pmap_remove: modified page not " 1850 "writable: va: 0x%lx, pte: 0x%lx\n", 1851 va, oldpte); 1852 } 1853 #endif 1854 if (pmap_track_modified(pmap, va)) 1855 vm_page_dirty(m); 1856 } 1857 if (oldpte & VPTE_A) 1858 vm_page_flag_set(m, PG_REFERENCED); 1859 return pmap_remove_entry(pmap, m, va); 1860 } else { 1861 return pmap_unuse_pt(pmap, va, NULL); 1862 } 1863 1864 return 0; 1865 } 1866 1867 /* 1868 * pmap_remove_page: 1869 * 1870 * Remove a single page from a process address space. 1871 * 1872 * This function may not be called from an interrupt if the pmap is 1873 * not kernel_pmap. 
1874 */ 1875 static void 1876 pmap_remove_page(struct pmap *pmap, vm_offset_t va) 1877 { 1878 pt_entry_t *pte; 1879 1880 pte = pmap_pte(pmap, va); 1881 if (pte == NULL) 1882 return; 1883 if ((*pte & VPTE_V) == 0) 1884 return; 1885 pmap_remove_pte(pmap, pte, va); 1886 } 1887 1888 /* 1889 * Remove the given range of addresses from the specified map. 1890 * 1891 * It is assumed that the start and end are properly rounded to 1892 * the page size. 1893 * 1894 * This function may not be called from an interrupt if the pmap is 1895 * not kernel_pmap. 1896 * 1897 * No requirements. 1898 */ 1899 void 1900 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 1901 { 1902 vm_offset_t va_next; 1903 pml4_entry_t *pml4e; 1904 pdp_entry_t *pdpe; 1905 pd_entry_t ptpaddr, *pde; 1906 pt_entry_t *pte; 1907 1908 if (pmap == NULL) 1909 return; 1910 1911 vm_object_hold(pmap->pm_pteobj); 1912 lwkt_gettoken(&vm_token); 1913 KKASSERT(pmap->pm_stats.resident_count >= 0); 1914 if (pmap->pm_stats.resident_count == 0) { 1915 lwkt_reltoken(&vm_token); 1916 vm_object_drop(pmap->pm_pteobj); 1917 return; 1918 } 1919 1920 /* 1921 * special handling of removing one page. a very 1922 * common operation and easy to short circuit some 1923 * code. 1924 */ 1925 if (sva + PAGE_SIZE == eva) { 1926 pde = pmap_pde(pmap, sva); 1927 if (pde && (*pde & VPTE_PS) == 0) { 1928 pmap_remove_page(pmap, sva); 1929 lwkt_reltoken(&vm_token); 1930 vm_object_drop(pmap->pm_pteobj); 1931 return; 1932 } 1933 } 1934 1935 for (; sva < eva; sva = va_next) { 1936 pml4e = pmap_pml4e(pmap, sva); 1937 if ((*pml4e & VPTE_V) == 0) { 1938 va_next = (sva + NBPML4) & ~PML4MASK; 1939 if (va_next < sva) 1940 va_next = eva; 1941 continue; 1942 } 1943 1944 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 1945 if ((*pdpe & VPTE_V) == 0) { 1946 va_next = (sva + NBPDP) & ~PDPMASK; 1947 if (va_next < sva) 1948 va_next = eva; 1949 continue; 1950 } 1951 1952 /* 1953 * Calculate index for next page table. 1954 */ 1955 va_next = (sva + NBPDR) & ~PDRMASK; 1956 if (va_next < sva) 1957 va_next = eva; 1958 1959 pde = pmap_pdpe_to_pde(pdpe, sva); 1960 ptpaddr = *pde; 1961 1962 /* 1963 * Weed out invalid mappings. 1964 */ 1965 if (ptpaddr == 0) 1966 continue; 1967 1968 /* 1969 * Check for large page. 1970 */ 1971 if ((ptpaddr & VPTE_PS) != 0) { 1972 /* JG FreeBSD has more complex treatment here */ 1973 KKASSERT(*pde != 0); 1974 pmap_inval_pde(pde, pmap, sva); 1975 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 1976 continue; 1977 } 1978 1979 /* 1980 * Limit our scan to either the end of the va represented 1981 * by the current page table page, or to the end of the 1982 * range being removed. 1983 */ 1984 if (va_next > eva) 1985 va_next = eva; 1986 1987 /* 1988 * NOTE: pmap_remove_pte() can block. 1989 */ 1990 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 1991 sva += PAGE_SIZE) { 1992 if (*pte == 0) 1993 continue; 1994 if (pmap_remove_pte(pmap, pte, sva)) 1995 break; 1996 } 1997 } 1998 lwkt_reltoken(&vm_token); 1999 vm_object_drop(pmap->pm_pteobj); 2000 } 2001 2002 /* 2003 * Removes this physical page from all physical maps in which it resides. 2004 * Reflects back modify bits to the pager. 2005 * 2006 * This routine may not be called from an interrupt. 2007 * 2008 * No requirements. 2009 */ 2010 static void 2011 pmap_remove_all(vm_page_t m) 2012 { 2013 pt_entry_t *pte, tpte; 2014 pv_entry_t pv; 2015 2016 #if defined(PMAP_DIAGNOSTIC) 2017 /* 2018 * XXX this makes pmap_page_protect(NONE) illegal for non-managed 2019 * pages! 
2020 */ 2021 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 2022 panic("pmap_page_protect: illegal for unmanaged page, va: 0x%08llx", (long long)VM_PAGE_TO_PHYS(m)); 2023 } 2024 #endif 2025 2026 lwkt_gettoken(&vm_token); 2027 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2028 KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0); 2029 --pv->pv_pmap->pm_stats.resident_count; 2030 2031 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 2032 KKASSERT(pte != NULL); 2033 2034 tpte = pmap_inval_loadandclear(pte, pv->pv_pmap, pv->pv_va); 2035 if (tpte & VPTE_WIRED) 2036 pv->pv_pmap->pm_stats.wired_count--; 2037 KKASSERT(pv->pv_pmap->pm_stats.wired_count >= 0); 2038 2039 if (tpte & VPTE_A) 2040 vm_page_flag_set(m, PG_REFERENCED); 2041 2042 /* 2043 * Update the vm_page_t clean and reference bits. 2044 */ 2045 if (tpte & VPTE_M) { 2046 #if defined(PMAP_DIAGNOSTIC) 2047 if (pmap_nw_modified(tpte)) { 2048 kprintf( 2049 "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2050 pv->pv_va, tpte); 2051 } 2052 #endif 2053 if (pmap_track_modified(pv->pv_pmap, pv->pv_va)) 2054 vm_page_dirty(m); 2055 } 2056 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2057 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); 2058 ++pv->pv_pmap->pm_generation; 2059 m->md.pv_list_count--; 2060 atomic_add_int(&m->object->agg_pv_list_count, -1); 2061 KKASSERT(m->md.pv_list_count >= 0); 2062 if (TAILQ_EMPTY(&m->md.pv_list)) 2063 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2064 vm_object_hold(pv->pv_pmap->pm_pteobj); 2065 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); 2066 vm_object_drop(pv->pv_pmap->pm_pteobj); 2067 free_pv_entry(pv); 2068 } 2069 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 2070 lwkt_reltoken(&vm_token); 2071 } 2072 2073 /* 2074 * Set the physical protection on the specified range of this map 2075 * as requested. 2076 * 2077 * This function may not be called from an interrupt if the map is 2078 * not the kernel_pmap. 2079 * 2080 * No requirements. 2081 */ 2082 void 2083 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2084 { 2085 vm_offset_t va_next; 2086 pml4_entry_t *pml4e; 2087 pdp_entry_t *pdpe; 2088 pd_entry_t ptpaddr, *pde; 2089 pt_entry_t *pte; 2090 2091 /* JG review for NX */ 2092 2093 if (pmap == NULL) 2094 return; 2095 2096 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2097 pmap_remove(pmap, sva, eva); 2098 return; 2099 } 2100 2101 if (prot & VM_PROT_WRITE) 2102 return; 2103 2104 lwkt_gettoken(&vm_token); 2105 2106 for (; sva < eva; sva = va_next) { 2107 2108 pml4e = pmap_pml4e(pmap, sva); 2109 if ((*pml4e & VPTE_V) == 0) { 2110 va_next = (sva + NBPML4) & ~PML4MASK; 2111 if (va_next < sva) 2112 va_next = eva; 2113 continue; 2114 } 2115 2116 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2117 if ((*pdpe & VPTE_V) == 0) { 2118 va_next = (sva + NBPDP) & ~PDPMASK; 2119 if (va_next < sva) 2120 va_next = eva; 2121 continue; 2122 } 2123 2124 va_next = (sva + NBPDR) & ~PDRMASK; 2125 if (va_next < sva) 2126 va_next = eva; 2127 2128 pde = pmap_pdpe_to_pde(pdpe, sva); 2129 ptpaddr = *pde; 2130 2131 /* 2132 * Check for large page. 2133 */ 2134 if ((ptpaddr & VPTE_PS) != 0) { 2135 /* JG correct? */ 2136 pmap_clean_pde(pde, pmap, sva); 2137 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2138 continue; 2139 } 2140 2141 /* 2142 * Weed out invalid mappings. Note: we assume that the page 2143 * directory table is always allocated, and in kernel virtual. 

/*
 * Enter a managed page into a pmap.  If the page is not wired, related
 * pmap data can be destroyed at any time for later demand operation.
 *
 * Insert the vm_page (m) at virtual address (va) in (pmap), with the
 * specified protection, and wire the mapping if requested.
 *
 * NOTE: This routine may not lazy-evaluate or lose information.  The
 *	 page must actually be inserted into the given map NOW.
 *
 * NOTE: When entering a page at a KVA address, the pmap must be the
 *	 kernel_pmap.
 *
 * No requirements.
 */
void
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
	   boolean_t wired)
{
	vm_paddr_t pa;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_paddr_t opa;
	pt_entry_t origpte, newpte;
	vm_page_t mpte;

	if (pmap == NULL)
		return;

	va = trunc_page(va);

	vm_object_hold(pmap->pm_pteobj);
	lwkt_gettoken(&vm_token);

	/*
	 * Get the page table page.  The kernel_pmap's page table pages
	 * are preallocated and have no associated vm_page_t.
	 */
	if (pmap == &kernel_pmap)
		mpte = NULL;
	else
		mpte = pmap_allocpte(pmap, va);

	pde = pmap_pde(pmap, va);
	if (pde != NULL && (*pde & VPTE_V) != 0) {
		if ((*pde & VPTE_PS) != 0)
			panic("pmap_enter: attempted pmap_enter on 2MB page");
		pte = pmap_pde_to_pte(pde, va);
	} else {
		panic("pmap_enter: invalid page directory va=%#lx", va);
	}

	KKASSERT(pte != NULL);

	/*
	 * Deal with races on the original mapping (though don't worry
	 * about VPTE_A races) by cleaning it.  This will force a fault
	 * if an attempt is made to write to the page.
	 */
	pa = VM_PAGE_TO_PHYS(m);
	origpte = pmap_clean_pte(pte, pmap, va);
	opa = origpte & VPTE_FRAME;

	if (origpte & VPTE_PS)
		panic("pmap_enter: attempted pmap_enter on 2MB page");

	/*
	 * Mapping has not changed, must be protection or wiring change.
	 */
	if (origpte && (opa == pa)) {
		/*
		 * Wiring change, just update stats.  We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them.  Hence, if a user page is
		 * wired, the PT page will be also.
		 */
		if (wired && ((origpte & VPTE_WIRED) == 0))
			++pmap->pm_stats.wired_count;
		else if (!wired && (origpte & VPTE_WIRED))
			--pmap->pm_stats.wired_count;

		/*
		 * Remove the extra pte reference.  Note that we cannot
		 * optimize the RO->RW case because we have adjusted the
		 * wiring count above and may need to adjust the wiring
		 * bits below.
		 */
		if (mpte)
			mpte->hold_count--;

		/*
		 * We might be turning off write access to the page,
		 * so we go ahead and sense modify status.
		 */
		if (origpte & VPTE_MANAGED) {
			if ((origpte & VPTE_M) &&
			    pmap_track_modified(pmap, va)) {
				vm_page_t om;
				om = PHYS_TO_VM_PAGE(opa);
				vm_page_dirty(om);
			}
			pa |= VPTE_MANAGED;
			KKASSERT(m->flags & PG_MAPPED);
		}
		goto validate;
	}

	/*
	 * Mapping has changed, invalidate old range and fall through to
	 * handle validating new mapping.
	 */
	if (opa) {
		int err;
		err = pmap_remove_pte(pmap, pte, va);
		if (err)
			panic("pmap_enter: pte vanished, va: 0x%lx", va);
	}

	/*
	 * Enter on the PV list if part of our managed memory.  Note that we
	 * raise IPL while manipulating pv_table since pmap_enter can be
	 * called at interrupt time.
	 */
	if (pmap_initialized &&
	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
		pmap_insert_entry(pmap, va, mpte, m);
		pa |= VPTE_MANAGED;
		vm_page_flag_set(m, PG_MAPPED);
	}

	/*
	 * Increment counters.
	 */
	++pmap->pm_stats.resident_count;
	if (wired)
		pmap->pm_stats.wired_count++;

validate:
	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | VPTE_V);

	if (wired)
		newpte |= VPTE_WIRED;
	if (pmap != &kernel_pmap)
		newpte |= VPTE_U;

	/*
	 * If the mapping or permission bits are different from the
	 * (now cleaned) original pte, an update is needed.  We've
	 * already downgraded or invalidated the page so all we have
	 * to do now is update the bits.
	 *
	 * XXX should we synchronize RO->RW changes to avoid another
	 * fault?
	 */
	if ((origpte & ~(VPTE_W|VPTE_M|VPTE_A)) != newpte) {
		*pte = newpte | VPTE_A;
		if (newpte & VPTE_W)
			vm_page_flag_set(m, PG_WRITEABLE);
	}
	KKASSERT((newpte & VPTE_MANAGED) == 0 || (m->flags & PG_MAPPED));
	lwkt_reltoken(&vm_token);
	vm_object_drop(pmap->pm_pteobj);
}
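
/*
 * Illustrative sketch: for a managed, wired, read/write user page the
 * pte composed in the validate section above ends up roughly as
 *
 *	newpte = pa | VPTE_R | VPTE_W		(from protection_codes[])
 *		    | VPTE_V | VPTE_WIRED
 *		    | VPTE_U | VPTE_MANAGED;
 *
 * VPTE_A is or'd in when the pte is actually stored; VPTE_M is left for
 * the real kernel to report back on the first write fault.
 */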

/*
 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
 *
 * Currently this routine may only be used on user pmaps, not kernel_pmap.
 *
 * No requirements.
 */
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pt_entry_t *pte;
	vm_paddr_t pa;
	vm_page_t mpte;
	vm_pindex_t ptepindex;
	pd_entry_t *ptepa;

	KKASSERT(pmap != &kernel_pmap);

	KKASSERT(va >= VM_MIN_USER_ADDRESS && va < VM_MAX_USER_ADDRESS);

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_pde_pindex(va);

	vm_object_hold(pmap->pm_pteobj);
	lwkt_gettoken(&vm_token);

	do {
		/*
		 * Get the page directory entry
		 */
		ptepa = pmap_pde(pmap, va);

		/*
		 * If the page table page is mapped, we just increment
		 * the hold count, and activate it.
		 */
		if (ptepa && (*ptepa & VPTE_V) != 0) {
			if (*ptepa & VPTE_PS)
				panic("pmap_enter_quick: unexpected mapping into 2MB page");
			if (pmap->pm_ptphint &&
			    (pmap->pm_ptphint->pindex == ptepindex)) {
				mpte = pmap->pm_ptphint;
			} else {
				mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
				pmap->pm_ptphint = mpte;
				vm_page_wakeup(mpte);
			}
			if (mpte)
				mpte->hold_count++;
		} else {
			mpte = _pmap_allocpte(pmap, ptepindex);
		}
	} while (mpte == NULL);

	/*
	 * Ok, now that the page table page has been validated, get the pte.
	 * If the pte is already mapped undo mpte's hold_count and
	 * just return.
	 */
	pte = pmap_pte(pmap, va);
	if (*pte & VPTE_V) {
		KKASSERT(mpte != NULL);
		pmap_unwire_pte_hold(pmap, va, mpte);
		pa = VM_PAGE_TO_PHYS(m);
		KKASSERT(((*pte ^ pa) & VPTE_FRAME) == 0);
		lwkt_reltoken(&vm_token);
		vm_object_drop(pmap->pm_pteobj);
		return;
	}

	/*
	 * Enter on the PV list if part of our managed memory
	 */
	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
		pmap_insert_entry(pmap, va, mpte, m);
		vm_page_flag_set(m, PG_MAPPED);
	}

	/*
	 * Increment counters
	 */
	++pmap->pm_stats.resident_count;

	pa = VM_PAGE_TO_PHYS(m);

	/*
	 * Now validate mapping with RO protection
	 */
	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
		*pte = (vpte_t)pa | VPTE_V | VPTE_U;
	else
		*pte = (vpte_t)pa | VPTE_V | VPTE_U | VPTE_MANAGED;
	/*pmap_inval_add(&info, pmap, va); shouldn't be needed 0->valid */
	/*pmap_inval_flush(&info); don't need for vkernel */
	lwkt_reltoken(&vm_token);
	vm_object_drop(pmap->pm_pteobj);
}
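
/*
 * Illustrative sketch: pmap_enter_quick() is the cheap path used by the
 * prefault code.  pmap_object_init_pt_callback() below invokes it for
 * each resident, fully valid page so that the first touch after exec or
 * mmap does not take a soft fault:
 *
 *	pmap_object_init_pt()
 *	    -> vm_page_rb_tree_RB_SCAN(...)
 *	        -> pmap_object_init_pt_callback(p, &info)
 *	            -> pmap_enter_quick(info->pmap, va, p)
 */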

/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 *
 * The caller is responsible for calling smp_invltlb().
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, long i)
{
	pmap_kenter_quick(crashdumpmap + (i * PAGE_SIZE), pa);
	return ((void *)crashdumpmap);
}

#define MAX_INIT_PT (96)

/*
 * This routine preloads the ptes for a given object into the specified pmap.
 * This eliminates the blast of soft faults on process startup and
 * immediately after an mmap.
 *
 * No requirements.
 */
static int pmap_object_init_pt_callback(vm_page_t p, void *data);

void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
		    vm_object_t object, vm_pindex_t pindex,
		    vm_size_t size, int limit)
{
	struct rb_vm_page_scan_info info;
	struct lwp *lp;
	vm_size_t psize;

	/*
	 * We can't preinit if read access isn't set or there is no pmap
	 * or object.
	 */
	if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
		return;

	/*
	 * We can't preinit if the pmap is not the current pmap.
	 */
	lp = curthread->td_lwp;
	if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
		return;

	psize = x86_64_btop(size);

	if ((object->type != OBJT_VNODE) ||
	    ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
	     (object->resident_page_count > MAX_INIT_PT))) {
		return;
	}

	if (psize + pindex > object->size) {
		if (object->size < pindex)
			return;
		psize = object->size - pindex;
	}

	if (psize == 0)
		return;

	/*
	 * Use a red-black scan to traverse the requested range and load
	 * any valid pages found into the pmap.
	 *
	 * We cannot safely scan the object's memq unless we are in a
	 * critical section since interrupts can remove pages from objects.
	 */
	info.start_pindex = pindex;
	info.end_pindex = pindex + psize - 1;
	info.limit = limit;
	info.mpte = NULL;
	info.addr = addr;
	info.pmap = pmap;

	vm_object_hold(object);
	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
				pmap_object_init_pt_callback, &info);
	vm_object_drop(object);
}

static
int
pmap_object_init_pt_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	vm_pindex_t rel_index;

	/*
	 * Don't let a prefault triggered by madvise allocate pv entries
	 * when we are below the free page reserve.
	 */
	if ((info->limit & MAP_PREFAULT_MADVISE) &&
	    vmstats.v_free_count < vmstats.v_free_reserved) {
		return(-1);
	}

	/*
	 * Ignore list markers and ignore pages we cannot instantly
	 * busy (while holding the object token).
	 */
	if (p->flags & PG_MARKER)
		return 0;
	if (vm_page_busy_try(p, TRUE))
		return 0;
	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
	    (p->flags & PG_FICTITIOUS) == 0) {
		if ((p->queue - p->pc) == PQ_CACHE)
			vm_page_deactivate(p);
		rel_index = p->pindex - info->start_pindex;
		pmap_enter_quick(info->pmap,
				 info->addr + x86_64_ptob(rel_index), p);
	}
	vm_page_wakeup(p);
	return(0);
}
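
/*
 * Illustrative sketch: with MAX_INIT_PT at 96, a MAP_PREFAULT_PARTIAL
 * request covering more than 96 pages of a vnode object that already
 * holds more than 96 resident pages is skipped entirely rather than
 * partially satisfied, and non-vnode objects are never preloaded.  The
 * normal fault path then populates the pmap on demand.
 */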

/*
 * Return TRUE if the pmap is in shape to trivially
 * pre-fault the specified address.
 *
 * Returns FALSE if it would be non-trivial or if a
 * pte is already loaded into the slot.
 *
 * No requirements.
 */
int
pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
{
	pt_entry_t *pte;
	pd_entry_t *pde;
	int ret;

	lwkt_gettoken(&vm_token);
	pde = pmap_pde(pmap, addr);
	if (pde == NULL || *pde == 0) {
		ret = 0;
	} else {
		pte = pmap_pde_to_pte(pde, addr);
		ret = (*pte) ? 0 : 1;
	}
	lwkt_reltoken(&vm_token);
	return (ret);
}

/*
 * Change the wiring attribute for a map/virtual-address pair.
 *
 * The mapping must already exist in the pmap.
 * No other requirements.
 */
void
pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
{
	pt_entry_t *pte;

	if (pmap == NULL)
		return;

	lwkt_gettoken(&vm_token);
	pte = pmap_pte(pmap, va);

	if (wired && !pmap_pte_w(pte))
		pmap->pm_stats.wired_count++;
	else if (!wired && pmap_pte_w(pte))
		pmap->pm_stats.wired_count--;

	/*
	 * Wiring is not a hardware characteristic so there is no need to
	 * invalidate TLB.  However, in an SMP environment we must use
	 * a locked bus cycle to update the pte (if we are not using
	 * the pmap_inval_*() API that is)... it's ok to do this for simple
	 * wiring changes.
	 */
	if (wired)
		atomic_set_long(pte, VPTE_WIRED);
	else
		atomic_clear_long(pte, VPTE_WIRED);
	lwkt_reltoken(&vm_token);
}

/*
 * Copy the range specified by src_addr/len
 * from the source map to the range dst_addr/len
 * in the destination map.
 *
 * This routine is only advisory and need not do anything.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
	  vm_size_t len, vm_offset_t src_addr)
{
	/*
	 * XXX BUGGY.  Among other things srcmpte is assumed to remain
	 * valid through blocking calls, and that's just not going to
	 * be the case.
	 *
	 * FIXME!
	 */
	return;
}

/*
 * pmap_zero_page:
 *
 *	Zero the specified physical page.
 *
 *	This function may be called from an interrupt and no locking is
 *	required.
 */
void
pmap_zero_page(vm_paddr_t phys)
{
	vm_offset_t va = PHYS_TO_DMAP(phys);

	bzero((void *)va, PAGE_SIZE);
}

/*
 * pmap_page_assertzero:
 *
 *	Assert that a page is empty, panic if it isn't.
 */
void
pmap_page_assertzero(vm_paddr_t phys)
{
	int i;

	crit_enter();
	vm_offset_t virt = PHYS_TO_DMAP(phys);

	for (i = 0; i < PAGE_SIZE; i += sizeof(int)) {
		if (*(int *)((char *)virt + i) != 0) {
			panic("pmap_page_assertzero() @ %p not zero!",
			      (void *)virt);
		}
	}
	crit_exit();
}

/*
 * pmap_zero_page_area:
 *
 *	Zero part of a physical page by mapping it into memory and clearing
 *	its contents with bzero.
 *
 *	off and size may not cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_paddr_t phys, int off, int size)
{
	crit_enter();
	vm_offset_t virt = PHYS_TO_DMAP(phys);
	bzero((char *)virt + off, size);
	crit_exit();
}

/*
 * pmap_copy_page:
 *
 *	Copy the physical page from the source PA to the target PA.
 *	This function may be called from an interrupt.  No locking
 *	is required.
 */
void
pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
{
	vm_offset_t src_virt, dst_virt;

	crit_enter();
	src_virt = PHYS_TO_DMAP(src);
	dst_virt = PHYS_TO_DMAP(dst);
	bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
	crit_exit();
}

/*
 * pmap_copy_page_frag:
 *
 *	Copy a fragment of the physical page from the source PA to the
 *	target PA.  The byte offsets within the page are taken from the
 *	low bits of src and dst.  This function may be called from an
 *	interrupt.  No locking is required.
 */
void
pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
{
	vm_offset_t src_virt, dst_virt;

	crit_enter();
	src_virt = PHYS_TO_DMAP(src);
	dst_virt = PHYS_TO_DMAP(dst);
	bcopy((char *)src_virt + (src & PAGE_MASK),
	      (char *)dst_virt + (dst & PAGE_MASK),
	      bytes);
	crit_exit();
}
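
/*
 * Illustrative sketch (hypothetical caller): the DMAP-based helpers
 * above are typically used for sub-page work, e.g. clearing the tail
 * of a partially valid page:
 *
 *	if (valid < PAGE_SIZE)
 *		pmap_zero_page_area(VM_PAGE_TO_PHYS(m), valid,
 *				    PAGE_SIZE - valid);
 *
 * Both helpers rely on every physical page being reachable through the
 * DMAP, so no temporary kernel mapping has to be created or torn down.
 */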

/*
 * Returns true if the pmap's pv is one of the first 16 pvs linked to
 * from this page.  This count may be changed upwards or downwards
 * in the future; it is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 *
 * No other requirements.
 */
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	pv_entry_t pv;
	int loops = 0;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return FALSE;

	crit_enter();
	lwkt_gettoken(&vm_token);

	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		if (pv->pv_pmap == pmap) {
			lwkt_reltoken(&vm_token);
			crit_exit();
			return TRUE;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	lwkt_reltoken(&vm_token);
	crit_exit();
	return (FALSE);
}

/*
 * Remove all pages from the specified address space; this aids process
 * exit speeds.  Also, this code is special-cased for the current process
 * only, but can have the more generic (and slightly slower) mode enabled.
 * This is much faster than pmap_remove in the case of running down an
 * entire address space.
 *
 * No other requirements.
 */
void
pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	pt_entry_t *pte, tpte;
	pv_entry_t pv, npv;
	vm_page_t m;
	int save_generation;

	if (pmap->pm_pteobj)
		vm_object_hold(pmap->pm_pteobj);
	lwkt_gettoken(&vm_token);

	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
		if (pv->pv_va >= eva || pv->pv_va < sva) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}

		KKASSERT(pmap == pv->pv_pmap);

		pte = pmap_pte(pmap, pv->pv_va);

		/*
		 * We cannot remove wired pages from a process' mapping
		 * at this time.
		 */
		if (*pte & VPTE_WIRED) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}
		tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va);

		m = PHYS_TO_VM_PAGE(tpte & VPTE_FRAME);

		KASSERT(m < &vm_page_array[vm_page_array_size],
			("pmap_remove_pages: bad tpte %lx", tpte));

		KKASSERT(pmap->pm_stats.resident_count > 0);
		--pmap->pm_stats.resident_count;

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (tpte & VPTE_M) {
			vm_page_dirty(m);
		}

		npv = TAILQ_NEXT(pv, pv_plist);
		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
		save_generation = ++pmap->pm_generation;

		m->md.pv_list_count--;
		atomic_add_int(&m->object->agg_pv_list_count, -1);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		if (TAILQ_EMPTY(&m->md.pv_list))
			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);

		pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
		free_pv_entry(pv);

		/*
		 * Restart the scan if we blocked during the unuse or free
		 * calls and other removals were made.
		 */
		if (save_generation != pmap->pm_generation) {
			kprintf("Warning: pmap_remove_pages race-A avoided\n");
			npv = TAILQ_FIRST(&pmap->pm_pvlist);
		}
	}
	lwkt_reltoken(&vm_token);
	if (pmap->pm_pteobj)
		vm_object_drop(pmap->pm_pteobj);
}
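
/*
 * Illustrative sketch: pm_generation is bumped on every pv removal.
 * Because pmap_unuse_pt()/free_pv_entry() can block, the scan above
 * snapshots the counter first; a mismatch afterwards means another
 * thread changed the pv list while we slept, so the scan restarts from
 * the list head instead of following a possibly stale npv pointer.
 */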

/*
 * pmap_testbit tests bits in active mappings of a VM page.
 */
static boolean_t
pmap_testbit(vm_page_t m, int bit)
{
	pv_entry_t pv;
	pt_entry_t *pte;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return FALSE;

	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
		return FALSE;

	crit_enter();

	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		/*
		 * If the bit being tested is the modified or accessed
		 * bit, skip mappings whose modified state we do not
		 * track.
		 */
		if (bit & (VPTE_A|VPTE_M)) {
			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (pv->pv_pmap == NULL) {
			kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va);
			continue;
		}
#endif
		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
		if (*pte & bit) {
			crit_exit();
			return TRUE;
		}
	}
	crit_exit();
	return (FALSE);
}

/*
 * This routine is used to clear bits in ptes.  Certain bits require special
 * handling, in particular (on virtual kernels) the VPTE_M (modify) bit.
 *
 * This routine is only called with certain VPTE_* bit combinations.
 */
static __inline void
pmap_clearbit(vm_page_t m, int bit)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pt_entry_t pbits;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return;

	crit_enter();

	/*
	 * Loop over all current mappings, setting/clearing as appropriate.
	 * If setting RO, do we need to clear the VAC?
	 */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		/*
		 * Don't write-protect pager mappings.
		 */
		if (bit == VPTE_W) {
			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (pv->pv_pmap == NULL) {
			kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va);
			continue;
		}
#endif

		/*
		 * Careful here.  We can use a locked bus instruction to
		 * clear VPTE_A or VPTE_M safely but we need to synchronize
		 * with the target cpus when we mess with VPTE_W.
		 *
		 * On virtual kernels we must force a new fault-on-write
		 * in the real kernel if we clear the Modify bit ourselves,
		 * otherwise the real kernel will not get a new fault and
		 * will never set our Modify bit again.
		 */
		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
		if (*pte & bit) {
			if (bit == VPTE_W) {
				/*
				 * We must also clear VPTE_M when clearing
				 * VPTE_W.
				 */
				pbits = pmap_clean_pte(pte, pv->pv_pmap,
						       pv->pv_va);
				if (pbits & VPTE_M)
					vm_page_dirty(m);
			} else if (bit == VPTE_M) {
				/*
				 * We do not have to make the page read-only
				 * when clearing the Modify bit.  The real
				 * kernel will make the real PTE read-only
				 * or otherwise detect the write and set
				 * our VPTE_M again simply by us invalidating
				 * the real kernel VA for the pmap (as we did
				 * above).  This allows the real kernel to
				 * handle the write fault without forwarding
				 * the fault to us.
				 */
				atomic_clear_long(pte, VPTE_M);
			} else if ((bit & (VPTE_W|VPTE_M)) == (VPTE_W|VPTE_M)) {
				/*
				 * We've been asked to clear W & M, I guess
				 * the caller doesn't want us to update
				 * the dirty status of the VM page.
				 */
				pmap_clean_pte(pte, pv->pv_pmap, pv->pv_va);
			} else {
				/*
				 * We've been asked to clear bits that do
				 * not interact with hardware.
				 */
				atomic_clear_long(pte, bit);
			}
		}
	}
	crit_exit();
}
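
/*
 * Illustrative sketch: the bit-specific cases above reduce to
 *
 *	bit == VPTE_W		clean the pte (also clears VPTE_M) and
 *				propagate any dirty state to the vm_page
 *	bit == VPTE_M		clear VPTE_M only; the real kernel will
 *				re-fault the next write to set it again
 *	bit == VPTE_W|VPTE_M	clean the pte, discarding dirty state
 *	anything else		plain atomic_clear_long() of the bit
 */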

/*
 * Lower the permission for all mappings to a given page.
 *
 * No other requirements.
 */
void
pmap_page_protect(vm_page_t m, vm_prot_t prot)
{
	/* JG NX support? */
	if ((prot & VM_PROT_WRITE) == 0) {
		lwkt_gettoken(&vm_token);
		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
			pmap_clearbit(m, VPTE_W);
			vm_page_flag_clear(m, PG_WRITEABLE);
		} else {
			pmap_remove_all(m);
		}
		lwkt_reltoken(&vm_token);
	}
}

vm_paddr_t
pmap_phys_address(vm_pindex_t ppn)
{
	return (x86_64_ptob(ppn));
}

/*
 * Return a count of reference bits for a page, clearing those bits.
 * It is not necessary for every reference bit to be cleared, but it
 * is necessary that 0 only be returned when there are truly no
 * reference bits set.
 *
 * XXX: The exact number of bits to check and clear is a matter that
 * should be tested and standardized at some point in the future for
 * optimal aging of shared pages.
 *
 * No other requirements.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	pv_entry_t pv, pvf, pvn;
	pt_entry_t *pte;
	int rtval = 0;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return (rtval);

	crit_enter();
	lwkt_gettoken(&vm_token);

	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pvf = pv;

		do {
			pvn = TAILQ_NEXT(pv, pv_list);

			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);

			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);

			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
				continue;

			pte = pmap_pte(pv->pv_pmap, pv->pv_va);

			if (pte && (*pte & VPTE_A)) {
#ifdef SMP
				atomic_clear_long(pte, VPTE_A);
#else
				atomic_clear_long_nonlocked(pte, VPTE_A);
#endif
				rtval++;
				if (rtval > 4) {
					break;
				}
			}
		} while ((pv = pvn) != NULL && pv != pvf);
	}
	lwkt_reltoken(&vm_token);
	crit_exit();

	return (rtval);
}

/*
 * Return whether or not the specified physical page was modified
 * in any physical maps.
 *
 * No other requirements.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
	boolean_t res;

	lwkt_gettoken(&vm_token);
	res = pmap_testbit(m, VPTE_M);
	lwkt_reltoken(&vm_token);
	return (res);
}

/*
 * Clear the modify bits on the specified physical page.
 *
 * No other requirements.
 */
void
pmap_clear_modify(vm_page_t m)
{
	lwkt_gettoken(&vm_token);
	pmap_clearbit(m, VPTE_M);
	lwkt_reltoken(&vm_token);
}

/*
 * Clear the reference bit on the specified physical page.
 *
 * No other requirements.
 */
void
pmap_clear_reference(vm_page_t m)
{
	lwkt_gettoken(&vm_token);
	pmap_clearbit(m, VPTE_A);
	lwkt_reltoken(&vm_token);
}

/*
 * Miscellaneous support routines follow
 */

static void
i386_protection_init(void)
{
	int *kp, prot;

	kp = protection_codes;
	for (prot = 0; prot < 8; prot++) {
		if (prot & VM_PROT_READ)
			*kp |= VPTE_R;
		if (prot & VM_PROT_WRITE)
			*kp |= VPTE_W;
		if (prot & VM_PROT_EXECUTE)
			*kp |= VPTE_X;
		++kp;
	}
}
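
/*
 * Illustrative sketch: the resulting protection_codes[] table maps each
 * VM_PROT_* combination to its VPTE bits, e.g.
 *
 *	VM_PROT_NONE			-> 0
 *	VM_PROT_READ			-> VPTE_R
 *	VM_PROT_READ|VM_PROT_WRITE	-> VPTE_R | VPTE_W
 *	VM_PROT_READ|VM_PROT_EXECUTE	-> VPTE_R | VPTE_X
 *
 * pte_prot() indexes this table with the low three protection bits when
 * pmap_enter() composes a new pte.
 */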

/*
 * Perform the pmap work for mincore.
 *
 * No other requirements.
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr)
{
	pt_entry_t *ptep, pte;
	vm_page_t m;
	int val = 0;

	lwkt_gettoken(&vm_token);
	ptep = pmap_pte(pmap, addr);

	if (ptep && (pte = *ptep) != 0) {
		vm_paddr_t pa;

		val = MINCORE_INCORE;
		if ((pte & VPTE_MANAGED) == 0)
			goto done;

		pa = pte & VPTE_FRAME;

		m = PHYS_TO_VM_PAGE(pa);

		/*
		 * Modified by us
		 */
		if (pte & VPTE_M)
			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
		/*
		 * Modified by someone
		 */
		else if (m->dirty || pmap_is_modified(m))
			val |= MINCORE_MODIFIED_OTHER;
		/*
		 * Referenced by us
		 */
		if (pte & VPTE_A)
			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
		/*
		 * Referenced by someone
		 */
		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
			val |= MINCORE_REFERENCED_OTHER;
			vm_page_flag_set(m, PG_REFERENCED);
		}
	}
done:
	lwkt_reltoken(&vm_token);
	return val;
}
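
/*
 * Illustrative sketch: the mincore bits assembled above distinguish
 * state seen in this pmap's pte from state recorded elsewhere:
 *
 *	VPTE_M set in our pte		-> MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER
 *	page dirty via another map	-> MINCORE_MODIFIED_OTHER only
 *	VPTE_A set in our pte		-> MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER
 *	referenced via another map	-> MINCORE_REFERENCED_OTHER only
 */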

/*
 * Replace p->p_vmspace with a new one.  If adjrefs is non-zero the new
 * vmspace will be ref'd and the old one will be deref'd.
 *
 * Caller must hold vmspace->vm_map.token for oldvm and newvm.
 */
void
pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
{
	struct vmspace *oldvm;
	struct lwp *lp;

	crit_enter();
	oldvm = p->p_vmspace;
	if (oldvm != newvm) {
		p->p_vmspace = newvm;
		KKASSERT(p->p_nthreads == 1);
		lp = RB_ROOT(&p->p_lwp_tree);
		pmap_setlwpvm(lp, newvm);
		if (adjrefs) {
			sysref_get(&newvm->vm_sysref);
			sysref_put(&oldvm->vm_sysref);
		}
	}
	crit_exit();
}

/*
 * Set the vmspace for a LWP.  The vmspace is almost universally set the
 * same as the process vmspace, but virtual kernels need to swap out contexts
 * on a per-lwp basis.
 */
void
pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
{
	struct vmspace *oldvm;
	struct pmap *pmap;

	crit_enter();
	oldvm = lp->lwp_vmspace;

	if (oldvm != newvm) {
		lp->lwp_vmspace = newvm;
		if (curthread->td_lwp == lp) {
			pmap = vmspace_pmap(newvm);
#if defined(SMP)
			atomic_set_cpumask(&pmap->pm_active, CPUMASK(mycpu->gd_cpuid));
#else
			pmap->pm_active |= 1;
#endif
#if defined(SWTCH_OPTIM_STATS)
			tlb_flush_count++;
#endif
			pmap = vmspace_pmap(oldvm);
#if defined(SMP)
			atomic_clear_cpumask(&pmap->pm_active,
					     CPUMASK(mycpu->gd_cpuid));
#else
			pmap->pm_active &= ~(cpumask_t)1;
#endif
		}
	}
	crit_exit();
}

vm_offset_t
pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
{
	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
		return addr;
	}

	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return addr;
}

/*
 * Used by kmalloc/kfree, page already exists at va
 */
vm_page_t
pmap_kvtom(vm_offset_t va)
{
	vpte_t *ptep;

	KKASSERT(va >= KvaStart && va < KvaEnd);
	ptep = vtopte(va);
	return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
}