1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008, 2009 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $ 45 */ 46 47 /* 48 * Manages physical address maps. 
49 */ 50 51 #include "opt_msgbuf.h" 52 53 #include <sys/param.h> 54 #include <sys/systm.h> 55 #include <sys/kernel.h> 56 #include <sys/proc.h> 57 #include <sys/msgbuf.h> 58 #include <sys/vmmeter.h> 59 #include <sys/mman.h> 60 #include <sys/vmspace.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_param.h> 64 #include <sys/sysctl.h> 65 #include <sys/lock.h> 66 #include <vm/vm_kern.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_map.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_extern.h> 71 #include <vm/vm_pageout.h> 72 #include <vm/vm_pager.h> 73 #include <vm/vm_zone.h> 74 75 #include <sys/user.h> 76 #include <sys/thread2.h> 77 #include <sys/spinlock2.h> 78 #include <vm/vm_page2.h> 79 80 #include <machine/cputypes.h> 81 #include <machine/md_var.h> 82 #include <machine/specialreg.h> 83 #include <machine/smp.h> 84 #include <machine/globaldata.h> 85 #include <machine/pmap.h> 86 #include <machine/pmap_inval.h> 87 88 #include <ddb/ddb.h> 89 90 #include <stdio.h> 91 #include <assert.h> 92 #include <stdlib.h> 93 #include <pthread.h> 94 95 #define PMAP_KEEP_PDIRS 96 #ifndef PMAP_SHPGPERPROC 97 #define PMAP_SHPGPERPROC 1000 98 #endif 99 100 #if defined(DIAGNOSTIC) 101 #define PMAP_DIAGNOSTIC 102 #endif 103 104 #define MINPV 2048 105 106 #if !defined(PMAP_DIAGNOSTIC) 107 #define PMAP_INLINE __inline 108 #else 109 #define PMAP_INLINE 110 #endif 111 112 /* 113 * Get PDEs and PTEs for user/kernel address space 114 */ 115 static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va); 116 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 117 118 #define pmap_pde_v(pte) ((*(pd_entry_t *)pte & VPTE_V) != 0) 119 #define pmap_pte_w(pte) ((*(pt_entry_t *)pte & VPTE_WIRED) != 0) 120 #define pmap_pte_m(pte) ((*(pt_entry_t *)pte & VPTE_M) != 0) 121 #define pmap_pte_u(pte) ((*(pt_entry_t *)pte & VPTE_A) != 0) 122 #define pmap_pte_v(pte) ((*(pt_entry_t *)pte & VPTE_V) != 0) 123 124 /* 125 * Given a map and a machine independent protection code, 126 * convert to a vax protection code. 127 */ 128 #define pte_prot(m, p) \ 129 (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 130 static uint64_t protection_codes[8]; 131 132 struct pmap kernel_pmap; 133 134 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ 135 136 static struct vm_object kptobj; 137 static int nkpt; 138 139 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 140 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 141 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 142 143 extern int vmm_enabled; 144 extern void *vkernel_stack; 145 146 /* 147 * Data for the pv entry allocation mechanism 148 */ 149 static vm_zone_t pvzone; 150 static struct vm_zone pvzone_store; 151 static int pv_entry_count = 0; 152 static int pv_entry_max = 0; 153 static int pv_entry_high_water = 0; 154 static int pmap_pagedaemon_waken = 0; 155 static struct pv_entry *pvinit; 156 157 /* 158 * All those kernel PT submaps that BSD is so fond of 159 */ 160 pt_entry_t *CMAP1 = NULL, *ptmmap; 161 caddr_t CADDR1 = NULL; 162 static pt_entry_t *msgbufmap; 163 164 uint64_t KPTphys; 165 166 static PMAP_INLINE void free_pv_entry (pv_entry_t pv); 167 static pv_entry_t get_pv_entry (void); 168 static void i386_protection_init (void); 169 static __inline void pmap_clearbit (vm_page_t m, int bit); 170 171 static void pmap_remove_all (vm_page_t m); 172 static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq, 173 pt_entry_t oldpte, vm_offset_t sva); 174 static void pmap_remove_page (struct pmap *pmap, vm_offset_t va); 175 static int pmap_remove_entry (struct pmap *pmap, vm_page_t m, 176 vm_offset_t va); 177 static boolean_t pmap_testbit (vm_page_t m, int bit); 178 static void pmap_insert_entry (pmap_t pmap, vm_offset_t va, 179 vm_page_t mpte, vm_page_t m, pv_entry_t); 180 181 static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va); 182 183 static int pmap_release_free_page (pmap_t pmap, vm_page_t p); 184 static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex); 185 static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex); 186 static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t); 187 188 static int 189 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 190 { 191 if (pv1->pv_va < pv2->pv_va) 192 return(-1); 193 if (pv1->pv_va > pv2->pv_va) 194 return(1); 195 return(0); 196 } 197 198 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 199 pv_entry_compare, vm_offset_t, pv_va); 200 201 static __inline vm_pindex_t 202 pmap_pt_pindex(vm_offset_t va) 203 { 204 return va >> PDRSHIFT; 205 } 206 207 static __inline vm_pindex_t 208 pmap_pte_index(vm_offset_t va) 209 { 210 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); 211 } 212 213 static __inline vm_pindex_t 214 pmap_pde_index(vm_offset_t va) 215 { 216 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 217 } 218 219 static __inline vm_pindex_t 220 pmap_pdpe_index(vm_offset_t va) 221 { 222 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 223 } 224 225 static __inline vm_pindex_t 226 pmap_pml4e_index(vm_offset_t va) 227 { 228 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 229 } 230 231 /* Return a pointer to the PML4 slot that corresponds to a VA */ 232 static __inline pml4_entry_t * 233 pmap_pml4e(pmap_t pmap, vm_offset_t va) 234 { 235 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 236 } 237 238 /* Return a pointer to the PDP slot that corresponds to a VA */ 239 static __inline pdp_entry_t * 240 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 241 { 242 pdp_entry_t *pdpe; 243 244 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & VPTE_FRAME); 245 return (&pdpe[pmap_pdpe_index(va)]); 246 } 247 248 /* Return a pointer to the PDP slot that corresponds to a VA */ 249 static __inline pdp_entry_t * 250 pmap_pdpe(pmap_t pmap, vm_offset_t va) 251 { 
252 pml4_entry_t *pml4e; 253 254 pml4e = pmap_pml4e(pmap, va); 255 if ((*pml4e & VPTE_V) == 0) 256 return NULL; 257 return (pmap_pml4e_to_pdpe(pml4e, va)); 258 } 259 260 /* Return a pointer to the PD slot that corresponds to a VA */ 261 static __inline pd_entry_t * 262 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 263 { 264 pd_entry_t *pde; 265 266 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & VPTE_FRAME); 267 return (&pde[pmap_pde_index(va)]); 268 } 269 270 /* Return a pointer to the PD slot that corresponds to a VA */ 271 static __inline pd_entry_t * 272 pmap_pde(pmap_t pmap, vm_offset_t va) 273 { 274 pdp_entry_t *pdpe; 275 276 pdpe = pmap_pdpe(pmap, va); 277 if (pdpe == NULL || (*pdpe & VPTE_V) == 0) 278 return NULL; 279 return (pmap_pdpe_to_pde(pdpe, va)); 280 } 281 282 /* Return a pointer to the PT slot that corresponds to a VA */ 283 static __inline pt_entry_t * 284 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 285 { 286 pt_entry_t *pte; 287 288 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & VPTE_FRAME); 289 return (&pte[pmap_pte_index(va)]); 290 } 291 292 /* 293 * Hold pt_m for page table scans to prevent it from getting reused out 294 * from under us across blocking conditions in the body of the loop. 295 */ 296 static __inline 297 vm_page_t 298 pmap_hold_pt_page(pd_entry_t *pde, vm_offset_t va) 299 { 300 pt_entry_t pte; 301 vm_page_t pt_m; 302 303 pte = (pt_entry_t)*pde; 304 KKASSERT(pte != 0); 305 pt_m = PHYS_TO_VM_PAGE(pte & VPTE_FRAME); 306 vm_page_hold(pt_m); 307 308 return pt_m; 309 } 310 311 /* Return a pointer to the PT slot that corresponds to a VA */ 312 static __inline pt_entry_t * 313 pmap_pte(pmap_t pmap, vm_offset_t va) 314 { 315 pd_entry_t *pde; 316 317 pde = pmap_pde(pmap, va); 318 if (pde == NULL || (*pde & VPTE_V) == 0) 319 return NULL; 320 if ((*pde & VPTE_PS) != 0) /* compat with i386 pmap_pte() */ 321 return ((pt_entry_t *)pde); 322 return (pmap_pde_to_pte(pde, va)); 323 } 324 325 static PMAP_INLINE pt_entry_t * 326 vtopte(vm_offset_t va) 327 { 328 pt_entry_t *x; 329 x = pmap_pte(&kernel_pmap, va); 330 assert(x != NULL); 331 return x; 332 } 333 334 static __inline pd_entry_t * 335 vtopde(vm_offset_t va) 336 { 337 pd_entry_t *x; 338 x = pmap_pde(&kernel_pmap, va); 339 assert(x != NULL); 340 return x; 341 } 342 343 static uint64_t 344 allocpages(vm_paddr_t *firstaddr, int n) 345 { 346 uint64_t ret; 347 348 ret = *firstaddr; 349 /*bzero((void *)ret, n * PAGE_SIZE); not mapped yet */ 350 *firstaddr += n * PAGE_SIZE; 351 return (ret); 352 } 353 354 static void 355 create_dmap_vmm(vm_paddr_t *firstaddr) 356 { 357 void *stack_addr; 358 int pml4_stack_index; 359 int pdp_stack_index; 360 int pd_stack_index; 361 long i,j; 362 int regs[4]; 363 int amd_feature; 364 365 uint64_t KPDP_DMAP_phys = allocpages(firstaddr, NDMPML4E); 366 uint64_t KPDP_VSTACK_phys = allocpages(firstaddr, 1); 367 uint64_t KPD_VSTACK_phys = allocpages(firstaddr, 1); 368 369 pml4_entry_t *KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 370 pdp_entry_t *KPDP_DMAP_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_DMAP_phys); 371 pdp_entry_t *KPDP_VSTACK_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_VSTACK_phys); 372 pd_entry_t *KPD_VSTACK_virt = (pd_entry_t *)PHYS_TO_DMAP(KPD_VSTACK_phys); 373 374 bzero(KPDP_DMAP_virt, NDMPML4E * PAGE_SIZE); 375 bzero(KPDP_VSTACK_virt, 1 * PAGE_SIZE); 376 bzero(KPD_VSTACK_virt, 1 * PAGE_SIZE); 377 378 do_cpuid(0x80000001, regs); 379 amd_feature = regs[3]; 380 381 /* Build the mappings for the first 512GB */ 382 if (amd_feature & AMDID_PAGE1GB) { 383 /* In pages of 1 GB, if supported 
*/ 384 for (i = 0; i < NPDPEPG; i++) { 385 KPDP_DMAP_virt[i] = ((uint64_t)i << PDPSHIFT); 386 KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_PS | VPTE_U; 387 } 388 } else { 389 /* In page of 2MB, otherwise */ 390 for (i = 0; i < NPDPEPG; i++) { 391 uint64_t KPD_DMAP_phys; 392 pd_entry_t *KPD_DMAP_virt; 393 394 KPD_DMAP_phys = allocpages(firstaddr, 1); 395 KPD_DMAP_virt = 396 (pd_entry_t *)PHYS_TO_DMAP(KPD_DMAP_phys); 397 398 bzero(KPD_DMAP_virt, PAGE_SIZE); 399 400 KPDP_DMAP_virt[i] = KPD_DMAP_phys; 401 KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_U; 402 403 /* For each PD, we have to allocate NPTEPG PT */ 404 for (j = 0; j < NPTEPG; j++) { 405 KPD_DMAP_virt[j] = (i << PDPSHIFT) | 406 (j << PDRSHIFT); 407 KPD_DMAP_virt[j] |= VPTE_RW | VPTE_V | 408 VPTE_PS | VPTE_U; 409 } 410 } 411 } 412 413 /* DMAP for the first 512G */ 414 KPML4virt[0] = KPDP_DMAP_phys; 415 KPML4virt[0] |= VPTE_RW | VPTE_V | VPTE_U; 416 417 /* create a 2 MB map of the new stack */ 418 pml4_stack_index = (uint64_t)&stack_addr >> PML4SHIFT; 419 KPML4virt[pml4_stack_index] = KPDP_VSTACK_phys; 420 KPML4virt[pml4_stack_index] |= VPTE_RW | VPTE_V | VPTE_U; 421 422 pdp_stack_index = ((uint64_t)&stack_addr & PML4MASK) >> PDPSHIFT; 423 KPDP_VSTACK_virt[pdp_stack_index] = KPD_VSTACK_phys; 424 KPDP_VSTACK_virt[pdp_stack_index] |= VPTE_RW | VPTE_V | VPTE_U; 425 426 pd_stack_index = ((uint64_t)&stack_addr & PDPMASK) >> PDRSHIFT; 427 KPD_VSTACK_virt[pd_stack_index] = (uint64_t) vkernel_stack; 428 KPD_VSTACK_virt[pd_stack_index] |= VPTE_RW | VPTE_V | VPTE_U | VPTE_PS; 429 } 430 431 static void 432 create_pagetables(vm_paddr_t *firstaddr, int64_t ptov_offset) 433 { 434 int i; 435 pml4_entry_t *KPML4virt; 436 pdp_entry_t *KPDPvirt; 437 pd_entry_t *KPDvirt; 438 pt_entry_t *KPTvirt; 439 int kpml4i = pmap_pml4e_index(ptov_offset); 440 int kpdpi = pmap_pdpe_index(ptov_offset); 441 int kpdi = pmap_pde_index(ptov_offset); 442 443 /* 444 * Calculate NKPT - number of kernel page tables. We have to 445 * accomodoate prealloction of the vm_page_array, dump bitmap, 446 * MSGBUF_SIZE, and other stuff. Be generous. 447 * 448 * Maxmem is in pages. 
449 */ 450 nkpt = (Maxmem * (sizeof(struct vm_page) * 2) + MSGBUF_SIZE) / NBPDR; 451 /* 452 * Allocate pages 453 */ 454 KPML4phys = allocpages(firstaddr, 1); 455 KPDPphys = allocpages(firstaddr, NKPML4E); 456 KPDphys = allocpages(firstaddr, NKPDPE); 457 KPTphys = allocpages(firstaddr, nkpt); 458 459 KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 460 KPDPvirt = (pdp_entry_t *)PHYS_TO_DMAP(KPDPphys); 461 KPDvirt = (pd_entry_t *)PHYS_TO_DMAP(KPDphys); 462 KPTvirt = (pt_entry_t *)PHYS_TO_DMAP(KPTphys); 463 464 bzero(KPML4virt, 1 * PAGE_SIZE); 465 bzero(KPDPvirt, NKPML4E * PAGE_SIZE); 466 bzero(KPDvirt, NKPDPE * PAGE_SIZE); 467 bzero(KPTvirt, nkpt * PAGE_SIZE); 468 469 /* Now map the page tables at their location within PTmap */ 470 for (i = 0; i < nkpt; i++) { 471 KPDvirt[i + kpdi] = KPTphys + (i << PAGE_SHIFT); 472 KPDvirt[i + kpdi] |= VPTE_RW | VPTE_V | VPTE_U; 473 } 474 475 /* And connect up the PD to the PDP */ 476 for (i = 0; i < NKPDPE; i++) { 477 KPDPvirt[i + kpdpi] = KPDphys + (i << PAGE_SHIFT); 478 KPDPvirt[i + kpdpi] |= VPTE_RW | VPTE_V | VPTE_U; 479 } 480 481 /* And recursively map PML4 to itself in order to get PTmap */ 482 KPML4virt[PML4PML4I] = KPML4phys; 483 KPML4virt[PML4PML4I] |= VPTE_RW | VPTE_V | VPTE_U; 484 485 /* Connect the KVA slot up to the PML4 */ 486 KPML4virt[kpml4i] = KPDPphys; 487 KPML4virt[kpml4i] |= VPTE_RW | VPTE_V | VPTE_U; 488 } 489 490 /* 491 * Typically used to initialize a fictitious page by vm/device_pager.c 492 */ 493 void 494 pmap_page_init(struct vm_page *m) 495 { 496 vm_page_init(m); 497 TAILQ_INIT(&m->md.pv_list); 498 } 499 500 /* 501 * Bootstrap the system enough to run with virtual memory. 502 * 503 * On the i386 this is called after mapping has already been enabled 504 * and just syncs the pmap module with what has already been done. 505 * [We can't call it easily with mapping off since the kernel is not 506 * mapped with PA == VA, hence we would have to relocate every address 507 * from the linked base (virtual) address "KERNBASE" to the actual 508 * (physical) address starting relative to 0] 509 */ 510 void 511 pmap_bootstrap(vm_paddr_t *firstaddr, int64_t ptov_offset) 512 { 513 vm_offset_t va; 514 pt_entry_t *pte; 515 516 /* 517 * Create an initial set of page tables to run the kernel in. 518 */ 519 create_pagetables(firstaddr, ptov_offset); 520 521 /* Create the DMAP for the VMM */ 522 if (vmm_enabled) { 523 create_dmap_vmm(firstaddr); 524 } 525 526 virtual_start = KvaStart; 527 virtual_end = KvaEnd; 528 529 /* 530 * Initialize protection array. 531 */ 532 i386_protection_init(); 533 534 /* 535 * The kernel's pmap is statically allocated so we don't have to use 536 * pmap_create, which is unlikely to work correctly at this part of 537 * the boot sequence (XXX and which no longer exists). 538 * 539 * The kernel_pmap's pm_pteobj is used only for locking and not 540 * for mmu pages. 541 */ 542 kernel_pmap.pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 543 kernel_pmap.pm_count = 1; 544 /* don't allow deactivation */ 545 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 546 kernel_pmap.pm_pteobj = NULL; /* see pmap_init */ 547 RB_INIT(&kernel_pmap.pm_pvroot); 548 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 549 550 /* 551 * Reserve some special page table entries/VA space for temporary 552 * mapping of pages. 553 */ 554 #define SYSMAP(c, p, v, n) \ 555 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 556 557 va = virtual_start; 558 pte = pmap_pte(&kernel_pmap, va); 559 /* 560 * CMAP1/CMAP2 are used for zeroing and copying pages. 
561 */ 562 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 563 564 #if JGV 565 /* 566 * Crashdump maps. 567 */ 568 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 569 #endif 570 571 /* 572 * ptvmmap is used for reading arbitrary physical pages via 573 * /dev/mem. 574 */ 575 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 576 577 /* 578 * msgbufp is used to map the system message buffer. 579 * XXX msgbufmap is not used. 580 */ 581 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 582 atop(round_page(MSGBUF_SIZE))) 583 584 virtual_start = va; 585 586 *CMAP1 = 0; 587 /* Not ready to do an invltlb yet for VMM*/ 588 if (!vmm_enabled) 589 cpu_invltlb(); 590 591 } 592 593 /* 594 * Initialize the pmap module. 595 * Called by vm_init, to initialize any structures that the pmap 596 * system needs to map virtual memory. 597 * pmap_init has been enhanced to support in a fairly consistant 598 * way, discontiguous physical memory. 599 */ 600 void 601 pmap_init(void) 602 { 603 int i; 604 int initial_pvs; 605 606 /* 607 * object for kernel page table pages 608 */ 609 /* JG I think the number can be arbitrary */ 610 vm_object_init(&kptobj, 5); 611 kernel_pmap.pm_pteobj = &kptobj; 612 613 /* 614 * Allocate memory for random pmap data structures. Includes the 615 * pv_head_table. 616 */ 617 for(i = 0; i < vm_page_array_size; i++) { 618 vm_page_t m; 619 620 m = &vm_page_array[i]; 621 TAILQ_INIT(&m->md.pv_list); 622 m->md.pv_list_count = 0; 623 } 624 625 /* 626 * init the pv free list 627 */ 628 initial_pvs = vm_page_array_size; 629 if (initial_pvs < MINPV) 630 initial_pvs = MINPV; 631 pvzone = &pvzone_store; 632 pvinit = (struct pv_entry *) 633 kmem_alloc(&kernel_map, 634 initial_pvs * sizeof (struct pv_entry), 635 VM_SUBSYS_PVENTRY); 636 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, 637 initial_pvs); 638 639 /* 640 * Now it is safe to enable pv_table recording. 641 */ 642 pmap_initialized = TRUE; 643 } 644 645 /* 646 * Initialize the address space (zone) for the pv_entries. Set a 647 * high water mark so that the system can recover from excessive 648 * numbers of pv entries. 649 */ 650 void 651 pmap_init2(void) 652 { 653 int shpgperproc = PMAP_SHPGPERPROC; 654 655 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 656 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 657 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 658 pv_entry_high_water = 9 * (pv_entry_max / 10); 659 zinitna(pvzone, NULL, 0, pv_entry_max, ZONE_INTERRUPT); 660 } 661 662 663 /*************************************************** 664 * Low level helper routines..... 665 ***************************************************/ 666 667 /* 668 * The modification bit is not tracked for any pages in this range. XXX 669 * such pages in this maps should always use pmap_k*() functions and not 670 * be managed anyhow. 671 * 672 * XXX User and kernel address spaces are independant for virtual kernels, 673 * this function only applies to the kernel pmap. 674 */ 675 int 676 pmap_track_modified(pmap_t pmap, vm_offset_t va) 677 { 678 if (pmap != &kernel_pmap) 679 return 1; 680 if ((va < clean_sva) || (va >= clean_eva)) 681 return 1; 682 else 683 return 0; 684 } 685 686 /* 687 * Extract the physical page address associated with the map/VA pair. 688 * 689 * No requirements. 
690 */ 691 vm_paddr_t 692 pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep) 693 { 694 vm_paddr_t rtval; 695 pt_entry_t *pte; 696 pd_entry_t pde, *pdep; 697 698 vm_object_hold(pmap->pm_pteobj); 699 rtval = 0; 700 pdep = pmap_pde(pmap, va); 701 if (pdep != NULL) { 702 pde = *pdep; 703 if (pde) { 704 if ((pde & VPTE_PS) != 0) { 705 /* JGV */ 706 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 707 } else { 708 pte = pmap_pde_to_pte(pdep, va); 709 rtval = (*pte & VPTE_FRAME) | (va & PAGE_MASK); 710 } 711 } 712 } 713 if (handlep) 714 *handlep = NULL; /* XXX */ 715 vm_object_drop(pmap->pm_pteobj); 716 717 return rtval; 718 } 719 720 void 721 pmap_extract_done(void *handle) 722 { 723 pmap_t pmap; 724 725 if (handle) { 726 pmap = handle; 727 vm_object_drop(pmap->pm_pteobj); 728 } 729 } 730 731 /* 732 * Similar to extract but checks protections, SMP-friendly short-cut for 733 * vm_fault_page[_quick](). 734 * 735 * WARNING! THE RETURNED PAGE IS ONLY HELD AND NEITHER IT NOR ITS TARGET 736 * DATA IS SUITABLE FOR WRITING. Writing can interfere with 737 * pageouts flushes, msync, etc. The hold_count is not enough 738 * to avoid races against pageouts and other flush code doesn't 739 * care about hold_count. 740 */ 741 vm_page_t 742 pmap_fault_page_quick(pmap_t pmap __unused, vm_offset_t vaddr __unused, 743 vm_prot_t prot __unused, int *busyp __unused) 744 { 745 return(NULL); 746 } 747 748 /* 749 * Routine: pmap_kextract 750 * Function: 751 * Extract the physical page address associated 752 * kernel virtual address. 753 */ 754 vm_paddr_t 755 pmap_kextract(vm_offset_t va) 756 { 757 pd_entry_t pde; 758 vm_paddr_t pa; 759 760 KKASSERT(va >= KvaStart && va < KvaEnd); 761 762 /* 763 * The DMAP region is not included in [KvaStart, KvaEnd) 764 */ 765 #if 0 766 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 767 pa = DMAP_TO_PHYS(va); 768 } else { 769 #endif 770 pde = *vtopde(va); 771 if (pde & VPTE_PS) { 772 /* JGV */ 773 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 774 } else { 775 /* 776 * Beware of a concurrent promotion that changes the 777 * PDE at this point! For example, vtopte() must not 778 * be used to access the PTE because it would use the 779 * new PDE. It is, however, safe to use the old PDE 780 * because the page table page is preserved by the 781 * promotion. 782 */ 783 pa = *pmap_pde_to_pte(&pde, va); 784 pa = (pa & VPTE_FRAME) | (va & PAGE_MASK); 785 } 786 #if 0 787 } 788 #endif 789 return pa; 790 } 791 792 /*************************************************** 793 * Low level mapping routines..... 794 ***************************************************/ 795 796 /* 797 * Enter a mapping into kernel_pmap. Mappings created in this fashion 798 * are not managed. Mappings must be immediately accessible on all cpus. 799 * 800 * Call pmap_inval_pte() to invalidate the virtual pte and clean out the 801 * real pmap and handle related races before storing the new vpte. The 802 * new semantics for kenter require use to do an UNCONDITIONAL invalidation, 803 * because the entry may have previously been cleared without an invalidation. 
804 */ 805 void 806 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 807 { 808 pt_entry_t *ptep; 809 pt_entry_t npte; 810 811 KKASSERT(va >= KvaStart && va < KvaEnd); 812 npte = pa | VPTE_RW | VPTE_V | VPTE_U; 813 ptep = vtopte(va); 814 815 #if 1 816 pmap_inval_pte(ptep, &kernel_pmap, va); 817 #else 818 if (*pte & VPTE_V) 819 pmap_inval_pte(ptep, &kernel_pmap, va); 820 #endif 821 atomic_swap_long(ptep, npte); 822 } 823 824 /* 825 * Enter an unmanaged KVA mapping for the private use of the current 826 * cpu only. 827 * 828 * It is illegal for the mapping to be accessed by other cpus without 829 * proper invalidation. 830 */ 831 int 832 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 833 { 834 pt_entry_t *ptep; 835 pt_entry_t npte; 836 int res; 837 838 KKASSERT(va >= KvaStart && va < KvaEnd); 839 840 npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; 841 ptep = vtopte(va); 842 843 #if 1 844 pmap_inval_pte_quick(ptep, &kernel_pmap, va); 845 res = 1; 846 #else 847 /* FUTURE */ 848 res = (*ptep != 0); 849 if (*pte & VPTE_V) 850 pmap_inval_pte(pte, &kernel_pmap, va); 851 #endif 852 atomic_swap_long(ptep, npte); 853 854 return res; 855 } 856 857 /* 858 * Invalidation will occur later, ok to be lazy here. 859 */ 860 int 861 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 862 { 863 pt_entry_t *ptep; 864 pt_entry_t npte; 865 int res; 866 867 KKASSERT(va >= KvaStart && va < KvaEnd); 868 869 npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; 870 ptep = vtopte(va); 871 #if 1 872 res = 1; 873 #else 874 /* FUTURE */ 875 res = (*ptep != 0); 876 #endif 877 atomic_swap_long(ptep, npte); 878 879 return res; 880 } 881 882 /* 883 * Remove an unmanaged mapping created with pmap_kenter*(). 884 */ 885 void 886 pmap_kremove(vm_offset_t va) 887 { 888 pt_entry_t *ptep; 889 890 KKASSERT(va >= KvaStart && va < KvaEnd); 891 892 ptep = vtopte(va); 893 atomic_swap_long(ptep, 0); 894 pmap_inval_pte(ptep, &kernel_pmap, va); 895 } 896 897 /* 898 * Remove an unmanaged mapping created with pmap_kenter*() but synchronize 899 * only with this cpu. 900 * 901 * Unfortunately because we optimize new entries by testing VPTE_V later 902 * on, we actually still have to synchronize with all the cpus. XXX maybe 903 * store a junk value and test against 0 in the other places instead? 904 */ 905 void 906 pmap_kremove_quick(vm_offset_t va) 907 { 908 pt_entry_t *ptep; 909 910 KKASSERT(va >= KvaStart && va < KvaEnd); 911 912 ptep = vtopte(va); 913 atomic_swap_long(ptep, 0); 914 pmap_inval_pte(ptep, &kernel_pmap, va); /* NOT _quick */ 915 } 916 917 /* 918 * Invalidation will occur later, ok to be lazy here. 919 */ 920 void 921 pmap_kremove_noinval(vm_offset_t va) 922 { 923 pt_entry_t *ptep; 924 925 KKASSERT(va >= KvaStart && va < KvaEnd); 926 927 ptep = vtopte(va); 928 atomic_swap_long(ptep, 0); 929 } 930 931 /* 932 * Used to map a range of physical addresses into kernel 933 * virtual address space. 934 * 935 * For now, VM is already on, we only need to map the 936 * specified memory. 937 */ 938 vm_offset_t 939 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 940 { 941 return PHYS_TO_DMAP(start); 942 } 943 944 /* 945 * Map a set of unmanaged VM pages into KVM. 
946 */ 947 static __inline void 948 _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval) 949 { 950 vm_offset_t end_va; 951 vm_offset_t va; 952 953 end_va = beg_va + count * PAGE_SIZE; 954 KKASSERT(beg_va >= KvaStart && end_va <= KvaEnd); 955 956 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 957 pt_entry_t *ptep; 958 959 ptep = vtopte(va); 960 atomic_swap_long(ptep, VM_PAGE_TO_PHYS(*m) | 961 VPTE_RW | VPTE_V | VPTE_U); 962 ++m; 963 } 964 if (doinval) 965 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 966 /* pmap_inval_pte(pte, &kernel_pmap, va); */ 967 } 968 969 void 970 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 971 { 972 _pmap_qenter(beg_va, m, count, 1); 973 } 974 975 void 976 pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count) 977 { 978 _pmap_qenter(beg_va, m, count, 0); 979 } 980 981 /* 982 * Undo the effects of pmap_qenter*(). 983 */ 984 void 985 pmap_qremove(vm_offset_t beg_va, int count) 986 { 987 vm_offset_t end_va; 988 vm_offset_t va; 989 990 end_va = beg_va + count * PAGE_SIZE; 991 KKASSERT(beg_va >= KvaStart && end_va < KvaEnd); 992 993 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 994 pt_entry_t *ptep; 995 996 ptep = vtopte(va); 997 atomic_swap_long(ptep, 0); 998 } 999 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1000 } 1001 1002 /* 1003 * Unlike the real pmap code, we can't avoid calling the real-kernel. 1004 */ 1005 void 1006 pmap_qremove_quick(vm_offset_t va, int count) 1007 { 1008 pmap_qremove(va, count); 1009 } 1010 1011 void 1012 pmap_qremove_noinval(vm_offset_t va, int count) 1013 { 1014 pmap_qremove(va, count); 1015 } 1016 1017 /* 1018 * This routine works like vm_page_lookup() but also blocks as long as the 1019 * page is busy. This routine does not busy the page it returns. 1020 * 1021 * Unless the caller is managing objects whos pages are in a known state, 1022 * the call should be made with a critical section held so the page's object 1023 * association remains valid on return. 1024 */ 1025 static vm_page_t 1026 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) 1027 { 1028 vm_page_t m; 1029 1030 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 1031 m = vm_page_lookup_busy_wait(object, pindex, TRUE, "pplookp"); 1032 1033 return(m); 1034 } 1035 1036 /* 1037 * Create a new thread and optionally associate it with a (new) process. 1038 * NOTE! the new thread's cpu may not equal the current cpu. 1039 */ 1040 void 1041 pmap_init_thread(thread_t td) 1042 { 1043 /* enforce pcb placement */ 1044 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 1045 td->td_savefpu = &td->td_pcb->pcb_save; 1046 td->td_sp = (char *)td->td_pcb - 16; /* JG is -16 needed on x86_64? */ 1047 } 1048 1049 /* 1050 * This routine directly affects the fork perf for a process. 1051 */ 1052 void 1053 pmap_init_proc(struct proc *p) 1054 { 1055 } 1056 1057 /* 1058 * Unwire a page table which has been removed from the pmap. We own the 1059 * wire_count, so the page cannot go away. The page representing the page 1060 * table is passed in unbusied and must be busied if we cannot trivially 1061 * unwire it. 1062 * 1063 * XXX NOTE! This code is not usually run because we do not currently 1064 * implement dynamic page table page removal. The page in 1065 * its parent assumes at least 1 wire count, so no call to this 1066 * function ever sees a wire count less than 2. 1067 */ 1068 static int 1069 pmap_unwire_pgtable(pmap_t pmap, vm_offset_t va, vm_page_t m) 1070 { 1071 /* 1072 * Try to unwire optimally. 
If non-zero is returned the wire_count 1073 * is 1 and we must busy the page to unwire it. 1074 */ 1075 if (vm_page_unwire_quick(m) == 0) 1076 return 0; 1077 1078 vm_page_busy_wait(m, TRUE, "pmuwpt"); 1079 KASSERT(m->queue == PQ_NONE, 1080 ("_pmap_unwire_pgtable: %p->queue != PQ_NONE", m)); 1081 1082 if (m->wire_count == 1) { 1083 /* 1084 * Unmap the page table page. 1085 */ 1086 /* pmap_inval_add(info, pmap, -1); */ 1087 1088 if (m->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1089 /* PDP page */ 1090 pml4_entry_t *pml4; 1091 pml4 = pmap_pml4e(pmap, va); 1092 *pml4 = 0; 1093 } else if (m->pindex >= NUPT_TOTAL) { 1094 /* PD page */ 1095 pdp_entry_t *pdp; 1096 pdp = pmap_pdpe(pmap, va); 1097 *pdp = 0; 1098 } else { 1099 /* PT page */ 1100 pd_entry_t *pd; 1101 pd = pmap_pde(pmap, va); 1102 *pd = 0; 1103 } 1104 1105 KKASSERT(pmap->pm_stats.resident_count > 0); 1106 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1107 1108 if (pmap->pm_ptphint == m) 1109 pmap->pm_ptphint = NULL; 1110 1111 if (m->pindex < NUPT_TOTAL) { 1112 /* We just released a PT, unhold the matching PD */ 1113 vm_page_t pdpg; 1114 1115 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & 1116 VPTE_FRAME); 1117 pmap_unwire_pgtable(pmap, va, pdpg); 1118 } 1119 if (m->pindex >= NUPT_TOTAL && 1120 m->pindex < (NUPT_TOTAL + NUPD_TOTAL)) { 1121 /* We just released a PD, unhold the matching PDP */ 1122 vm_page_t pdppg; 1123 1124 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & 1125 VPTE_FRAME); 1126 pmap_unwire_pgtable(pmap, va, pdppg); 1127 } 1128 1129 /* 1130 * This was our last wire, the page had better be unwired 1131 * after we decrement wire_count. 1132 * 1133 * FUTURE NOTE: shared page directory page could result in 1134 * multiple wire counts. 1135 */ 1136 vm_page_unwire(m, 0); 1137 KKASSERT(m->wire_count == 0); 1138 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1139 vm_page_flash(m); 1140 vm_page_free(m); 1141 return 1; 1142 } else { 1143 /* XXX SMP race to 1 if not holding vmobj */ 1144 vm_page_unwire(m, 0); 1145 vm_page_wakeup(m); 1146 return 0; 1147 } 1148 } 1149 1150 /* 1151 * After removing a page table entry, this routine is used to 1152 * conditionally free the page, and manage the hold/wire counts. 1153 * 1154 * If not NULL the caller owns a wire_count on mpte, so it can't disappear. 1155 * If NULL the caller owns a wire_count on what would be the mpte, we must 1156 * look it up. 1157 */ 1158 static int 1159 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 1160 { 1161 vm_pindex_t ptepindex; 1162 1163 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1164 1165 if (mpte == NULL) { 1166 /* 1167 * page table pages in the kernel_pmap are not managed. 1168 */ 1169 if (pmap == &kernel_pmap) 1170 return(0); 1171 ptepindex = pmap_pt_pindex(va); 1172 if (pmap->pm_ptphint && 1173 (pmap->pm_ptphint->pindex == ptepindex)) { 1174 mpte = pmap->pm_ptphint; 1175 } else { 1176 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1177 pmap->pm_ptphint = mpte; 1178 vm_page_wakeup(mpte); 1179 } 1180 } 1181 return pmap_unwire_pgtable(pmap, va, mpte); 1182 } 1183 1184 /* 1185 * Initialize pmap0/vmspace0 . Since process 0 never enters user mode we 1186 * just dummy it up so it works well enough for fork(). 1187 * 1188 * In DragonFly, process pmaps may only be used to manipulate user address 1189 * space, never kernel address space. 
1190 */ 1191 void 1192 pmap_pinit0(struct pmap *pmap) 1193 { 1194 pmap_pinit(pmap); 1195 } 1196 1197 /* 1198 * Initialize a preallocated and zeroed pmap structure, 1199 * such as one in a vmspace structure. 1200 */ 1201 void 1202 pmap_pinit(struct pmap *pmap) 1203 { 1204 vm_page_t ptdpg; 1205 1206 /* 1207 * No need to allocate page table space yet but we do need a valid 1208 * page directory table. 1209 */ 1210 if (pmap->pm_pml4 == NULL) { 1211 pmap->pm_pml4 = (pml4_entry_t *) 1212 kmem_alloc_pageable(&kernel_map, PAGE_SIZE, 1213 VM_SUBSYS_PML4); 1214 } 1215 1216 /* 1217 * Allocate an object for the ptes 1218 */ 1219 if (pmap->pm_pteobj == NULL) 1220 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL + 1); 1221 1222 /* 1223 * Allocate the page directory page, unless we already have 1224 * one cached. If we used the cached page the wire_count will 1225 * already be set appropriately. 1226 */ 1227 if ((ptdpg = pmap->pm_pdirm) == NULL) { 1228 ptdpg = vm_page_grab(pmap->pm_pteobj, 1229 NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL, 1230 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | 1231 VM_ALLOC_ZERO); 1232 pmap->pm_pdirm = ptdpg; 1233 vm_page_flag_clear(ptdpg, PG_MAPPED | PG_WRITEABLE); 1234 vm_page_wire(ptdpg); 1235 vm_page_wakeup(ptdpg); 1236 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); 1237 } 1238 pmap->pm_count = 1; 1239 CPUMASK_ASSZERO(pmap->pm_active); 1240 pmap->pm_ptphint = NULL; 1241 RB_INIT(&pmap->pm_pvroot); 1242 spin_init(&pmap->pm_spin, "pmapinit"); 1243 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1244 pmap->pm_stats.resident_count = 1; 1245 pmap->pm_stats.wired_count = 1; 1246 } 1247 1248 /* 1249 * Clean up a pmap structure so it can be physically freed. This routine 1250 * is called by the vmspace dtor function. A great deal of pmap data is 1251 * left passively mapped to improve vmspace management so we have a bit 1252 * of cleanup work to do here. 1253 * 1254 * No requirements. 1255 */ 1256 void 1257 pmap_puninit(pmap_t pmap) 1258 { 1259 vm_page_t p; 1260 1261 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 1262 if ((p = pmap->pm_pdirm) != NULL) { 1263 KKASSERT(pmap->pm_pml4 != NULL); 1264 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1265 vm_page_busy_wait(p, TRUE, "pgpun"); 1266 vm_page_unwire(p, 0); 1267 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1268 vm_page_free(p); 1269 pmap->pm_pdirm = NULL; 1270 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1271 KKASSERT(pmap->pm_stats.wired_count == 0); 1272 } 1273 if (pmap->pm_pml4) { 1274 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1275 pmap->pm_pml4 = NULL; 1276 } 1277 if (pmap->pm_pteobj) { 1278 vm_object_deallocate(pmap->pm_pteobj); 1279 pmap->pm_pteobj = NULL; 1280 } 1281 } 1282 1283 /* 1284 * This function is now unused (used to add the pmap to the pmap_list) 1285 */ 1286 void 1287 pmap_pinit2(struct pmap *pmap) 1288 { 1289 } 1290 1291 /* 1292 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1293 * 0 on failure (if the procedure had to sleep). 1294 * 1295 * When asked to remove the page directory page itself, we actually just 1296 * leave it cached so we do not have to incur the SMP inval overhead of 1297 * removing the kernel mapping. pmap_puninit() will take care of it. 1298 */ 1299 static int 1300 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1301 { 1302 /* 1303 * This code optimizes the case of freeing non-busy 1304 * page-table pages. Those pages are zero now, and 1305 * might as well be placed directly into the zero queue. 
1306 */ 1307 if (vm_page_busy_try(p, TRUE)) { 1308 vm_page_sleep_busy(p, TRUE, "pmaprl"); 1309 return 1; 1310 } 1311 1312 /* 1313 * Remove the page table page from the processes address space. 1314 */ 1315 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1316 /* 1317 * We are the pml4 table itself. 1318 */ 1319 /* XXX anything to do here? */ 1320 } else if (p->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1321 /* 1322 * We are a PDP page. 1323 * We look for the PML4 entry that points to us. 1324 */ 1325 vm_page_t m4; 1326 pml4_entry_t *pml4; 1327 int idx; 1328 1329 m4 = vm_page_lookup(pmap->pm_pteobj, 1330 NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 1331 KKASSERT(m4 != NULL); 1332 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1333 idx = (p->pindex - (NUPT_TOTAL + NUPD_TOTAL)) % NPML4EPG; 1334 KKASSERT(pml4[idx] != 0); 1335 if (pml4[idx] == 0) 1336 kprintf("pmap_release: Unmapped PML4\n"); 1337 pml4[idx] = 0; 1338 vm_page_unwire_quick(m4); 1339 } else if (p->pindex >= NUPT_TOTAL) { 1340 /* 1341 * We are a PD page. 1342 * We look for the PDP entry that points to us. 1343 */ 1344 vm_page_t m3; 1345 pdp_entry_t *pdp; 1346 int idx; 1347 1348 m3 = vm_page_lookup(pmap->pm_pteobj, 1349 NUPT_TOTAL + NUPD_TOTAL + 1350 (p->pindex - NUPT_TOTAL) / NPDPEPG); 1351 KKASSERT(m3 != NULL); 1352 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); 1353 idx = (p->pindex - NUPT_TOTAL) % NPDPEPG; 1354 KKASSERT(pdp[idx] != 0); 1355 if (pdp[idx] == 0) 1356 kprintf("pmap_release: Unmapped PDP %d\n", idx); 1357 pdp[idx] = 0; 1358 vm_page_unwire_quick(m3); 1359 } else { 1360 /* We are a PT page. 1361 * We look for the PD entry that points to us. 1362 */ 1363 vm_page_t m2; 1364 pd_entry_t *pd; 1365 int idx; 1366 1367 m2 = vm_page_lookup(pmap->pm_pteobj, 1368 NUPT_TOTAL + p->pindex / NPDEPG); 1369 KKASSERT(m2 != NULL); 1370 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); 1371 idx = p->pindex % NPDEPG; 1372 if (pd[idx] == 0) 1373 kprintf("pmap_release: Unmapped PD %d\n", idx); 1374 pd[idx] = 0; 1375 vm_page_unwire_quick(m2); 1376 } 1377 KKASSERT(pmap->pm_stats.resident_count > 0); 1378 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1379 1380 if (p->wire_count > 1) { 1381 panic("pmap_release: freeing held pt page " 1382 "pmap=%p pg=%p dmap=%p pi=%ld {%ld,%ld,%ld}", 1383 pmap, p, (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(p)), 1384 p->pindex, NUPT_TOTAL, NUPD_TOTAL, NUPDP_TOTAL); 1385 } 1386 1387 if (pmap->pm_ptphint == p) 1388 pmap->pm_ptphint = NULL; 1389 1390 /* 1391 * We leave the top-level page table page cached, wired, and mapped in 1392 * the pmap until the dtor function (pmap_puninit()) gets called. 1393 * However, still clean it up. 1394 */ 1395 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1396 bzero(pmap->pm_pml4, PAGE_SIZE); 1397 vm_page_wakeup(p); 1398 } else { 1399 vm_page_unwire(p, 0); 1400 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1401 vm_page_free(p); 1402 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1403 } 1404 return 0; 1405 } 1406 1407 /* 1408 * Locate the requested PT, PD, or PDP page table page. 1409 * 1410 * Returns a busied page, caller must vm_page_wakeup() when done. 1411 */ 1412 static vm_page_t 1413 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) 1414 { 1415 vm_page_t m; 1416 vm_page_t pm; 1417 vm_pindex_t pindex; 1418 pt_entry_t *ptep; 1419 pt_entry_t data; 1420 1421 /* 1422 * Find or fabricate a new pagetable page. A non-zero wire_count 1423 * indicates that the page has already been mapped into its parent. 
1424 */ 1425 m = vm_page_grab(pmap->pm_pteobj, ptepindex, 1426 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); 1427 if (m->wire_count != 0) 1428 return m; 1429 1430 /* 1431 * Map the page table page into its parent, giving it 1 wire count. 1432 */ 1433 vm_page_wire(m); 1434 vm_page_unmanage(m); 1435 atomic_add_long(&pmap->pm_stats.resident_count, 1); 1436 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 1437 1438 data = VM_PAGE_TO_PHYS(m) | 1439 VPTE_RW | VPTE_V | VPTE_U | VPTE_A | VPTE_M | VPTE_WIRED; 1440 atomic_add_long(&pmap->pm_stats.wired_count, 1); 1441 1442 if (ptepindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1443 /* 1444 * Map PDP into the PML4 1445 */ 1446 pindex = ptepindex - (NUPT_TOTAL + NUPD_TOTAL); 1447 pindex &= (NUPDP_TOTAL - 1); 1448 ptep = (pt_entry_t *)pmap->pm_pml4; 1449 pm = NULL; 1450 } else if (ptepindex >= NUPT_TOTAL) { 1451 /* 1452 * Map PD into its PDP 1453 */ 1454 pindex = (ptepindex - NUPT_TOTAL) >> NPDPEPGSHIFT; 1455 pindex += NUPT_TOTAL + NUPD_TOTAL; 1456 pm = _pmap_allocpte(pmap, pindex); 1457 pindex = (ptepindex - NUPT_TOTAL) & (NPDPEPG - 1); 1458 ptep = (void *)PHYS_TO_DMAP(pm->phys_addr); 1459 } else { 1460 /* 1461 * Map PT into its PD 1462 */ 1463 pindex = ptepindex >> NPDPEPGSHIFT; 1464 pindex += NUPT_TOTAL; 1465 pm = _pmap_allocpte(pmap, pindex); 1466 pindex = ptepindex & (NPTEPG - 1); 1467 ptep = (void *)PHYS_TO_DMAP(pm->phys_addr); 1468 } 1469 1470 /* 1471 * Install the pte in (pm). (m) prevents races. 1472 */ 1473 ptep += pindex; 1474 data = atomic_swap_long(ptep, data); 1475 if (pm) { 1476 vm_page_wire_quick(pm); 1477 vm_page_wakeup(pm); 1478 } 1479 pmap->pm_ptphint = pm; 1480 1481 return m; 1482 } 1483 1484 /* 1485 * Determine the page table page required to access the VA in the pmap 1486 * and allocate it if necessary. Return a held vm_page_t for the page. 1487 * 1488 * Only used with user pmaps. 1489 */ 1490 static vm_page_t 1491 pmap_allocpte(pmap_t pmap, vm_offset_t va) 1492 { 1493 vm_pindex_t ptepindex; 1494 vm_page_t m; 1495 1496 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1497 1498 /* 1499 * Calculate pagetable page index, and return the PT page to 1500 * the caller. 1501 */ 1502 ptepindex = pmap_pt_pindex(va); 1503 m = _pmap_allocpte(pmap, ptepindex); 1504 1505 return m; 1506 } 1507 1508 /*************************************************** 1509 * Pmap allocation/deallocation routines. 1510 ***************************************************/ 1511 1512 /* 1513 * Release any resources held by the given physical map. 1514 * Called when a pmap initialized by pmap_pinit is being released. 1515 * Should only be called if the map contains no valid mappings. 1516 */ 1517 static int pmap_release_callback(struct vm_page *p, void *data); 1518 1519 void 1520 pmap_release(struct pmap *pmap) 1521 { 1522 vm_object_t object = pmap->pm_pteobj; 1523 struct rb_vm_page_scan_info info; 1524 1525 KKASSERT(pmap != &kernel_pmap); 1526 1527 #if defined(DIAGNOSTIC) 1528 if (object->ref_count != 1) 1529 panic("pmap_release: pteobj reference count != 1"); 1530 #endif 1531 1532 info.pmap = pmap; 1533 info.object = object; 1534 1535 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 1536 ("pmap %p still active! 
%016jx", 1537 pmap, 1538 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 1539 1540 vm_object_hold(object); 1541 do { 1542 info.error = 0; 1543 info.mpte = NULL; 1544 info.limit = object->generation; 1545 1546 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 1547 pmap_release_callback, &info); 1548 if (info.error == 0 && info.mpte) { 1549 if (pmap_release_free_page(pmap, info.mpte)) 1550 info.error = 1; 1551 } 1552 } while (info.error); 1553 1554 pmap->pm_ptphint = NULL; 1555 1556 KASSERT((pmap->pm_stats.wired_count == (pmap->pm_pdirm != NULL)), 1557 ("pmap_release: dangling count %p %ld", 1558 pmap, pmap->pm_stats.wired_count)); 1559 1560 vm_object_drop(object); 1561 } 1562 1563 static int 1564 pmap_release_callback(struct vm_page *p, void *data) 1565 { 1566 struct rb_vm_page_scan_info *info = data; 1567 1568 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1569 info->mpte = p; 1570 return(0); 1571 } 1572 if (pmap_release_free_page(info->pmap, p)) { 1573 info->error = 1; 1574 return(-1); 1575 } 1576 if (info->object->generation != info->limit) { 1577 info->error = 1; 1578 return(-1); 1579 } 1580 return(0); 1581 } 1582 1583 /* 1584 * Grow the number of kernel page table entries, if needed. 1585 * 1586 * kernel_map must be locked exclusively by the caller. 1587 */ 1588 void 1589 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 1590 { 1591 vm_offset_t addr; 1592 vm_paddr_t paddr; 1593 vm_offset_t ptppaddr; 1594 vm_page_t nkpg; 1595 pd_entry_t *pde, newpdir; 1596 pdp_entry_t newpdp; 1597 1598 addr = kend; 1599 1600 vm_object_hold(&kptobj); 1601 if (kernel_vm_end == 0) { 1602 kernel_vm_end = KvaStart; 1603 nkpt = 0; 1604 while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & VPTE_V) != 0) { 1605 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1606 nkpt++; 1607 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1608 kernel_vm_end = kernel_map.max_offset; 1609 break; 1610 } 1611 } 1612 } 1613 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1614 if (addr - 1 >= kernel_map.max_offset) 1615 addr = kernel_map.max_offset; 1616 while (kernel_vm_end < addr) { 1617 pde = pmap_pde(&kernel_pmap, kernel_vm_end); 1618 if (pde == NULL) { 1619 /* We need a new PDP entry */ 1620 nkpg = vm_page_alloc(&kptobj, nkpt, 1621 VM_ALLOC_NORMAL | 1622 VM_ALLOC_SYSTEM | 1623 VM_ALLOC_INTERRUPT); 1624 if (nkpg == NULL) { 1625 panic("pmap_growkernel: no memory to " 1626 "grow kernel"); 1627 } 1628 paddr = VM_PAGE_TO_PHYS(nkpg); 1629 pmap_zero_page(paddr); 1630 newpdp = (pdp_entry_t)(paddr | 1631 VPTE_V | VPTE_RW | VPTE_U | 1632 VPTE_A | VPTE_M | VPTE_WIRED); 1633 *pmap_pdpe(&kernel_pmap, kernel_vm_end) = newpdp; 1634 atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1); 1635 nkpt++; 1636 continue; /* try again */ 1637 } 1638 if ((*pde & VPTE_V) != 0) { 1639 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 1640 ~(PAGE_SIZE * NPTEPG - 1); 1641 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1642 kernel_vm_end = kernel_map.max_offset; 1643 break; 1644 } 1645 continue; 1646 } 1647 1648 /* 1649 * This index is bogus, but out of the way 1650 */ 1651 nkpg = vm_page_alloc(&kptobj, nkpt, 1652 VM_ALLOC_NORMAL | 1653 VM_ALLOC_SYSTEM | 1654 VM_ALLOC_INTERRUPT); 1655 if (nkpg == NULL) 1656 panic("pmap_growkernel: no memory to grow kernel"); 1657 1658 vm_page_wire(nkpg); 1659 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1660 pmap_zero_page(ptppaddr); 1661 newpdir = (pd_entry_t)(ptppaddr | 1662 VPTE_V | VPTE_RW | VPTE_U | 1663 VPTE_A | VPTE_M | VPTE_WIRED); 1664 *pmap_pde(&kernel_pmap, kernel_vm_end) = 
newpdir; 1665 atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1); 1666 nkpt++; 1667 1668 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 1669 ~(PAGE_SIZE * NPTEPG - 1); 1670 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1671 kernel_vm_end = kernel_map.max_offset; 1672 break; 1673 } 1674 } 1675 vm_object_drop(&kptobj); 1676 } 1677 1678 /* 1679 * Add a reference to the specified pmap. 1680 * 1681 * No requirements. 1682 */ 1683 void 1684 pmap_reference(pmap_t pmap) 1685 { 1686 if (pmap) 1687 atomic_add_int(&pmap->pm_count, 1); 1688 } 1689 1690 /************************************************************************ 1691 * VMSPACE MANAGEMENT * 1692 ************************************************************************ 1693 * 1694 * The VMSPACE management we do in our virtual kernel must be reflected 1695 * in the real kernel. This is accomplished by making vmspace system 1696 * calls to the real kernel. 1697 */ 1698 void 1699 cpu_vmspace_alloc(struct vmspace *vm) 1700 { 1701 int r; 1702 void *rp; 1703 vpte_t vpte; 1704 1705 /* 1706 * If VMM enable, don't do nothing, we 1707 * are able to use real page tables 1708 */ 1709 if (vmm_enabled) 1710 return; 1711 1712 #define USER_SIZE (VM_MAX_USER_ADDRESS - VM_MIN_USER_ADDRESS) 1713 1714 if (vmspace_create(&vm->vm_pmap, 0, NULL) < 0) 1715 panic("vmspace_create() failed"); 1716 1717 rp = vmspace_mmap(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1718 PROT_READ|PROT_WRITE|PROT_EXEC, 1719 MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED, 1720 MemImageFd, 0); 1721 if (rp == MAP_FAILED) 1722 panic("vmspace_mmap: failed"); 1723 vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1724 MADV_NOSYNC, 0); 1725 vpte = VM_PAGE_TO_PHYS(vmspace_pmap(vm)->pm_pdirm) | 1726 VPTE_RW | VPTE_V | VPTE_U; 1727 r = vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1728 MADV_SETMAP, vpte); 1729 if (r < 0) 1730 panic("vmspace_mcontrol: failed"); 1731 } 1732 1733 void 1734 cpu_vmspace_free(struct vmspace *vm) 1735 { 1736 /* 1737 * If VMM enable, don't do nothing, we 1738 * are able to use real page tables 1739 */ 1740 if (vmm_enabled) 1741 return; 1742 1743 if (vmspace_destroy(&vm->vm_pmap) < 0) 1744 panic("vmspace_destroy() failed"); 1745 } 1746 1747 /*************************************************** 1748 * page management routines. 1749 ***************************************************/ 1750 1751 /* 1752 * free the pv_entry back to the free list. This function may be 1753 * called from an interrupt. 1754 */ 1755 static __inline void 1756 free_pv_entry(pv_entry_t pv) 1757 { 1758 atomic_add_int(&pv_entry_count, -1); 1759 KKASSERT(pv_entry_count >= 0); 1760 zfree(pvzone, pv); 1761 } 1762 1763 /* 1764 * get a new pv_entry, allocating a block from the system 1765 * when needed. This function may be called from an interrupt. 1766 */ 1767 static pv_entry_t 1768 get_pv_entry(void) 1769 { 1770 atomic_add_int(&pv_entry_count, 1); 1771 if (pv_entry_high_water && 1772 (pv_entry_count > pv_entry_high_water) && 1773 atomic_swap_int(&pmap_pagedaemon_waken, 1) == 0) { 1774 wakeup(&vm_pages_needed); 1775 } 1776 return zalloc(pvzone); 1777 } 1778 1779 /* 1780 * This routine is very drastic, but can save the system 1781 * in a pinch. 1782 * 1783 * No requirements. 
1784 */ 1785 void 1786 pmap_collect(void) 1787 { 1788 int i; 1789 vm_page_t m; 1790 static int warningdone=0; 1791 1792 if (pmap_pagedaemon_waken == 0) 1793 return; 1794 pmap_pagedaemon_waken = 0; 1795 1796 if (warningdone < 5) { 1797 kprintf("pmap_collect: collecting pv entries -- " 1798 "suggest increasing PMAP_SHPGPERPROC\n"); 1799 warningdone++; 1800 } 1801 1802 for (i = 0; i < vm_page_array_size; i++) { 1803 m = &vm_page_array[i]; 1804 if (m->wire_count || m->hold_count) 1805 continue; 1806 if (vm_page_busy_try(m, TRUE) == 0) { 1807 if (m->wire_count == 0 && m->hold_count == 0) { 1808 pmap_remove_all(m); 1809 } 1810 vm_page_wakeup(m); 1811 } 1812 } 1813 } 1814 1815 1816 /* 1817 * If it is the first entry on the list, it is actually 1818 * in the header and we must copy the following entry up 1819 * to the header. Otherwise we must search the list for 1820 * the entry. In either case we free the now unused entry. 1821 * 1822 * pmap->pm_pteobj must be held and (m) must be spin-locked by the caller. 1823 */ 1824 static int 1825 pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va) 1826 { 1827 pv_entry_t pv; 1828 int rtval; 1829 1830 vm_page_spin_lock(m); 1831 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, va); 1832 1833 /* 1834 * Note that pv_ptem is NULL if the page table page itself is not 1835 * managed, even if the page being removed IS managed. 1836 */ 1837 rtval = 0; 1838 if (pv) { 1839 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1840 if (TAILQ_EMPTY(&m->md.pv_list)) 1841 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1842 m->md.pv_list_count--; 1843 KKASSERT(m->md.pv_list_count >= 0); 1844 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 1845 atomic_add_int(&pmap->pm_generation, 1); 1846 vm_page_spin_unlock(m); 1847 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); 1848 free_pv_entry(pv); 1849 } else { 1850 vm_page_spin_unlock(m); 1851 kprintf("pmap_remove_entry: could not find " 1852 "pmap=%p m=%p va=%016jx\n", 1853 pmap, m, va); 1854 } 1855 return rtval; 1856 } 1857 1858 /* 1859 * Create a pv entry for page at pa for (pmap, va). If the page table page 1860 * holding the VA is managed, mpte will be non-NULL. 1861 * 1862 * pmap->pm_pteobj must be held and (m) must be spin-locked by the caller. 1863 */ 1864 static void 1865 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m, 1866 pv_entry_t pv) 1867 { 1868 pv->pv_va = va; 1869 pv->pv_pmap = pmap; 1870 pv->pv_ptem = mpte; 1871 1872 m->md.pv_list_count++; 1873 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1874 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pv); 1875 vm_page_flag_set(m, PG_MAPPED); 1876 KKASSERT(pv == NULL); 1877 } 1878 1879 /* 1880 * pmap_remove_pte: do the things to unmap a page in a process 1881 * 1882 * Caller holds pmap->pm_pteobj and holds the associated page table 1883 * page busy to prevent races. 1884 */ 1885 static int 1886 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, pt_entry_t oldpte, 1887 vm_offset_t va) 1888 { 1889 vm_page_t m; 1890 int error; 1891 1892 if (ptq) 1893 oldpte = pmap_inval_loadandclear(ptq, pmap, va); 1894 1895 if (oldpte & VPTE_WIRED) 1896 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1897 KKASSERT(pmap->pm_stats.wired_count >= 0); 1898 1899 #if 0 1900 /* 1901 * Machines that don't support invlpg, also don't support 1902 * PG_G. XXX PG_G is disabled for SMP so don't worry about 1903 * the SMP case. 
1904 */ 1905 if (oldpte & PG_G) 1906 cpu_invlpg((void *)va); 1907 #endif 1908 KKASSERT(pmap->pm_stats.resident_count > 0); 1909 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1910 if (oldpte & VPTE_MANAGED) { 1911 m = PHYS_TO_VM_PAGE(oldpte); 1912 1913 /* 1914 * NOTE: pmap_remove_entry() will spin-lock the page 1915 */ 1916 if (oldpte & VPTE_M) { 1917 #if defined(PMAP_DIAGNOSTIC) 1918 if (pmap_nw_modified(oldpte)) { 1919 kprintf("pmap_remove: modified page not " 1920 "writable: va: 0x%lx, pte: 0x%lx\n", 1921 va, oldpte); 1922 } 1923 #endif 1924 if (pmap_track_modified(pmap, va)) 1925 vm_page_dirty(m); 1926 } 1927 if (oldpte & VPTE_A) 1928 vm_page_flag_set(m, PG_REFERENCED); 1929 error = pmap_remove_entry(pmap, m, va); 1930 } else { 1931 error = pmap_unuse_pt(pmap, va, NULL); 1932 } 1933 return error; 1934 } 1935 1936 /* 1937 * pmap_remove_page: 1938 * 1939 * Remove a single page from a process address space. 1940 * 1941 * This function may not be called from an interrupt if the pmap is 1942 * not kernel_pmap. 1943 * 1944 * Caller holds pmap->pm_pteobj 1945 */ 1946 static void 1947 pmap_remove_page(struct pmap *pmap, vm_offset_t va) 1948 { 1949 pt_entry_t *pte; 1950 1951 pte = pmap_pte(pmap, va); 1952 if (pte == NULL) 1953 return; 1954 if ((*pte & VPTE_V) == 0) 1955 return; 1956 pmap_remove_pte(pmap, pte, 0, va); 1957 } 1958 1959 /* 1960 * Remove the given range of addresses from the specified map. 1961 * 1962 * It is assumed that the start and end are properly rounded to 1963 * the page size. 1964 * 1965 * This function may not be called from an interrupt if the pmap is 1966 * not kernel_pmap. 1967 * 1968 * No requirements. 1969 */ 1970 void 1971 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 1972 { 1973 vm_offset_t va_next; 1974 pml4_entry_t *pml4e; 1975 pdp_entry_t *pdpe; 1976 pd_entry_t ptpaddr, *pde; 1977 pt_entry_t *pte; 1978 vm_page_t pt_m; 1979 1980 if (pmap == NULL) 1981 return; 1982 1983 vm_object_hold(pmap->pm_pteobj); 1984 KKASSERT(pmap->pm_stats.resident_count >= 0); 1985 if (pmap->pm_stats.resident_count == 0) { 1986 vm_object_drop(pmap->pm_pteobj); 1987 return; 1988 } 1989 1990 /* 1991 * special handling of removing one page. a very 1992 * common operation and easy to short circuit some 1993 * code. 1994 */ 1995 if (sva + PAGE_SIZE == eva) { 1996 pde = pmap_pde(pmap, sva); 1997 if (pde && (*pde & VPTE_PS) == 0) { 1998 pmap_remove_page(pmap, sva); 1999 vm_object_drop(pmap->pm_pteobj); 2000 return; 2001 } 2002 } 2003 2004 for (; sva < eva; sva = va_next) { 2005 pml4e = pmap_pml4e(pmap, sva); 2006 if ((*pml4e & VPTE_V) == 0) { 2007 va_next = (sva + NBPML4) & ~PML4MASK; 2008 if (va_next < sva) 2009 va_next = eva; 2010 continue; 2011 } 2012 2013 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2014 if ((*pdpe & VPTE_V) == 0) { 2015 va_next = (sva + NBPDP) & ~PDPMASK; 2016 if (va_next < sva) 2017 va_next = eva; 2018 continue; 2019 } 2020 2021 /* 2022 * Calculate index for next page table. 2023 */ 2024 va_next = (sva + NBPDR) & ~PDRMASK; 2025 if (va_next < sva) 2026 va_next = eva; 2027 2028 pde = pmap_pdpe_to_pde(pdpe, sva); 2029 ptpaddr = *pde; 2030 2031 /* 2032 * Weed out invalid mappings. 2033 */ 2034 if (ptpaddr == 0) 2035 continue; 2036 2037 /* 2038 * Check for large page. 
2039 */ 2040 if ((ptpaddr & VPTE_PS) != 0) { 2041 /* JG FreeBSD has more complex treatment here */ 2042 KKASSERT(*pde != 0); 2043 pmap_inval_pde(pde, pmap, sva); 2044 atomic_add_long(&pmap->pm_stats.resident_count, 2045 -NBPDR / PAGE_SIZE); 2046 continue; 2047 } 2048 2049 /* 2050 * Limit our scan to either the end of the va represented 2051 * by the current page table page, or to the end of the 2052 * range being removed. 2053 */ 2054 if (va_next > eva) 2055 va_next = eva; 2056 2057 /* 2058 * NOTE: pmap_remove_pte() can block. 2059 */ 2060 pt_m = pmap_hold_pt_page(pde, sva); 2061 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2062 sva += PAGE_SIZE) { 2063 if (*pte) { 2064 if (pmap_remove_pte(pmap, pte, 0, sva)) 2065 break; 2066 } 2067 } 2068 vm_page_unhold(pt_m); 2069 } 2070 vm_object_drop(pmap->pm_pteobj); 2071 } 2072 2073 /* 2074 * Removes this physical page from all physical maps in which it resides. 2075 * Reflects back modify bits to the pager. 2076 * 2077 * This routine may not be called from an interrupt. 2078 * 2079 * No requirements. 2080 */ 2081 static void 2082 pmap_remove_all(vm_page_t m) 2083 { 2084 pt_entry_t *pte, tpte; 2085 pv_entry_t pv; 2086 vm_object_t pmobj; 2087 pmap_t pmap; 2088 2089 #if defined(PMAP_DIAGNOSTIC) 2090 /* 2091 * XXX this makes pmap_page_protect(NONE) illegal for non-managed 2092 * pages! 2093 */ 2094 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 2095 panic("pmap_page_protect: illegal for unmanaged page, va: 0x%08llx", (long long)VM_PAGE_TO_PHYS(m)); 2096 } 2097 #endif 2098 2099 restart: 2100 vm_page_spin_lock(m); 2101 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2102 pmap = pv->pv_pmap; 2103 pmobj = pmap->pm_pteobj; 2104 2105 /* 2106 * Handle reversed lock ordering 2107 */ 2108 if (vm_object_hold_try(pmobj) == 0) { 2109 refcount_acquire(&pmobj->hold_count); 2110 vm_page_spin_unlock(m); 2111 vm_object_lock(pmobj); 2112 vm_page_spin_lock(m); 2113 if (pv != TAILQ_FIRST(&m->md.pv_list) || 2114 pmap != pv->pv_pmap || 2115 pmobj != pmap->pm_pteobj) { 2116 vm_page_spin_unlock(m); 2117 vm_object_drop(pmobj); 2118 goto restart; 2119 } 2120 } 2121 2122 KKASSERT(pmap->pm_stats.resident_count > 0); 2123 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2124 2125 pte = pmap_pte(pmap, pv->pv_va); 2126 KKASSERT(pte != NULL); 2127 2128 tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va); 2129 if (tpte & VPTE_WIRED) 2130 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2131 KKASSERT(pmap->pm_stats.wired_count >= 0); 2132 2133 if (tpte & VPTE_A) 2134 vm_page_flag_set(m, PG_REFERENCED); 2135 2136 /* 2137 * Update the vm_page_t clean and reference bits. 
2138 */ 2139 if (tpte & VPTE_M) { 2140 #if defined(PMAP_DIAGNOSTIC) 2141 if (pmap_nw_modified(tpte)) { 2142 kprintf( 2143 "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2144 pv->pv_va, tpte); 2145 } 2146 #endif 2147 if (pmap_track_modified(pmap, pv->pv_va)) 2148 vm_page_dirty(m); 2149 } 2150 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2151 if (TAILQ_EMPTY(&m->md.pv_list)) 2152 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2153 m->md.pv_list_count--; 2154 KKASSERT(m->md.pv_list_count >= 0); 2155 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 2156 atomic_add_int(&pmap->pm_generation, 1); 2157 vm_page_spin_unlock(m); 2158 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); 2159 free_pv_entry(pv); 2160 2161 vm_object_drop(pmobj); 2162 vm_page_spin_lock(m); 2163 } 2164 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 2165 vm_page_spin_unlock(m); 2166 } 2167 2168 /* 2169 * Removes the page from a particular pmap 2170 */ 2171 void 2172 pmap_remove_specific(pmap_t pmap, vm_page_t m) 2173 { 2174 pt_entry_t *pte, tpte; 2175 pv_entry_t pv; 2176 2177 vm_object_hold(pmap->pm_pteobj); 2178 again: 2179 vm_page_spin_lock(m); 2180 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2181 if (pv->pv_pmap != pmap) 2182 continue; 2183 2184 KKASSERT(pmap->pm_stats.resident_count > 0); 2185 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2186 2187 pte = pmap_pte(pmap, pv->pv_va); 2188 KKASSERT(pte != NULL); 2189 2190 tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va); 2191 if (tpte & VPTE_WIRED) 2192 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2193 KKASSERT(pmap->pm_stats.wired_count >= 0); 2194 2195 if (tpte & VPTE_A) 2196 vm_page_flag_set(m, PG_REFERENCED); 2197 2198 /* 2199 * Update the vm_page_t clean and reference bits. 2200 */ 2201 if (tpte & VPTE_M) { 2202 if (pmap_track_modified(pmap, pv->pv_va)) 2203 vm_page_dirty(m); 2204 } 2205 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2206 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 2207 atomic_add_int(&pmap->pm_generation, 1); 2208 m->md.pv_list_count--; 2209 KKASSERT(m->md.pv_list_count >= 0); 2210 if (TAILQ_EMPTY(&m->md.pv_list)) 2211 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2212 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); 2213 vm_page_spin_unlock(m); 2214 free_pv_entry(pv); 2215 goto again; 2216 } 2217 vm_page_spin_unlock(m); 2218 vm_object_drop(pmap->pm_pteobj); 2219 } 2220 2221 /* 2222 * Set the physical protection on the specified range of this map 2223 * as requested. 2224 * 2225 * This function may not be called from an interrupt if the map is 2226 * not the kernel_pmap. 2227 * 2228 * No requirements. 
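 *
 * A minimal usage sketch (hypothetical caller, page-aligned range assumed)
 * showing how the protection argument is interpreted below:
 *
 *	pmap_protect(pmap, sva, eva, VM_PROT_READ);
 *		revokes write permission across [sva, eva)
 *	pmap_protect(pmap, sva, eva, VM_PROT_NONE);
 *		equivalent to pmap_remove(pmap, sva, eva)
 *
 * A request that still includes VM_PROT_WRITE is a no-op here.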
2229 */ 2230 void 2231 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2232 { 2233 vm_offset_t va_next; 2234 pml4_entry_t *pml4e; 2235 pdp_entry_t *pdpe; 2236 pd_entry_t ptpaddr, *pde; 2237 pt_entry_t *pte; 2238 vm_page_t pt_m; 2239 2240 if (pmap == NULL) 2241 return; 2242 2243 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) { 2244 pmap_remove(pmap, sva, eva); 2245 return; 2246 } 2247 2248 if (prot & VM_PROT_WRITE) 2249 return; 2250 2251 vm_object_hold(pmap->pm_pteobj); 2252 2253 for (; sva < eva; sva = va_next) { 2254 pml4e = pmap_pml4e(pmap, sva); 2255 if ((*pml4e & VPTE_V) == 0) { 2256 va_next = (sva + NBPML4) & ~PML4MASK; 2257 if (va_next < sva) 2258 va_next = eva; 2259 continue; 2260 } 2261 2262 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2263 if ((*pdpe & VPTE_V) == 0) { 2264 va_next = (sva + NBPDP) & ~PDPMASK; 2265 if (va_next < sva) 2266 va_next = eva; 2267 continue; 2268 } 2269 2270 va_next = (sva + NBPDR) & ~PDRMASK; 2271 if (va_next < sva) 2272 va_next = eva; 2273 2274 pde = pmap_pdpe_to_pde(pdpe, sva); 2275 ptpaddr = *pde; 2276 2277 #if 0 2278 /* 2279 * Check for large page. 2280 */ 2281 if ((ptpaddr & VPTE_PS) != 0) { 2282 /* JG correct? */ 2283 pmap_clean_pde(pde, pmap, sva); 2284 atomic_add_long(&pmap->pm_stats.resident_count, 2285 -NBPDR / PAGE_SIZE); 2286 continue; 2287 } 2288 #endif 2289 2290 /* 2291 * Weed out invalid mappings. Note: we assume that the page 2292 * directory table is always allocated, and in kernel virtual. 2293 */ 2294 if (ptpaddr == 0) 2295 continue; 2296 2297 if (va_next > eva) 2298 va_next = eva; 2299 2300 pt_m = pmap_hold_pt_page(pde, sva); 2301 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2302 sva += PAGE_SIZE) { 2303 /* 2304 * Clean managed pages and also check the accessed 2305 * bit. Just remove write perms for unmanaged 2306 * pages. Be careful of races, turning off write 2307 * access will force a fault rather then setting 2308 * the modified bit at an unexpected time. 2309 */ 2310 pmap_clean_pte(pte, pmap, sva, NULL); 2311 } 2312 vm_page_unhold(pt_m); 2313 } 2314 vm_object_drop(pmap->pm_pteobj); 2315 } 2316 2317 /* 2318 * Enter a managed page into a pmap. If the page is not wired related pmap 2319 * data can be destroyed at any time for later demand-operation. 2320 * 2321 * Insert the vm_page (m) at virtual address (v) in (pmap), with the 2322 * specified protection, and wire the mapping if requested. 2323 * 2324 * NOTE: This routine may not lazy-evaluate or lose information. The 2325 * page must actually be inserted into the given map NOW. 2326 * 2327 * NOTE: When entering a page at a KVA address, the pmap must be the 2328 * kernel_pmap. 2329 * 2330 * No requirements. 2331 */ 2332 void 2333 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2334 boolean_t wired, vm_map_entry_t entry __unused) 2335 { 2336 vm_paddr_t pa; 2337 pv_entry_t pv; 2338 pt_entry_t *pte; 2339 pt_entry_t origpte, newpte; 2340 vm_paddr_t opa; 2341 vm_page_t mpte; 2342 2343 if (pmap == NULL) 2344 return; 2345 2346 va = trunc_page(va); 2347 2348 vm_object_hold(pmap->pm_pteobj); 2349 2350 /* 2351 * Get the page table page. The kernel_pmap's page table pages 2352 * are preallocated and have no associated vm_page_t. 2353 * 2354 * If not NULL, mpte will be busied and we must vm_page_wakeup() 2355 * to cleanup. There will already be at least one wire count from 2356 * it being mapped into its parent. 
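	 *
	 * Sketch of the two lookup paths taken below (illustrative only;
	 * the DMAP arithmetic mirrors the code that follows):
	 *
	 *	kernel_pmap:  pte = vtopte(va)
	 *	user pmap:    pte = (pt_entry_t *)
	 *			    PHYS_TO_DMAP(mpte->phys_addr) +
	 *			    pmap_pte_index(va)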
2357 */ 2358 if (pmap == &kernel_pmap) { 2359 mpte = NULL; 2360 pte = vtopte(va); 2361 } else { 2362 mpte = pmap_allocpte(pmap, va); 2363 pte = (void *)PHYS_TO_DMAP(mpte->phys_addr); 2364 pte += pmap_pte_index(va); 2365 } 2366 2367 /* 2368 * Deal with races against the kernel's real MMU by cleaning the 2369 * page, even if we are re-entering the same page. 2370 */ 2371 pa = VM_PAGE_TO_PHYS(m); 2372 origpte = pmap_inval_loadandclear(pte, pmap, va); 2373 /*origpte = pmap_clean_pte(pte, pmap, va, NULL);*/ 2374 opa = origpte & VPTE_FRAME; 2375 2376 if (origpte & VPTE_PS) 2377 panic("pmap_enter: attempted pmap_enter on 2MB page"); 2378 2379 if ((origpte & (VPTE_MANAGED|VPTE_M)) == (VPTE_MANAGED|VPTE_M)) { 2380 if (pmap_track_modified(pmap, va)) { 2381 vm_page_t om = PHYS_TO_VM_PAGE(opa); 2382 vm_page_dirty(om); 2383 } 2384 } 2385 2386 /* 2387 * Mapping has not changed, must be protection or wiring change. 2388 */ 2389 if (origpte && (opa == pa)) { 2390 /* 2391 * Wiring change, just update stats. We don't worry about 2392 * wiring PT pages as they remain resident as long as there 2393 * are valid mappings in them. Hence, if a user page is wired, 2394 * the PT page will be also. 2395 */ 2396 if (wired && ((origpte & VPTE_WIRED) == 0)) 2397 atomic_add_long(&pmap->pm_stats.wired_count, 1); 2398 else if (!wired && (origpte & VPTE_WIRED)) 2399 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2400 2401 if (origpte & VPTE_MANAGED) { 2402 pa |= VPTE_MANAGED; 2403 KKASSERT(m->flags & PG_MAPPED); 2404 KKASSERT(!(m->flags & (PG_FICTITIOUS|PG_UNMANAGED))); 2405 } else { 2406 KKASSERT((m->flags & (PG_FICTITIOUS|PG_UNMANAGED))); 2407 } 2408 vm_page_spin_lock(m); 2409 goto validate; 2410 } 2411 2412 /* 2413 * Bump the wire_count for the page table page. 2414 */ 2415 if (mpte) 2416 vm_page_wire_quick(mpte); 2417 2418 /* 2419 * Mapping has changed, invalidate old range and fall through to 2420 * handle validating new mapping. Don't inherit anything from 2421 * oldpte. 2422 */ 2423 if (opa) { 2424 int err; 2425 err = pmap_remove_pte(pmap, NULL, origpte, va); 2426 origpte = 0; 2427 if (err) 2428 panic("pmap_enter: pte vanished, va: 0x%lx", va); 2429 } 2430 2431 /* 2432 * Enter on the PV list if part of our managed memory. Note that we 2433 * raise IPL while manipulating pv_table since pmap_enter can be 2434 * called at interrupt time. 2435 */ 2436 if (pmap_initialized) { 2437 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 2438 /* 2439 * WARNING! We are using m's spin-lock as a 2440 * man's pte lock to interlock against 2441 * pmap_page_protect() operations. 2442 * 2443 * This is a bad hack (obviously). 2444 */ 2445 pv = get_pv_entry(); 2446 vm_page_spin_lock(m); 2447 pmap_insert_entry(pmap, va, mpte, m, pv); 2448 pa |= VPTE_MANAGED; 2449 /* vm_page_spin_unlock(m); */ 2450 } else { 2451 vm_page_spin_lock(m); 2452 } 2453 } else { 2454 vm_page_spin_lock(m); 2455 } 2456 2457 /* 2458 * Increment counters 2459 */ 2460 atomic_add_long(&pmap->pm_stats.resident_count, 1); 2461 if (wired) 2462 atomic_add_long(&pmap->pm_stats.wired_count, 1); 2463 2464 validate: 2465 /* 2466 * Now validate mapping with desired protection/wiring. 
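	 *
	 * For example (illustrative only), a wired, writable, managed user
	 * page ends up with approximately:
	 *
	 *	newpte = pa | VPTE_MANAGED | VPTE_RW | VPTE_V | VPTE_U |
	 *		 VPTE_A | VPTE_WIRED;
	 *
	 * where VPTE_MANAGED was or'd into pa above and pte_prot()
	 * contributes VPTE_RW (and VPTE_NX for non-executable mappings).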
2467 */ 2468 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | VPTE_V | VPTE_U); 2469 newpte |= VPTE_A; 2470 2471 if (wired) 2472 newpte |= VPTE_WIRED; 2473 // if (pmap != &kernel_pmap) 2474 newpte |= VPTE_U; 2475 if (newpte & VPTE_RW) 2476 vm_page_flag_set(m, PG_WRITEABLE); 2477 KKASSERT((newpte & VPTE_MANAGED) == 0 || (m->flags & PG_MAPPED)); 2478 2479 origpte = atomic_swap_long(pte, newpte); 2480 if (origpte & VPTE_M) { 2481 kprintf("pmap [M] race @ %016jx\n", va); 2482 atomic_set_long(pte, VPTE_M); 2483 } 2484 vm_page_spin_unlock(m); 2485 2486 if (mpte) 2487 vm_page_wakeup(mpte); 2488 vm_object_drop(pmap->pm_pteobj); 2489 } 2490 2491 /* 2492 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 2493 * 2494 * Currently this routine may only be used on user pmaps, not kernel_pmap. 2495 * 2496 * No requirements. 2497 */ 2498 void 2499 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 2500 { 2501 pmap_enter(pmap, va, m, VM_PROT_READ, 0, NULL); 2502 } 2503 2504 /* 2505 * Make a temporary mapping for a physical address. This is only intended 2506 * to be used for panic dumps. 2507 * 2508 * The caller is responsible for calling smp_invltlb(). 2509 */ 2510 void * 2511 pmap_kenter_temporary(vm_paddr_t pa, long i) 2512 { 2513 pmap_kenter_quick(crashdumpmap + (i * PAGE_SIZE), pa); 2514 return ((void *)crashdumpmap); 2515 } 2516 2517 #define MAX_INIT_PT (96) 2518 2519 /* 2520 * This routine preloads the ptes for a given object into the specified pmap. 2521 * This eliminates the blast of soft faults on process startup and 2522 * immediately after an mmap. 2523 * 2524 * No requirements. 2525 */ 2526 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 2527 2528 void 2529 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 2530 vm_object_t object, vm_pindex_t pindex, 2531 vm_size_t size, int limit) 2532 { 2533 struct rb_vm_page_scan_info info; 2534 struct lwp *lp; 2535 vm_size_t psize; 2536 2537 /* 2538 * We can't preinit if read access isn't set or there is no pmap 2539 * or object. 2540 */ 2541 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 2542 return; 2543 2544 /* 2545 * We can't preinit if the pmap is not the current pmap 2546 */ 2547 lp = curthread->td_lwp; 2548 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 2549 return; 2550 2551 /* 2552 * Misc additional checks 2553 */ 2554 psize = x86_64_btop(size); 2555 2556 if ((object->type != OBJT_VNODE) || 2557 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 2558 (object->resident_page_count > MAX_INIT_PT))) { 2559 return; 2560 } 2561 2562 if (psize + pindex > object->size) { 2563 if (object->size < pindex) 2564 return; 2565 psize = object->size - pindex; 2566 } 2567 2568 if (psize == 0) 2569 return; 2570 2571 /* 2572 * Use a red-black scan to traverse the requested range and load 2573 * any valid pages found into the pmap. 2574 * 2575 * We cannot safely scan the object's memq unless we are in a 2576 * critical section since interrupts can remove pages from objects. 
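	 *
	 * A hypothetical call site, shown only to illustrate the parameters
	 * (the real callers live in the vm_map layer):
	 *
	 *	pmap_object_init_pt(vmspace_pmap(lp->lwp_vmspace), start,
	 *			    VM_PROT_READ, object, OFF_TO_IDX(offset),
	 *			    end - start, MAP_PREFAULT_PARTIAL);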
2577 */ 2578 info.start_pindex = pindex; 2579 info.end_pindex = pindex + psize - 1; 2580 info.limit = limit; 2581 info.mpte = NULL; 2582 info.addr = addr; 2583 info.pmap = pmap; 2584 2585 vm_object_hold_shared(object); 2586 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 2587 pmap_object_init_pt_callback, &info); 2588 vm_object_drop(object); 2589 } 2590 2591 static 2592 int 2593 pmap_object_init_pt_callback(vm_page_t p, void *data) 2594 { 2595 struct rb_vm_page_scan_info *info = data; 2596 vm_pindex_t rel_index; 2597 /* 2598 * don't allow an madvise to blow away our really 2599 * free pages allocating pv entries. 2600 */ 2601 if ((info->limit & MAP_PREFAULT_MADVISE) && 2602 vmstats.v_free_count < vmstats.v_free_reserved) { 2603 return(-1); 2604 } 2605 2606 /* 2607 * Ignore list markers and ignore pages we cannot instantly 2608 * busy (while holding the object token). 2609 */ 2610 if (p->flags & PG_MARKER) 2611 return 0; 2612 if (vm_page_busy_try(p, TRUE)) 2613 return 0; 2614 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 2615 (p->flags & PG_FICTITIOUS) == 0) { 2616 if ((p->queue - p->pc) == PQ_CACHE) 2617 vm_page_deactivate(p); 2618 rel_index = p->pindex - info->start_pindex; 2619 pmap_enter_quick(info->pmap, 2620 info->addr + x86_64_ptob(rel_index), p); 2621 } 2622 vm_page_wakeup(p); 2623 return(0); 2624 } 2625 2626 /* 2627 * Return TRUE if the pmap is in shape to trivially 2628 * pre-fault the specified address. 2629 * 2630 * Returns FALSE if it would be non-trivial or if a 2631 * pte is already loaded into the slot. 2632 * 2633 * No requirements. 2634 */ 2635 int 2636 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 2637 { 2638 pt_entry_t *pte; 2639 pd_entry_t *pde; 2640 int ret; 2641 2642 vm_object_hold(pmap->pm_pteobj); 2643 pde = pmap_pde(pmap, addr); 2644 if (pde == NULL || *pde == 0) { 2645 ret = 0; 2646 } else { 2647 pte = pmap_pde_to_pte(pde, addr); 2648 ret = (*pte) ? 0 : 1; 2649 } 2650 vm_object_drop(pmap->pm_pteobj); 2651 2652 return (ret); 2653 } 2654 2655 /* 2656 * Change the wiring attribute for a map/virtual-address pair. 2657 * 2658 * The mapping must already exist in the pmap. 2659 * No other requirements. 2660 */ 2661 vm_page_t 2662 pmap_unwire(pmap_t pmap, vm_offset_t va) 2663 { 2664 pt_entry_t *pte; 2665 vm_paddr_t pa; 2666 vm_page_t m; 2667 2668 if (pmap == NULL) 2669 return NULL; 2670 2671 vm_object_hold(pmap->pm_pteobj); 2672 pte = pmap_pte(pmap, va); 2673 2674 if (pte == NULL || (*pte & VPTE_V) == 0) { 2675 vm_object_drop(pmap->pm_pteobj); 2676 return NULL; 2677 } 2678 2679 /* 2680 * Wiring is not a hardware characteristic so there is no need to 2681 * invalidate TLB. However, in an SMP environment we must use 2682 * a locked bus cycle to update the pte (if we are not using 2683 * the pmap_inval_*() API that is)... it's ok to do this for simple 2684 * wiring changes. 2685 */ 2686 if (pmap_pte_w(pte)) 2687 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2688 /* XXX else return NULL so caller doesn't unwire m ? */ 2689 atomic_clear_long(pte, VPTE_WIRED); 2690 2691 pa = *pte & VPTE_FRAME; 2692 m = PHYS_TO_VM_PAGE(pa); /* held by wired count */ 2693 2694 vm_object_drop(pmap->pm_pteobj); 2695 2696 return m; 2697 } 2698 2699 /* 2700 * Copy the range specified by src_addr/len 2701 * from the source map to the range dst_addr/len 2702 * in the destination map. 2703 * 2704 * This routine is only advisory and need not do anything. 
2705 */ 2706 void 2707 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 2708 vm_size_t len, vm_offset_t src_addr) 2709 { 2710 /* 2711 * XXX BUGGY. Among other things, srcmpte is assumed to remain 2712 * valid through blocking calls, and that's just not going to 2713 * be the case. 2714 * 2715 * FIXME! 2716 */ 2717 return; 2718 } 2719 2720 /* 2721 * pmap_zero_page: 2722 * 2723 * Zero the specified physical page. 2724 * 2725 * This function may be called from an interrupt and no locking is 2726 * required. 2727 */ 2728 void 2729 pmap_zero_page(vm_paddr_t phys) 2730 { 2731 vm_offset_t va = PHYS_TO_DMAP(phys); 2732 2733 bzero((void *)va, PAGE_SIZE); 2734 } 2735 2736 /* 2737 * pmap_zero_page_area: 2738 * 2739 * Zero part of a physical page by mapping it into memory and clearing 2740 * its contents with bzero. 2741 * 2742 * off and size may not cover an area beyond a single hardware page. 2743 */ 2744 void 2745 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 2746 { 2747 vm_offset_t virt = PHYS_TO_DMAP(phys); 2748 2749 bzero((char *)virt + off, size); 2750 } 2751 2752 /* 2753 * pmap_copy_page: 2754 * 2755 * Copy the physical page from the source PA to the target PA. 2756 * This function may be called from an interrupt. No locking 2757 * is required. 2758 */ 2759 void 2760 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 2761 { 2762 vm_offset_t src_virt, dst_virt; 2763 2764 src_virt = PHYS_TO_DMAP(src); 2765 dst_virt = PHYS_TO_DMAP(dst); 2766 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 2767 } 2768 2769 /* 2770 * pmap_copy_page_frag: 2771 * 2772 * Copy part of a physical page from the source PA to the target PA. 2773 * This function may be called from an interrupt. No locking 2774 * is required. 2775 */ 2776 void 2777 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 2778 { 2779 vm_offset_t src_virt, dst_virt; 2780 2781 src_virt = PHYS_TO_DMAP(src); 2782 dst_virt = PHYS_TO_DMAP(dst); 2783 bcopy((char *)src_virt + (src & PAGE_MASK), 2784 (char *)dst_virt + (dst & PAGE_MASK), 2785 bytes); 2786 } 2787 2788 /* 2789 * Returns true if the pmap's pv is one of the first 16 pvs linked to 2790 * from this page. This count may be changed upwards or downwards 2791 * in the future; it is only necessary that true be returned for a small 2792 * subset of pmaps for proper page aging. 2793 * 2794 * No other requirements. 2795 */ 2796 boolean_t 2797 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 2798 { 2799 pv_entry_t pv; 2800 int loops = 0; 2801 2802 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2803 return FALSE; 2804 2805 vm_page_spin_lock(m); 2806 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2807 if (pv->pv_pmap == pmap) { 2808 vm_page_spin_unlock(m); 2809 return TRUE; 2810 } 2811 loops++; 2812 if (loops >= 16) 2813 break; 2814 } 2815 vm_page_spin_unlock(m); 2816 2817 return (FALSE); 2818 } 2819 2820 /* 2821 * Remove all pages from the specified address space; this aids process 2822 * exit speeds. Also, this code is special cased for the current 2823 * process only, but can have the more generic (and slightly slower) 2824 * mode enabled. This is much faster than pmap_remove in the case 2825 * of running down an entire address space. 2826 * 2827 * No other requirements.
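 *
 * A minimal usage sketch (hypothetical caller; the real caller is the
 * vmspace teardown path), assuming the usual user address space bounds:
 *
 *	pmap_remove_pages(vmspace_pmap(vm),
 *			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
 *
 * In this implementation the call simply forwards to pmap_remove() over
 * the same range.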
2828 */ 2829 void 2830 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2831 { 2832 pmap_remove(pmap, sva, eva); 2833 #if 0 2834 pt_entry_t *pte, tpte; 2835 pv_entry_t pv, npv; 2836 vm_page_t m; 2837 int save_generation; 2838 2839 if (pmap->pm_pteobj) 2840 vm_object_hold(pmap->pm_pteobj); 2841 2842 pmap_invalidate_range(pmap, sva, eva); 2843 2844 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 2845 if (pv->pv_va >= eva || pv->pv_va < sva) { 2846 npv = TAILQ_NEXT(pv, pv_plist); 2847 continue; 2848 } 2849 2850 KKASSERT(pmap == pv->pv_pmap); 2851 2852 pte = pmap_pte(pmap, pv->pv_va); 2853 2854 /* 2855 * We cannot remove wired pages from a process' mapping 2856 * at this time 2857 */ 2858 if (*pte & VPTE_WIRED) { 2859 npv = TAILQ_NEXT(pv, pv_plist); 2860 continue; 2861 } 2862 tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va); 2863 2864 m = PHYS_TO_VM_PAGE(tpte & VPTE_FRAME); 2865 vm_page_spin_lock(m); 2866 2867 KASSERT(m < &vm_page_array[vm_page_array_size], 2868 ("pmap_remove_pages: bad tpte %lx", tpte)); 2869 2870 KKASSERT(pmap->pm_stats.resident_count > 0); 2871 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2872 2873 /* 2874 * Update the vm_page_t clean and reference bits. 2875 */ 2876 if (tpte & VPTE_M) { 2877 vm_page_dirty(m); 2878 } 2879 2880 npv = TAILQ_NEXT(pv, pv_plist); 2881 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 2882 atomic_add_int(&pmap->pm_generation, 1); 2883 save_generation = pmap->pm_generation; 2884 m->md.pv_list_count--; 2885 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2886 if (TAILQ_EMPTY(&m->md.pv_list)) 2887 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2888 vm_page_spin_unlock(m); 2889 2890 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); 2891 free_pv_entry(pv); 2892 2893 /* 2894 * Restart the scan if we blocked during the unuse or free 2895 * calls and other removals were made. 2896 */ 2897 if (save_generation != pmap->pm_generation) { 2898 kprintf("Warning: pmap_remove_pages race-A avoided\n"); 2899 npv = TAILQ_FIRST(&pmap->pm_pvlist); 2900 } 2901 } 2902 if (pmap->pm_pteobj) 2903 vm_object_drop(pmap->pm_pteobj); 2904 pmap_remove(pmap, sva, eva); 2905 #endif 2906 } 2907 2908 /* 2909 * pmap_testbit tests bits in active mappings of a VM page. 2910 */ 2911 static boolean_t 2912 pmap_testbit(vm_page_t m, int bit) 2913 { 2914 pv_entry_t pv; 2915 pt_entry_t *pte; 2916 2917 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2918 return FALSE; 2919 2920 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 2921 return FALSE; 2922 2923 vm_page_spin_lock(m); 2924 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2925 /* 2926 * if the bit being tested is the modified bit, then 2927 * mark clean_map and ptes as never 2928 * modified. 2929 */ 2930 if (bit & (VPTE_A|VPTE_M)) { 2931 if (!pmap_track_modified(pv->pv_pmap, pv->pv_va)) 2932 continue; 2933 } 2934 2935 #if defined(PMAP_DIAGNOSTIC) 2936 if (pv->pv_pmap == NULL) { 2937 kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); 2938 continue; 2939 } 2940 #endif 2941 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 2942 if (*pte & bit) { 2943 vm_page_spin_unlock(m); 2944 return TRUE; 2945 } 2946 } 2947 vm_page_spin_unlock(m); 2948 return (FALSE); 2949 } 2950 2951 /* 2952 * This routine is used to clear bits in ptes. Certain bits require special 2953 * handling, in particular (on virtual kernels) the VPTE_M (modify) bit. 2954 * 2955 * This routine is only called with certain VPTE_* bit combinations. 
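 *
 * For illustration, the combinations used elsewhere in this file are:
 *
 *	pmap_clearbit(m, VPTE_RW);	pmap_clear_modify(), and
 *					pmap_page_protect() when
 *					downgrading to read-only
 *	pmap_clearbit(m, VPTE_A);	pmap_clear_reference()
 *
 * Clearing VPTE_RW also synchronizes VPTE_M back into the vm_page_t so
 * that the next write takes a fresh fault.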
2956 */ 2957 static __inline void 2958 pmap_clearbit(vm_page_t m, int bit) 2959 { 2960 pv_entry_t pv; 2961 pt_entry_t *pte; 2962 pt_entry_t pbits; 2963 vm_object_t pmobj; 2964 pmap_t pmap; 2965 2966 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 2967 if (bit == VPTE_RW) 2968 vm_page_flag_clear(m, PG_WRITEABLE); 2969 return; 2970 } 2971 2972 /* 2973 * Loop over all current mappings setting/clearing as appropos If 2974 * setting RO do we need to clear the VAC? 2975 */ 2976 restart: 2977 vm_page_spin_lock(m); 2978 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2979 /* 2980 * Need the pmap object lock(?) 2981 */ 2982 pmap = pv->pv_pmap; 2983 pmobj = pmap->pm_pteobj; 2984 2985 if (vm_object_hold_try(pmobj) == 0) { 2986 refcount_acquire(&pmobj->hold_count); 2987 vm_page_spin_unlock(m); 2988 vm_object_lock(pmobj); 2989 vm_object_drop(pmobj); 2990 goto restart; 2991 } 2992 2993 /* 2994 * don't write protect pager mappings 2995 */ 2996 if (bit == VPTE_RW) { 2997 if (!pmap_track_modified(pv->pv_pmap, pv->pv_va)) { 2998 vm_object_drop(pmobj); 2999 continue; 3000 } 3001 } 3002 3003 #if defined(PMAP_DIAGNOSTIC) 3004 if (pv->pv_pmap == NULL) { 3005 kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); 3006 vm_object_drop(pmobj); 3007 continue; 3008 } 3009 #endif 3010 3011 /* 3012 * Careful here. We can use a locked bus instruction to 3013 * clear VPTE_A or VPTE_M safely but we need to synchronize 3014 * with the target cpus when we mess with VPTE_RW. 3015 * 3016 * On virtual kernels we must force a new fault-on-write 3017 * in the real kernel if we clear the Modify bit ourselves, 3018 * otherwise the real kernel will not get a new fault and 3019 * will never set our Modify bit again. 3020 */ 3021 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 3022 if (*pte & bit) { 3023 if (bit == VPTE_RW) { 3024 /* 3025 * We must also clear VPTE_M when clearing 3026 * VPTE_RW and synchronize its state to 3027 * the page. 3028 */ 3029 pbits = pmap_clean_pte(pte, pv->pv_pmap, 3030 pv->pv_va, m); 3031 } else if (bit == VPTE_M) { 3032 /* 3033 * We must invalidate the real-kernel pte 3034 * when clearing VPTE_M bit to force the 3035 * real-kernel to take a new fault to re-set 3036 * VPTE_M. 3037 */ 3038 atomic_clear_long(pte, VPTE_M); 3039 if (*pte & VPTE_RW) { 3040 pmap_invalidate_range(pv->pv_pmap, 3041 pv->pv_va, 3042 pv->pv_va + PAGE_SIZE); 3043 } 3044 } else if ((bit & (VPTE_RW|VPTE_M)) == 3045 (VPTE_RW|VPTE_M)) { 3046 /* 3047 * We've been asked to clear W & M, I guess 3048 * the caller doesn't want us to update 3049 * the dirty status of the VM page. 3050 */ 3051 pmap_clean_pte(pte, pv->pv_pmap, pv->pv_va, m); 3052 panic("shouldn't be called"); 3053 } else { 3054 /* 3055 * We've been asked to clear bits that do 3056 * not interact with hardware. 3057 */ 3058 atomic_clear_long(pte, bit); 3059 } 3060 } 3061 vm_object_drop(pmobj); 3062 } 3063 if (bit == VPTE_RW) 3064 vm_page_flag_clear(m, PG_WRITEABLE); 3065 vm_page_spin_unlock(m); 3066 } 3067 3068 /* 3069 * Lower the permission for all mappings to a given page. 3070 * 3071 * No other requirements. 3072 */ 3073 void 3074 pmap_page_protect(vm_page_t m, vm_prot_t prot) 3075 { 3076 if ((prot & VM_PROT_WRITE) == 0) { 3077 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 3078 pmap_clearbit(m, VPTE_RW); 3079 } else { 3080 pmap_remove_all(m); 3081 } 3082 } 3083 } 3084 3085 vm_paddr_t 3086 pmap_phys_address(vm_pindex_t ppn) 3087 { 3088 return (x86_64_ptob(ppn)); 3089 } 3090 3091 /* 3092 * Return a count of reference bits for a page, clearing those bits. 
3093 * It is not necessary for every reference bit to be cleared, but it 3094 * is necessary that 0 only be returned when there are truly no 3095 * reference bits set. 3096 * 3097 * XXX: The exact number of bits to check and clear is a matter that 3098 * should be tested and standardized at some point in the future for 3099 * optimal aging of shared pages. 3100 * 3101 * No other requirements. 3102 */ 3103 int 3104 pmap_ts_referenced(vm_page_t m) 3105 { 3106 pv_entry_t pv, pvf, pvn; 3107 pt_entry_t *pte; 3108 int rtval = 0; 3109 3110 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3111 return (rtval); 3112 3113 vm_page_spin_lock(m); 3114 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3115 pvf = pv; 3116 do { 3117 pvn = TAILQ_NEXT(pv, pv_list); 3118 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3119 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3120 3121 if (!pmap_track_modified(pv->pv_pmap, pv->pv_va)) 3122 continue; 3123 3124 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 3125 3126 if (pte && (*pte & VPTE_A)) { 3127 atomic_clear_long(pte, VPTE_A); 3128 rtval++; 3129 if (rtval > 4) { 3130 break; 3131 } 3132 } 3133 } while ((pv = pvn) != NULL && pv != pvf); 3134 } 3135 vm_page_spin_unlock(m); 3136 3137 return (rtval); 3138 } 3139 3140 /* 3141 * Return whether or not the specified physical page was modified 3142 * in any physical maps. 3143 * 3144 * No other requirements. 3145 */ 3146 boolean_t 3147 pmap_is_modified(vm_page_t m) 3148 { 3149 boolean_t res; 3150 3151 res = pmap_testbit(m, VPTE_M); 3152 3153 return (res); 3154 } 3155 3156 /* 3157 * Clear the modify bits on the specified physical page. For the vkernel 3158 * we really need to clean the page, which clears VPTE_RW and VPTE_M, in 3159 * order to ensure that we take a fault on the next write to the page. 3160 * Otherwise the page may become dirty without us knowing it. 3161 * 3162 * No other requirements. 3163 */ 3164 void 3165 pmap_clear_modify(vm_page_t m) 3166 { 3167 pmap_clearbit(m, VPTE_RW); 3168 } 3169 3170 /* 3171 * Clear the reference bit on the specified physical page. 3172 * 3173 * No other requirements. 3174 */ 3175 void 3176 pmap_clear_reference(vm_page_t m) 3177 { 3178 pmap_clearbit(m, VPTE_A); 3179 } 3180 3181 /* 3182 * Miscellaneous support routines follow 3183 */ 3184 static void 3185 i386_protection_init(void) 3186 { 3187 uint64_t *kp; 3188 int prot; 3189 3190 kp = protection_codes; 3191 for (prot = 0; prot < 8; prot++) { 3192 if (prot & VM_PROT_READ) 3193 *kp |= 0; /* R */ 3194 if (prot & VM_PROT_WRITE) 3195 *kp |= VPTE_RW; /* R+W */ 3196 if (prot && (prot & VM_PROT_EXECUTE) == 0) 3197 *kp |= VPTE_NX; /* NX - !executable */ 3198 ++kp; 3199 } 3200 } 3201 3202 /* 3203 * Sets the memory attribute for the specified page. 3204 */ 3205 void 3206 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 3207 { 3208 /* This is a vkernel, do nothing */ 3209 } 3210 3211 /* 3212 * Change the PAT attribute on an existing kernel memory map. Caller 3213 * must ensure that the virtual memory in question is not accessed 3214 * during the adjustment. 3215 */ 3216 void 3217 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 3218 { 3219 /* This is a vkernel, do nothing */ 3220 } 3221 3222 /* 3223 * Perform the pmap work for mincore 3224 * 3225 * No other requirements. 
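 *
 * A sketch of how a caller might interpret the result (flag usage only;
 * the actual mincore(2) glue lives above the pmap layer):
 *
 *	int val = pmap_mincore(pmap, addr);
 *
 *	MINCORE_INCORE		 - a mapping exists at addr
 *	MINCORE_MODIFIED	 - modified through this pmap (VPTE_M)
 *	MINCORE_MODIFIED_OTHER	 - dirty in the vm_page or another pmap
 *	MINCORE_REFERENCED	 - referenced through this pmap (VPTE_A)
 *	MINCORE_REFERENCED_OTHER - referenced via the vm_page or another pmap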
3226 */ 3227 int 3228 pmap_mincore(pmap_t pmap, vm_offset_t addr) 3229 { 3230 pt_entry_t *ptep, pte; 3231 vm_page_t m; 3232 int val = 0; 3233 3234 vm_object_hold(pmap->pm_pteobj); 3235 ptep = pmap_pte(pmap, addr); 3236 3237 if (ptep && (pte = *ptep) != 0) { 3238 vm_paddr_t pa; 3239 3240 val = MINCORE_INCORE; 3241 if ((pte & VPTE_MANAGED) == 0) 3242 goto done; 3243 3244 pa = pte & VPTE_FRAME; 3245 3246 m = PHYS_TO_VM_PAGE(pa); 3247 3248 /* 3249 * Modified by us 3250 */ 3251 if (pte & VPTE_M) 3252 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 3253 /* 3254 * Modified by someone 3255 */ 3256 else if (m->dirty || pmap_is_modified(m)) 3257 val |= MINCORE_MODIFIED_OTHER; 3258 /* 3259 * Referenced by us 3260 */ 3261 if (pte & VPTE_A) 3262 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 3263 3264 /* 3265 * Referenced by someone 3266 */ 3267 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { 3268 val |= MINCORE_REFERENCED_OTHER; 3269 vm_page_flag_set(m, PG_REFERENCED); 3270 } 3271 } 3272 done: 3273 vm_object_drop(pmap->pm_pteobj); 3274 3275 return val; 3276 } 3277 3278 /* 3279 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 3280 * vmspace will be ref'd and the old one will be deref'd. 3281 * 3282 * Caller must hold vmspace->vm_map.token for oldvm and newvm 3283 */ 3284 void 3285 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 3286 { 3287 struct vmspace *oldvm; 3288 struct lwp *lp; 3289 3290 oldvm = p->p_vmspace; 3291 if (oldvm != newvm) { 3292 if (adjrefs) 3293 vmspace_ref(newvm); 3294 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 3295 p->p_vmspace = newvm; 3296 KKASSERT(p->p_nthreads == 1); 3297 lp = RB_ROOT(&p->p_lwp_tree); 3298 pmap_setlwpvm(lp, newvm); 3299 if (adjrefs) 3300 vmspace_rel(oldvm); 3301 } 3302 } 3303 3304 /* 3305 * Set the vmspace for a LWP. The vmspace is almost universally set the 3306 * same as the process vmspace, but virtual kernels need to swap out contexts 3307 * on a per-lwp basis. 3308 */ 3309 void 3310 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 3311 { 3312 struct vmspace *oldvm; 3313 struct pmap *pmap; 3314 3315 oldvm = lp->lwp_vmspace; 3316 if (oldvm != newvm) { 3317 crit_enter(); 3318 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 3319 lp->lwp_vmspace = newvm; 3320 if (curthread->td_lwp == lp) { 3321 pmap = vmspace_pmap(newvm); 3322 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 3323 if (pmap->pm_active_lock & CPULOCK_EXCL) 3324 pmap_interlock_wait(newvm); 3325 #if defined(SWTCH_OPTIM_STATS) 3326 tlb_flush_count++; 3327 #endif 3328 pmap = vmspace_pmap(oldvm); 3329 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 3330 mycpu->gd_cpuid); 3331 } 3332 crit_exit(); 3333 } 3334 } 3335 3336 /* 3337 * The swtch code tried to switch in a heavy weight process whos pmap 3338 * is locked by another cpu. We have to wait for the lock to clear before 3339 * the pmap can be used. 
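 *
 * As called from pmap_setlwpvm() above, the guard is in effect:
 *
 *	if (pmap->pm_active_lock & CPULOCK_EXCL)
 *		pmap_interlock_wait(newvm);
 *
 * so the wait is only entered while another cpu holds the pmap
 * exclusively; in the vkernel the spin loop yields with pthread_yield().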
3340 */ 3341 void 3342 pmap_interlock_wait (struct vmspace *vm) 3343 { 3344 pmap_t pmap = vmspace_pmap(vm); 3345 3346 if (pmap->pm_active_lock & CPULOCK_EXCL) { 3347 crit_enter(); 3348 while (pmap->pm_active_lock & CPULOCK_EXCL) { 3349 cpu_ccfence(); 3350 pthread_yield(); 3351 } 3352 crit_exit(); 3353 } 3354 } 3355 3356 vm_offset_t 3357 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 3358 { 3359 3360 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 3361 return addr; 3362 } 3363 3364 addr = roundup2(addr, NBPDR); 3365 return addr; 3366 } 3367 3368 /* 3369 * Used by kmalloc/kfree, page already exists at va 3370 */ 3371 vm_page_t 3372 pmap_kvtom(vm_offset_t va) 3373 { 3374 vpte_t *ptep; 3375 3376 KKASSERT(va >= KvaStart && va < KvaEnd); 3377 ptep = vtopte(va); 3378 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); 3379 } 3380 3381 void 3382 pmap_object_init(vm_object_t object) 3383 { 3384 /* empty */ 3385 } 3386 3387 void 3388 pmap_object_free(vm_object_t object) 3389 { 3390 /* empty */ 3391 } 3392 3393 void 3394 pmap_pgscan(struct pmap_pgscan_info *pginfo) 3395 { 3396 pmap_t pmap = pginfo->pmap; 3397 vm_offset_t sva = pginfo->beg_addr; 3398 vm_offset_t eva = pginfo->end_addr; 3399 vm_offset_t va_next; 3400 pml4_entry_t *pml4e; 3401 pdp_entry_t *pdpe; 3402 pd_entry_t ptpaddr, *pde; 3403 pt_entry_t *pte; 3404 vm_page_t pt_m; 3405 int stop = 0; 3406 3407 vm_object_hold(pmap->pm_pteobj); 3408 3409 for (; sva < eva; sva = va_next) { 3410 if (stop) 3411 break; 3412 3413 pml4e = pmap_pml4e(pmap, sva); 3414 if ((*pml4e & VPTE_V) == 0) { 3415 va_next = (sva + NBPML4) & ~PML4MASK; 3416 if (va_next < sva) 3417 va_next = eva; 3418 continue; 3419 } 3420 3421 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3422 if ((*pdpe & VPTE_V) == 0) { 3423 va_next = (sva + NBPDP) & ~PDPMASK; 3424 if (va_next < sva) 3425 va_next = eva; 3426 continue; 3427 } 3428 3429 va_next = (sva + NBPDR) & ~PDRMASK; 3430 if (va_next < sva) 3431 va_next = eva; 3432 3433 pde = pmap_pdpe_to_pde(pdpe, sva); 3434 ptpaddr = *pde; 3435 3436 #if 0 3437 /* 3438 * Check for large page (ignore). 3439 */ 3440 if ((ptpaddr & VPTE_PS) != 0) { 3441 #if 0 3442 pmap_clean_pde(pde, pmap, sva); 3443 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 3444 #endif 3445 continue; 3446 } 3447 #endif 3448 3449 /* 3450 * Weed out invalid mappings. Note: we assume that the page 3451 * directory table is always allocated, and in kernel virtual. 3452 */ 3453 if (ptpaddr == 0) 3454 continue; 3455 3456 if (va_next > eva) 3457 va_next = eva; 3458 3459 pt_m = pmap_hold_pt_page(pde, sva); 3460 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3461 sva += PAGE_SIZE) { 3462 vm_page_t m; 3463 3464 if (stop) 3465 break; 3466 if ((*pte & VPTE_MANAGED) == 0) 3467 continue; 3468 3469 m = PHYS_TO_VM_PAGE(*pte & VPTE_FRAME); 3470 if (vm_page_busy_try(m, TRUE) == 0) { 3471 if (pginfo->callback(pginfo, sva, m) < 0) 3472 stop = 1; 3473 } 3474 } 3475 vm_page_unhold(pt_m); 3476 } 3477 vm_object_drop(pmap->pm_pteobj); 3478 } 3479
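
/*
 * Illustrative use of pmap_pgscan() -- a sketch only.  The callback
 * prototype and the responsibility for unbusying the page are inferred
 * from the scan loop above (the page is handed over busied and a
 * negative return value stops the scan); any other details are
 * assumptions, so the example is kept under #if 0.
 */
#if 0
static int
example_pgscan_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t m)
{
	/* inspect or adjust the managed page mapped at va here */
	vm_page_wakeup(m);	/* assumed: callback unbusies the page */
	return 0;		/* continue; return < 0 to stop the scan */
}

static void
example_pgscan(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct pmap_pgscan_info info;

	bzero(&info, sizeof(info));
	info.pmap = pmap;
	info.beg_addr = sva;
	info.end_addr = eva;
	info.callback = example_pgscan_callback;
	pmap_pgscan(&info);
}
#endif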