/*
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2003 Peter Wemm
 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 2008-2019 The DragonFly Project.
 * Copyright (c) 2008, 2009 Jordan Gordeev.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from:   @(#)pmap.c	7.7 (Berkeley)	5/12/91
 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
 */

/*
 * Manages physical address maps.
 */

#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/vmspace.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pcb.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <ddb/ddb.h>

#include <stdio.h>
#include <assert.h>
#include <stdlib.h>

#define PMAP_KEEP_PDIRS
#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 1000
#endif

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

#if !defined(PMAP_DIAGNOSTIC)
#define PMAP_INLINE __inline
#else
#define PMAP_INLINE
#endif

/*
 * Get PDEs and PTEs for user/kernel address space
 */
static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va);
#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pte)	((*(pd_entry_t *)pte & VPTE_V) != 0)
#define pmap_pte_w(pte)	((*(pt_entry_t *)pte & VPTE_WIRED) != 0)
#define pmap_pte_m(pte)	((*(pt_entry_t *)pte & VPTE_M) != 0)
#define pmap_pte_u(pte)	((*(pt_entry_t *)pte & VPTE_A) != 0)
#define pmap_pte_v(pte)	((*(pt_entry_t *)pte & VPTE_V) != 0)

/*
 * Given a map and a machine independent protection code,
 * convert to a vax protection code.
 */
#define pte_prot(m, p)		\
	(protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
static uint64_t protection_codes[8];

struct pmap kernel_pmap;

static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */

static struct vm_object kptobj;
static int nkpt;

static uint64_t	KPDphys;	/* phys addr of kernel level 2 */
uint64_t	KPDPphys;	/* phys addr of kernel level 3 */
uint64_t	KPML4phys;	/* phys addr of kernel level 4 */

extern int vmm_enabled;
extern void *vkernel_stack;

/*
 * Data for the pv entry allocation mechanism
 */
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static vm_pindex_t pv_entry_count = 0;
static vm_pindex_t pv_entry_max = 0;
static vm_pindex_t pv_entry_high_water = 0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL, *ptmmap;
caddr_t CADDR1 = NULL;
static pt_entry_t *msgbufmap;

uint64_t KPTphys;

static PMAP_INLINE void	free_pv_entry (pv_entry_t pv);
static pv_entry_t get_pv_entry (void);
static void	x86_64_protection_init (void);
static __inline void	pmap_clearbit (vm_page_t m, int bit);

static void	pmap_remove_all (vm_page_t m);
static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
				pt_entry_t oldpte, vm_offset_t sva);
static void pmap_remove_page (struct pmap *pmap, vm_offset_t va);
static int pmap_remove_entry (struct pmap *pmap, vm_page_t m,
				vm_offset_t va);
static boolean_t pmap_testbit (vm_page_t m, int bit);
static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
				vm_page_t mpte, vm_page_t m, pv_entry_t);

static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);

static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t);

static int
pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
{
	if (pv1->pv_va < pv2->pv_va)
		return(-1);
	if (pv1->pv_va > pv2->pv_va)
		return(1);
	return(0);
}

RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
	     pv_entry_compare, vm_offset_t, pv_va);

static __inline vm_pindex_t
pmap_pt_pindex(vm_offset_t va)
{
	return va >> PDRSHIFT;
}

static __inline vm_pindex_t
pmap_pte_index(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pde_index(vm_offset_t va)
{
	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{
	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{
	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & VPTE_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

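/*
 * Illustrative sketch (not compiled): how the index helpers above
 * decompose a 4-level x86_64 VA, assuming the usual 9-bits-per-level
 * geometry (PAGE_SHIFT 12, PDRSHIFT 21, PDPSHIFT 30, PML4SHIFT 39):
 *
 *	va = 0x0000000123456789UL;
 *	pmap_pml4e_index(va);	// (va >> 39) & 511 == 0
 *	pmap_pdpe_index(va);	// (va >> 30) & 511 == 4
 *	pmap_pde_index(va);	// (va >> 21) & 511 == 282
 *	pmap_pte_index(va);	// (va >> 12) & 511 == 86
 *
 * pmap_pml4e()/pmap_pdpe()/pmap_pde()/pmap_pte() chain these indices
 * through PHYS_TO_DMAP() lookups, one level at a time.
 */
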
/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;

	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & VPTE_V) == 0)
		return NULL;
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & VPTE_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & VPTE_V) == 0)
		return NULL;
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & VPTE_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/*
 * Hold pt_m for page table scans to prevent it from getting reused out
 * from under us across blocking conditions in the body of the loop.
 */
static __inline
vm_page_t
pmap_hold_pt_page(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t pte;
	vm_page_t pt_m;

	pte = (pt_entry_t)*pde;
	KKASSERT(pte != 0);
	pt_m = PHYS_TO_VM_PAGE(pte & VPTE_FRAME);
	vm_page_hold(pt_m);

	return pt_m;
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & VPTE_V) == 0)
		return NULL;
	if ((*pde & VPTE_PS) != 0)	/* compat with x86 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

static PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	pt_entry_t *x;
	x = pmap_pte(&kernel_pmap, va);
	assert(x != NULL);
	return x;
}

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	pd_entry_t *x;
	x = pmap_pde(&kernel_pmap, va);
	assert(x != NULL);
	return x;
}

/*
 * Returns the physical address translation from va for a user address.
 * (vm_paddr_t)-1 is returned on failure.
 */
345 */ 346 vm_paddr_t 347 uservtophys(vm_offset_t va) 348 { 349 struct vmspace *vm = curproc->p_vmspace; 350 vm_page_t m; 351 vm_paddr_t pa; 352 int error; 353 int busy; 354 355 /* XXX No idea how to handle this case in a simple way, just abort */ 356 if (PAGE_SIZE - (va & PAGE_MASK) < sizeof(u_int)) 357 return ((vm_paddr_t)-1); 358 359 m = vm_fault_page(&vm->vm_map, trunc_page(va), 360 VM_PROT_READ|VM_PROT_WRITE, 361 VM_FAULT_NORMAL, 362 &error, &busy); 363 if (error) 364 return ((vm_paddr_t)-1); 365 366 pa = VM_PAGE_TO_PHYS(m) | (va & PAGE_MASK); 367 if (busy) 368 vm_page_wakeup(m); 369 else 370 vm_page_unhold(m); 371 372 return pa; 373 } 374 375 static uint64_t 376 allocpages(vm_paddr_t *firstaddr, int n) 377 { 378 uint64_t ret; 379 380 ret = *firstaddr; 381 /*bzero((void *)ret, n * PAGE_SIZE); not mapped yet */ 382 *firstaddr += n * PAGE_SIZE; 383 return (ret); 384 } 385 386 static void 387 create_dmap_vmm(vm_paddr_t *firstaddr) 388 { 389 void *stack_addr; 390 int pml4_stack_index; 391 int pdp_stack_index; 392 int pd_stack_index; 393 long i,j; 394 int regs[4]; 395 int amd_feature; 396 397 uint64_t KPDP_DMAP_phys = allocpages(firstaddr, NDMPML4E); 398 uint64_t KPDP_VSTACK_phys = allocpages(firstaddr, 1); 399 uint64_t KPD_VSTACK_phys = allocpages(firstaddr, 1); 400 401 pml4_entry_t *KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 402 pdp_entry_t *KPDP_DMAP_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_DMAP_phys); 403 pdp_entry_t *KPDP_VSTACK_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_VSTACK_phys); 404 pd_entry_t *KPD_VSTACK_virt = (pd_entry_t *)PHYS_TO_DMAP(KPD_VSTACK_phys); 405 406 bzero(KPDP_DMAP_virt, NDMPML4E * PAGE_SIZE); 407 bzero(KPDP_VSTACK_virt, 1 * PAGE_SIZE); 408 bzero(KPD_VSTACK_virt, 1 * PAGE_SIZE); 409 410 do_cpuid(0x80000001, regs); 411 amd_feature = regs[3]; 412 413 /* Build the mappings for the first 512GB */ 414 if (amd_feature & AMDID_PAGE1GB) { 415 /* In pages of 1 GB, if supported */ 416 for (i = 0; i < NPDPEPG; i++) { 417 KPDP_DMAP_virt[i] = ((uint64_t)i << PDPSHIFT); 418 KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_PS | VPTE_U; 419 } 420 } else { 421 /* In page of 2MB, otherwise */ 422 for (i = 0; i < NPDPEPG; i++) { 423 uint64_t KPD_DMAP_phys; 424 pd_entry_t *KPD_DMAP_virt; 425 426 KPD_DMAP_phys = allocpages(firstaddr, 1); 427 KPD_DMAP_virt = 428 (pd_entry_t *)PHYS_TO_DMAP(KPD_DMAP_phys); 429 430 bzero(KPD_DMAP_virt, PAGE_SIZE); 431 432 KPDP_DMAP_virt[i] = KPD_DMAP_phys; 433 KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_U; 434 435 /* For each PD, we have to allocate NPTEPG PT */ 436 for (j = 0; j < NPTEPG; j++) { 437 KPD_DMAP_virt[j] = (i << PDPSHIFT) | 438 (j << PDRSHIFT); 439 KPD_DMAP_virt[j] |= VPTE_RW | VPTE_V | 440 VPTE_PS | VPTE_U; 441 } 442 } 443 } 444 445 /* DMAP for the first 512G */ 446 KPML4virt[0] = KPDP_DMAP_phys; 447 KPML4virt[0] |= VPTE_RW | VPTE_V | VPTE_U; 448 449 /* create a 2 MB map of the new stack */ 450 pml4_stack_index = (uint64_t)&stack_addr >> PML4SHIFT; 451 KPML4virt[pml4_stack_index] = KPDP_VSTACK_phys; 452 KPML4virt[pml4_stack_index] |= VPTE_RW | VPTE_V | VPTE_U; 453 454 pdp_stack_index = ((uint64_t)&stack_addr & PML4MASK) >> PDPSHIFT; 455 KPDP_VSTACK_virt[pdp_stack_index] = KPD_VSTACK_phys; 456 KPDP_VSTACK_virt[pdp_stack_index] |= VPTE_RW | VPTE_V | VPTE_U; 457 458 pd_stack_index = ((uint64_t)&stack_addr & PDPMASK) >> PDRSHIFT; 459 KPD_VSTACK_virt[pd_stack_index] = (uint64_t) vkernel_stack; 460 KPD_VSTACK_virt[pd_stack_index] |= VPTE_RW | VPTE_V | VPTE_U | VPTE_PS; 461 } 462 463 static void 464 create_pagetables(vm_paddr_t *firstaddr, 
static void
create_pagetables(vm_paddr_t *firstaddr, int64_t ptov_offset)
{
	int i;
	pml4_entry_t *KPML4virt;
	pdp_entry_t *KPDPvirt;
	pd_entry_t *KPDvirt;
	pt_entry_t *KPTvirt;
	int kpml4i = pmap_pml4e_index(ptov_offset);
	int kpdpi = pmap_pdpe_index(ptov_offset);
	int kpdi = pmap_pde_index(ptov_offset);

	/*
	 * Calculate NKPT - number of kernel page tables.  We have to
	 * accommodate preallocation of the vm_page_array, dump bitmap,
	 * MSGBUF_SIZE, and other stuff.  Be generous.
	 *
	 * Maxmem is in pages.
	 */
	nkpt = (Maxmem * (sizeof(struct vm_page) * 2) + MSGBUF_SIZE) / NBPDR;

	/*
	 * Allocate pages
	 */
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
	KPDphys = allocpages(firstaddr, NKPDPE);
	KPTphys = allocpages(firstaddr, nkpt);

	KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
	KPDPvirt = (pdp_entry_t *)PHYS_TO_DMAP(KPDPphys);
	KPDvirt = (pd_entry_t *)PHYS_TO_DMAP(KPDphys);
	KPTvirt = (pt_entry_t *)PHYS_TO_DMAP(KPTphys);

	bzero(KPML4virt, 1 * PAGE_SIZE);
	bzero(KPDPvirt, NKPML4E * PAGE_SIZE);
	bzero(KPDvirt, NKPDPE * PAGE_SIZE);
	bzero(KPTvirt, nkpt * PAGE_SIZE);

	/* Now map the page tables at their location within PTmap */
	for (i = 0; i < nkpt; i++) {
		KPDvirt[i + kpdi] = KPTphys + (i << PAGE_SHIFT);
		KPDvirt[i + kpdi] |= VPTE_RW | VPTE_V | VPTE_U;
	}

	/* And connect up the PD to the PDP */
	for (i = 0; i < NKPDPE; i++) {
		KPDPvirt[i + kpdpi] = KPDphys + (i << PAGE_SHIFT);
		KPDPvirt[i + kpdpi] |= VPTE_RW | VPTE_V | VPTE_U;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	KPML4virt[PML4PML4I] = KPML4phys;
	KPML4virt[PML4PML4I] |= VPTE_RW | VPTE_V | VPTE_U;

	/* Connect the KVA slot up to the PML4 */
	KPML4virt[kpml4i] = KPDPphys;
	KPML4virt[kpml4i] |= VPTE_RW | VPTE_V | VPTE_U;
}

/*
 * Typically used to initialize a fictitious page by vm/device_pager.c
 */
void
pmap_page_init(struct vm_page *m)
{
	vm_page_init(m);
	TAILQ_INIT(&m->md.pv_list);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On x86_64 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr, int64_t ptov_offset)
{
	vm_offset_t va;
	pt_entry_t *pte;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr, ptov_offset);

	/* Create the DMAP for the VMM */
	if (vmm_enabled) {
		create_dmap_vmm(firstaddr);
	}

	virtual_start = KvaStart;
	virtual_end = KvaEnd;

	/*
	 * Initialize protection array.
	 */
	x86_64_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence (XXX and which no longer exists).
	 *
	 * The kernel_pmap's pm_pteobj is used only for locking and not
	 * for mmu pages.
	 */
573 */ 574 kernel_pmap.pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 575 kernel_pmap.pm_count = 1; 576 /* don't allow deactivation */ 577 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 578 kernel_pmap.pm_pteobj = NULL; /* see pmap_init */ 579 RB_INIT(&kernel_pmap.pm_pvroot); 580 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 581 582 /* 583 * Reserve some special page table entries/VA space for temporary 584 * mapping of pages. 585 */ 586 #define SYSMAP(c, p, v, n) \ 587 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 588 589 va = virtual_start; 590 pte = pmap_pte(&kernel_pmap, va); 591 /* 592 * CMAP1/CMAP2 are used for zeroing and copying pages. 593 */ 594 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 595 596 #if 0 /* JGV */ 597 /* 598 * Crashdump maps. 599 */ 600 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 601 #endif 602 603 /* 604 * ptvmmap is used for reading arbitrary physical pages via 605 * /dev/mem. 606 */ 607 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 608 609 /* 610 * msgbufp is used to map the system message buffer. 611 * XXX msgbufmap is not used. 612 */ 613 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 614 atop(round_page(MSGBUF_SIZE))) 615 616 virtual_start = va; 617 618 *CMAP1 = 0; 619 /* Not ready to do an invltlb yet for VMM*/ 620 if (!vmm_enabled) 621 cpu_invltlb(); 622 623 } 624 625 /* 626 * Initialize the pmap module. 627 * Called by vm_init, to initialize any structures that the pmap 628 * system needs to map virtual memory. 629 * pmap_init has been enhanced to support in a fairly consistant 630 * way, discontiguous physical memory. 631 */ 632 void 633 pmap_init(void) 634 { 635 vm_pindex_t i; 636 vm_pindex_t initial_pvs; 637 638 /* 639 * object for kernel page table pages 640 */ 641 /* JG I think the number can be arbitrary */ 642 vm_object_init(&kptobj, 5); 643 kernel_pmap.pm_pteobj = &kptobj; 644 645 /* 646 * Allocate memory for random pmap data structures. Includes the 647 * pv_head_table. 648 */ 649 for (i = 0; i < vm_page_array_size; i++) { 650 vm_page_t m; 651 652 m = &vm_page_array[i]; 653 TAILQ_INIT(&m->md.pv_list); 654 m->md.pv_list_count = 0; 655 } 656 657 /* 658 * init the pv free list 659 */ 660 initial_pvs = vm_page_array_size; 661 if (initial_pvs < MINPV) 662 initial_pvs = MINPV; 663 pvzone = &pvzone_store; 664 pvinit = (struct pv_entry *) 665 kmem_alloc(&kernel_map, 666 initial_pvs * sizeof (struct pv_entry), 667 VM_SUBSYS_PVENTRY); 668 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, 669 initial_pvs); 670 671 /* 672 * Now it is safe to enable pv_table recording. 673 */ 674 pmap_initialized = TRUE; 675 } 676 677 /* 678 * Initialize the address space (zone) for the pv_entries. Set a 679 * high water mark so that the system can recover from excessive 680 * numbers of pv entries. 681 */ 682 void 683 pmap_init2(void) 684 { 685 vm_pindex_t shpgperproc = PMAP_SHPGPERPROC; 686 687 TUNABLE_LONG_FETCH("vm.pmap.shpgperproc", &shpgperproc); 688 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 689 TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &pv_entry_max); 690 pv_entry_high_water = 9 * (pv_entry_max / 10); 691 zinitna(pvzone, NULL, 0, pv_entry_max, ZONE_INTERRUPT); 692 } 693 694 695 /*************************************************** 696 * Low level helper routines..... 697 ***************************************************/ 698 699 /* 700 * The modification bit is not tracked for any pages in this range. XXX 701 * such pages in this maps should always use pmap_k*() functions and not 702 * be managed anyhow. 
703 * 704 * XXX User and kernel address spaces are independant for virtual kernels, 705 * this function only applies to the kernel pmap. 706 */ 707 static void 708 pmap_track_modified(pmap_t pmap, vm_offset_t va) 709 { 710 KKASSERT(pmap != &kernel_pmap || 711 va < clean_sva || va >= clean_eva); 712 } 713 714 /* 715 * Extract the physical page address associated with the map/VA pair. 716 * 717 * No requirements. 718 */ 719 vm_paddr_t 720 pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep) 721 { 722 vm_paddr_t rtval; 723 pt_entry_t *pte; 724 pd_entry_t pde, *pdep; 725 726 vm_object_hold(pmap->pm_pteobj); 727 rtval = 0; 728 pdep = pmap_pde(pmap, va); 729 if (pdep != NULL) { 730 pde = *pdep; 731 if (pde) { 732 if ((pde & VPTE_PS) != 0) { 733 /* JGV */ 734 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 735 } else { 736 pte = pmap_pde_to_pte(pdep, va); 737 rtval = (*pte & VPTE_FRAME) | (va & PAGE_MASK); 738 } 739 } 740 } 741 if (handlep) 742 *handlep = NULL; /* XXX */ 743 vm_object_drop(pmap->pm_pteobj); 744 745 return rtval; 746 } 747 748 void 749 pmap_extract_done(void *handle) 750 { 751 pmap_t pmap; 752 753 if (handle) { 754 pmap = handle; 755 vm_object_drop(pmap->pm_pteobj); 756 } 757 } 758 759 /* 760 * Similar to extract but checks protections, SMP-friendly short-cut for 761 * vm_fault_page[_quick](). 762 * 763 * WARNING! THE RETURNED PAGE IS ONLY HELD AND NEITHER IT NOR ITS TARGET 764 * DATA IS SUITABLE FOR WRITING. Writing can interfere with 765 * pageouts flushes, msync, etc. The hold_count is not enough 766 * to avoid races against pageouts and other flush code doesn't 767 * care about hold_count. 768 */ 769 vm_page_t 770 pmap_fault_page_quick(pmap_t pmap __unused, vm_offset_t vaddr __unused, 771 vm_prot_t prot __unused, int *busyp __unused) 772 { 773 return(NULL); 774 } 775 776 /* 777 * Routine: pmap_kextract 778 * Function: 779 * Extract the physical page address associated 780 * kernel virtual address. 781 */ 782 vm_paddr_t 783 pmap_kextract(vm_offset_t va) 784 { 785 pd_entry_t pde; 786 vm_paddr_t pa; 787 788 KKASSERT(va >= KvaStart && va < KvaEnd); 789 790 /* 791 * The DMAP region is not included in [KvaStart, KvaEnd) 792 */ 793 #if 0 794 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 795 pa = DMAP_TO_PHYS(va); 796 } else { 797 #endif 798 pde = *vtopde(va); 799 if (pde & VPTE_PS) { 800 /* JGV */ 801 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 802 } else { 803 /* 804 * Beware of a concurrent promotion that changes the 805 * PDE at this point! For example, vtopte() must not 806 * be used to access the PTE because it would use the 807 * new PDE. It is, however, safe to use the old PDE 808 * because the page table page is preserved by the 809 * promotion. 810 */ 811 pa = *pmap_pde_to_pte(&pde, va); 812 pa = (pa & VPTE_FRAME) | (va & PAGE_MASK); 813 } 814 #if 0 815 } 816 #endif 817 return pa; 818 } 819 820 /*************************************************** 821 * Low level mapping routines..... 822 ***************************************************/ 823 824 /* 825 * Enter a mapping into kernel_pmap. Mappings created in this fashion 826 * are not managed. Mappings must be immediately accessible on all cpus. 827 * 828 * Call pmap_inval_pte() to invalidate the virtual pte and clean out the 829 * real pmap and handle related races before storing the new vpte. The 830 * new semantics for kenter require use to do an UNCONDITIONAL invalidation, 831 * because the entry may have previously been cleared without an invalidation. 
832 */ 833 void 834 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 835 { 836 pt_entry_t *ptep; 837 pt_entry_t npte; 838 839 KKASSERT(va >= KvaStart && va < KvaEnd); 840 npte = pa | VPTE_RW | VPTE_V | VPTE_U; 841 ptep = vtopte(va); 842 843 #if 1 844 pmap_inval_pte(ptep, &kernel_pmap, va); 845 #else 846 if (*pte & VPTE_V) 847 pmap_inval_pte(ptep, &kernel_pmap, va); 848 #endif 849 atomic_swap_long(ptep, npte); 850 } 851 852 /* 853 * Enter an unmanaged KVA mapping for the private use of the current 854 * cpu only. 855 * 856 * It is illegal for the mapping to be accessed by other cpus without 857 * proper invalidation. 858 */ 859 int 860 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 861 { 862 pt_entry_t *ptep; 863 pt_entry_t npte; 864 int res; 865 866 KKASSERT(va >= KvaStart && va < KvaEnd); 867 868 npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; 869 ptep = vtopte(va); 870 871 #if 1 872 pmap_inval_pte_quick(ptep, &kernel_pmap, va); 873 res = 1; 874 #else 875 /* FUTURE */ 876 res = (*ptep != 0); 877 if (*pte & VPTE_V) 878 pmap_inval_pte(pte, &kernel_pmap, va); 879 #endif 880 atomic_swap_long(ptep, npte); 881 882 return res; 883 } 884 885 /* 886 * Invalidation will occur later, ok to be lazy here. 887 */ 888 int 889 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 890 { 891 pt_entry_t *ptep; 892 pt_entry_t npte; 893 int res; 894 895 KKASSERT(va >= KvaStart && va < KvaEnd); 896 897 npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; 898 ptep = vtopte(va); 899 #if 1 900 res = 1; 901 #else 902 /* FUTURE */ 903 res = (*ptep != 0); 904 #endif 905 atomic_swap_long(ptep, npte); 906 907 return res; 908 } 909 910 /* 911 * Remove an unmanaged mapping created with pmap_kenter*(). 912 */ 913 void 914 pmap_kremove(vm_offset_t va) 915 { 916 pt_entry_t *ptep; 917 918 KKASSERT(va >= KvaStart && va < KvaEnd); 919 920 ptep = vtopte(va); 921 atomic_swap_long(ptep, 0); 922 pmap_inval_pte(ptep, &kernel_pmap, va); 923 } 924 925 /* 926 * Remove an unmanaged mapping created with pmap_kenter*() but synchronize 927 * only with this cpu. 928 * 929 * Unfortunately because we optimize new entries by testing VPTE_V later 930 * on, we actually still have to synchronize with all the cpus. XXX maybe 931 * store a junk value and test against 0 in the other places instead? 932 */ 933 void 934 pmap_kremove_quick(vm_offset_t va) 935 { 936 pt_entry_t *ptep; 937 938 KKASSERT(va >= KvaStart && va < KvaEnd); 939 940 ptep = vtopte(va); 941 atomic_swap_long(ptep, 0); 942 pmap_inval_pte(ptep, &kernel_pmap, va); /* NOT _quick */ 943 } 944 945 /* 946 * Invalidation will occur later, ok to be lazy here. 947 */ 948 void 949 pmap_kremove_noinval(vm_offset_t va) 950 { 951 pt_entry_t *ptep; 952 953 KKASSERT(va >= KvaStart && va < KvaEnd); 954 955 ptep = vtopte(va); 956 atomic_swap_long(ptep, 0); 957 } 958 959 /* 960 * Used to map a range of physical addresses into kernel 961 * virtual address space. 962 * 963 * For now, VM is already on, we only need to map the 964 * specified memory. 965 */ 966 vm_offset_t 967 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 968 { 969 return PHYS_TO_DMAP(start); 970 } 971 972 /* 973 * Map a set of unmanaged VM pages into KVM. 
974 */ 975 static __inline void 976 _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval) 977 { 978 vm_offset_t end_va; 979 vm_offset_t va; 980 981 end_va = beg_va + count * PAGE_SIZE; 982 KKASSERT(beg_va >= KvaStart && end_va <= KvaEnd); 983 984 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 985 pt_entry_t *ptep; 986 987 ptep = vtopte(va); 988 atomic_swap_long(ptep, VM_PAGE_TO_PHYS(*m) | 989 VPTE_RW | VPTE_V | VPTE_U); 990 ++m; 991 } 992 if (doinval) 993 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 994 /* pmap_inval_pte(pte, &kernel_pmap, va); */ 995 } 996 997 void 998 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 999 { 1000 _pmap_qenter(beg_va, m, count, 1); 1001 } 1002 1003 void 1004 pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count) 1005 { 1006 _pmap_qenter(beg_va, m, count, 0); 1007 } 1008 1009 /* 1010 * Undo the effects of pmap_qenter*(). 1011 */ 1012 void 1013 pmap_qremove(vm_offset_t beg_va, int count) 1014 { 1015 vm_offset_t end_va; 1016 vm_offset_t va; 1017 1018 end_va = beg_va + count * PAGE_SIZE; 1019 KKASSERT(beg_va >= KvaStart && end_va < KvaEnd); 1020 1021 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1022 pt_entry_t *ptep; 1023 1024 ptep = vtopte(va); 1025 atomic_swap_long(ptep, 0); 1026 } 1027 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1028 } 1029 1030 /* 1031 * Unlike the real pmap code, we can't avoid calling the real-kernel. 1032 */ 1033 void 1034 pmap_qremove_quick(vm_offset_t va, int count) 1035 { 1036 pmap_qremove(va, count); 1037 } 1038 1039 void 1040 pmap_qremove_noinval(vm_offset_t va, int count) 1041 { 1042 pmap_qremove(va, count); 1043 } 1044 1045 /* 1046 * This routine works like vm_page_lookup() but also blocks as long as the 1047 * page is busy. This routine does not busy the page it returns. 1048 * 1049 * Unless the caller is managing objects whos pages are in a known state, 1050 * the call should be made with a critical section held so the page's object 1051 * association remains valid on return. 1052 */ 1053 static vm_page_t 1054 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) 1055 { 1056 vm_page_t m; 1057 1058 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 1059 m = vm_page_lookup_busy_wait(object, pindex, TRUE, "pplookp"); 1060 1061 return(m); 1062 } 1063 1064 /* 1065 * Create a new thread and optionally associate it with a (new) process. 1066 * NOTE! the new thread's cpu may not equal the current cpu. 1067 */ 1068 void 1069 pmap_init_thread(thread_t td) 1070 { 1071 /* enforce pcb placement */ 1072 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 1073 td->td_savefpu = &td->td_pcb->pcb_save; 1074 td->td_sp = (char *)td->td_pcb - 16; /* JG is -16 needed on x86_64? */ 1075 } 1076 1077 /* 1078 * This routine directly affects the fork perf for a process. 1079 */ 1080 void 1081 pmap_init_proc(struct proc *p) 1082 { 1083 } 1084 1085 /* 1086 * Unwire a page table which has been removed from the pmap. We own the 1087 * wire_count, so the page cannot go away. The page representing the page 1088 * table is passed in unbusied and must be busied if we cannot trivially 1089 * unwire it. 1090 * 1091 * XXX NOTE! This code is not usually run because we do not currently 1092 * implement dynamic page table page removal. The page in 1093 * its parent assumes at least 1 wire count, so no call to this 1094 * function ever sees a wire count less than 2. 
1095 */ 1096 static int 1097 pmap_unwire_pgtable(pmap_t pmap, vm_offset_t va, vm_page_t m) 1098 { 1099 /* 1100 * Try to unwire optimally. If non-zero is returned the wire_count 1101 * is 1 and we must busy the page to unwire it. 1102 */ 1103 if (vm_page_unwire_quick(m) == 0) 1104 return 0; 1105 1106 vm_page_busy_wait(m, TRUE, "pmuwpt"); 1107 KASSERT(m->queue == PQ_NONE, 1108 ("_pmap_unwire_pgtable: %p->queue != PQ_NONE", m)); 1109 1110 if (m->wire_count == 1) { 1111 /* 1112 * Unmap the page table page. 1113 */ 1114 /* pmap_inval_add(info, pmap, -1); */ 1115 1116 if (m->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1117 /* PDP page */ 1118 pml4_entry_t *pml4; 1119 pml4 = pmap_pml4e(pmap, va); 1120 *pml4 = 0; 1121 } else if (m->pindex >= NUPT_TOTAL) { 1122 /* PD page */ 1123 pdp_entry_t *pdp; 1124 pdp = pmap_pdpe(pmap, va); 1125 *pdp = 0; 1126 } else { 1127 /* PT page */ 1128 pd_entry_t *pd; 1129 pd = pmap_pde(pmap, va); 1130 *pd = 0; 1131 } 1132 1133 KKASSERT(pmap->pm_stats.resident_count > 0); 1134 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1135 1136 if (pmap->pm_ptphint == m) 1137 pmap->pm_ptphint = NULL; 1138 1139 if (m->pindex < NUPT_TOTAL) { 1140 /* We just released a PT, unhold the matching PD */ 1141 vm_page_t pdpg; 1142 1143 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & 1144 VPTE_FRAME); 1145 pmap_unwire_pgtable(pmap, va, pdpg); 1146 } 1147 if (m->pindex >= NUPT_TOTAL && 1148 m->pindex < (NUPT_TOTAL + NUPD_TOTAL)) { 1149 /* We just released a PD, unhold the matching PDP */ 1150 vm_page_t pdppg; 1151 1152 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & 1153 VPTE_FRAME); 1154 pmap_unwire_pgtable(pmap, va, pdppg); 1155 } 1156 1157 /* 1158 * This was our last wire, the page had better be unwired 1159 * after we decrement wire_count. 1160 * 1161 * FUTURE NOTE: shared page directory page could result in 1162 * multiple wire counts. 1163 */ 1164 vm_page_unwire(m, 0); 1165 KKASSERT(m->wire_count == 0); 1166 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1167 vm_page_flash(m); 1168 vm_page_free(m); 1169 return 1; 1170 } else { 1171 /* XXX SMP race to 1 if not holding vmobj */ 1172 vm_page_unwire(m, 0); 1173 vm_page_wakeup(m); 1174 return 0; 1175 } 1176 } 1177 1178 /* 1179 * After removing a page table entry, this routine is used to 1180 * conditionally free the page, and manage the hold/wire counts. 1181 * 1182 * If not NULL the caller owns a wire_count on mpte, so it can't disappear. 1183 * If NULL the caller owns a wire_count on what would be the mpte, we must 1184 * look it up. 1185 */ 1186 static int 1187 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 1188 { 1189 vm_pindex_t ptepindex; 1190 1191 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1192 1193 if (mpte == NULL) { 1194 /* 1195 * page table pages in the kernel_pmap are not managed. 1196 */ 1197 if (pmap == &kernel_pmap) 1198 return(0); 1199 ptepindex = pmap_pt_pindex(va); 1200 if (pmap->pm_ptphint && 1201 (pmap->pm_ptphint->pindex == ptepindex)) { 1202 mpte = pmap->pm_ptphint; 1203 } else { 1204 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1205 pmap->pm_ptphint = mpte; 1206 vm_page_wakeup(mpte); 1207 } 1208 } 1209 return pmap_unwire_pgtable(pmap, va, mpte); 1210 } 1211 1212 /* 1213 * Initialize pmap0/vmspace0 . Since process 0 never enters user mode we 1214 * just dummy it up so it works well enough for fork(). 1215 * 1216 * In DragonFly, process pmaps may only be used to manipulate user address 1217 * space, never kernel address space. 
1218 */ 1219 void 1220 pmap_pinit0(struct pmap *pmap) 1221 { 1222 pmap_pinit(pmap); 1223 } 1224 1225 /* 1226 * Initialize a preallocated and zeroed pmap structure, 1227 * such as one in a vmspace structure. 1228 */ 1229 void 1230 pmap_pinit(struct pmap *pmap) 1231 { 1232 vm_page_t ptdpg; 1233 1234 /* 1235 * No need to allocate page table space yet but we do need a valid 1236 * page directory table. 1237 */ 1238 if (pmap->pm_pml4 == NULL) { 1239 pmap->pm_pml4 = (pml4_entry_t *) 1240 kmem_alloc_pageable(&kernel_map, PAGE_SIZE, 1241 VM_SUBSYS_PML4); 1242 } 1243 1244 /* 1245 * Allocate an object for the ptes 1246 */ 1247 if (pmap->pm_pteobj == NULL) 1248 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL + 1); 1249 1250 /* 1251 * Allocate the page directory page, unless we already have 1252 * one cached. If we used the cached page the wire_count will 1253 * already be set appropriately. 1254 */ 1255 if ((ptdpg = pmap->pm_pdirm) == NULL) { 1256 ptdpg = vm_page_grab(pmap->pm_pteobj, 1257 NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL, 1258 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | 1259 VM_ALLOC_ZERO); 1260 pmap->pm_pdirm = ptdpg; 1261 vm_page_flag_clear(ptdpg, PG_MAPPED | PG_WRITEABLE); 1262 vm_page_wire(ptdpg); 1263 vm_page_wakeup(ptdpg); 1264 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); 1265 } 1266 pmap->pm_count = 1; 1267 CPUMASK_ASSZERO(pmap->pm_active); 1268 pmap->pm_ptphint = NULL; 1269 RB_INIT(&pmap->pm_pvroot); 1270 spin_init(&pmap->pm_spin, "pmapinit"); 1271 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1272 pmap->pm_stats.resident_count = 1; 1273 pmap->pm_stats.wired_count = 1; 1274 } 1275 1276 /* 1277 * Clean up a pmap structure so it can be physically freed. This routine 1278 * is called by the vmspace dtor function. A great deal of pmap data is 1279 * left passively mapped to improve vmspace management so we have a bit 1280 * of cleanup work to do here. 1281 * 1282 * No requirements. 1283 */ 1284 void 1285 pmap_puninit(pmap_t pmap) 1286 { 1287 vm_page_t p; 1288 1289 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 1290 if ((p = pmap->pm_pdirm) != NULL) { 1291 KKASSERT(pmap->pm_pml4 != NULL); 1292 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1293 vm_page_busy_wait(p, TRUE, "pgpun"); 1294 vm_page_unwire(p, 0); 1295 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1296 vm_page_free(p); 1297 pmap->pm_pdirm = NULL; 1298 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1299 KKASSERT(pmap->pm_stats.wired_count == 0); 1300 } 1301 if (pmap->pm_pml4) { 1302 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1303 pmap->pm_pml4 = NULL; 1304 } 1305 if (pmap->pm_pteobj) { 1306 vm_object_deallocate(pmap->pm_pteobj); 1307 pmap->pm_pteobj = NULL; 1308 } 1309 } 1310 1311 /* 1312 * This function is now unused (used to add the pmap to the pmap_list) 1313 */ 1314 void 1315 pmap_pinit2(struct pmap *pmap) 1316 { 1317 } 1318 1319 /* 1320 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1321 * 0 on failure (if the procedure had to sleep). 1322 * 1323 * When asked to remove the page directory page itself, we actually just 1324 * leave it cached so we do not have to incur the SMP inval overhead of 1325 * removing the kernel mapping. pmap_puninit() will take care of it. 1326 */ 1327 static int 1328 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1329 { 1330 /* 1331 * This code optimizes the case of freeing non-busy 1332 * page-table pages. Those pages are zero now, and 1333 * might as well be placed directly into the zero queue. 
1334 */ 1335 if (vm_page_busy_try(p, TRUE)) { 1336 vm_page_sleep_busy(p, TRUE, "pmaprl"); 1337 return 1; 1338 } 1339 1340 /* 1341 * Remove the page table page from the processes address space. 1342 */ 1343 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1344 /* 1345 * We are the pml4 table itself. 1346 */ 1347 /* XXX anything to do here? */ 1348 } else if (p->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1349 /* 1350 * We are a PDP page. 1351 * We look for the PML4 entry that points to us. 1352 */ 1353 vm_page_t m4; 1354 pml4_entry_t *pml4; 1355 int idx; 1356 1357 m4 = vm_page_lookup(pmap->pm_pteobj, 1358 NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 1359 KKASSERT(m4 != NULL); 1360 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1361 idx = (p->pindex - (NUPT_TOTAL + NUPD_TOTAL)) % NPML4EPG; 1362 KKASSERT(pml4[idx] != 0); 1363 if (pml4[idx] == 0) 1364 kprintf("pmap_release: Unmapped PML4\n"); 1365 pml4[idx] = 0; 1366 vm_page_unwire_quick(m4); 1367 } else if (p->pindex >= NUPT_TOTAL) { 1368 /* 1369 * We are a PD page. 1370 * We look for the PDP entry that points to us. 1371 */ 1372 vm_page_t m3; 1373 pdp_entry_t *pdp; 1374 int idx; 1375 1376 m3 = vm_page_lookup(pmap->pm_pteobj, 1377 NUPT_TOTAL + NUPD_TOTAL + 1378 (p->pindex - NUPT_TOTAL) / NPDPEPG); 1379 KKASSERT(m3 != NULL); 1380 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); 1381 idx = (p->pindex - NUPT_TOTAL) % NPDPEPG; 1382 KKASSERT(pdp[idx] != 0); 1383 if (pdp[idx] == 0) 1384 kprintf("pmap_release: Unmapped PDP %d\n", idx); 1385 pdp[idx] = 0; 1386 vm_page_unwire_quick(m3); 1387 } else { 1388 /* We are a PT page. 1389 * We look for the PD entry that points to us. 1390 */ 1391 vm_page_t m2; 1392 pd_entry_t *pd; 1393 int idx; 1394 1395 m2 = vm_page_lookup(pmap->pm_pteobj, 1396 NUPT_TOTAL + p->pindex / NPDEPG); 1397 KKASSERT(m2 != NULL); 1398 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); 1399 idx = p->pindex % NPDEPG; 1400 if (pd[idx] == 0) 1401 kprintf("pmap_release: Unmapped PD %d\n", idx); 1402 pd[idx] = 0; 1403 vm_page_unwire_quick(m2); 1404 } 1405 KKASSERT(pmap->pm_stats.resident_count > 0); 1406 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1407 1408 if (p->wire_count > 1) { 1409 panic("pmap_release: freeing held pt page " 1410 "pmap=%p pg=%p dmap=%p pi=%ld {%ld,%ld,%ld}", 1411 pmap, p, (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(p)), 1412 p->pindex, NUPT_TOTAL, NUPD_TOTAL, NUPDP_TOTAL); 1413 } 1414 1415 if (pmap->pm_ptphint == p) 1416 pmap->pm_ptphint = NULL; 1417 1418 /* 1419 * We leave the top-level page table page cached, wired, and mapped in 1420 * the pmap until the dtor function (pmap_puninit()) gets called. 1421 * However, still clean it up. 1422 */ 1423 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1424 bzero(pmap->pm_pml4, PAGE_SIZE); 1425 vm_page_wakeup(p); 1426 } else { 1427 vm_page_unwire(p, 0); 1428 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1429 vm_page_free(p); 1430 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1431 } 1432 return 0; 1433 } 1434 1435 /* 1436 * Locate the requested PT, PD, or PDP page table page. 1437 * 1438 * Returns a busied page, caller must vm_page_wakeup() when done. 1439 */ 1440 static vm_page_t 1441 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) 1442 { 1443 vm_page_t m; 1444 vm_page_t pm; 1445 vm_pindex_t pindex; 1446 pt_entry_t *ptep; 1447 pt_entry_t data; 1448 1449 /* 1450 * Find or fabricate a new pagetable page. A non-zero wire_count 1451 * indicates that the page has already been mapped into its parent. 
1452 */ 1453 m = vm_page_grab(pmap->pm_pteobj, ptepindex, 1454 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); 1455 if (m->wire_count != 0) 1456 return m; 1457 1458 /* 1459 * Map the page table page into its parent, giving it 1 wire count. 1460 */ 1461 vm_page_wire(m); 1462 vm_page_unqueue(m); 1463 atomic_add_long(&pmap->pm_stats.resident_count, 1); 1464 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 1465 1466 data = VM_PAGE_TO_PHYS(m) | 1467 VPTE_RW | VPTE_V | VPTE_U | VPTE_A | VPTE_M | VPTE_WIRED; 1468 atomic_add_long(&pmap->pm_stats.wired_count, 1); 1469 1470 if (ptepindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1471 /* 1472 * Map PDP into the PML4 1473 */ 1474 pindex = ptepindex - (NUPT_TOTAL + NUPD_TOTAL); 1475 pindex &= (NUPDP_TOTAL - 1); 1476 ptep = (pt_entry_t *)pmap->pm_pml4; 1477 pm = NULL; 1478 } else if (ptepindex >= NUPT_TOTAL) { 1479 /* 1480 * Map PD into its PDP 1481 */ 1482 pindex = (ptepindex - NUPT_TOTAL) >> NPDPEPGSHIFT; 1483 pindex += NUPT_TOTAL + NUPD_TOTAL; 1484 pm = _pmap_allocpte(pmap, pindex); 1485 pindex = (ptepindex - NUPT_TOTAL) & (NPDPEPG - 1); 1486 ptep = (void *)PHYS_TO_DMAP(pm->phys_addr); 1487 } else { 1488 /* 1489 * Map PT into its PD 1490 */ 1491 pindex = ptepindex >> NPDPEPGSHIFT; 1492 pindex += NUPT_TOTAL; 1493 pm = _pmap_allocpte(pmap, pindex); 1494 pindex = ptepindex & (NPTEPG - 1); 1495 ptep = (void *)PHYS_TO_DMAP(pm->phys_addr); 1496 } 1497 1498 /* 1499 * Install the pte in (pm). (m) prevents races. 1500 */ 1501 ptep += pindex; 1502 data = atomic_swap_long(ptep, data); 1503 if (pm) { 1504 vm_page_wire_quick(pm); 1505 vm_page_wakeup(pm); 1506 } 1507 pmap->pm_ptphint = pm; 1508 1509 return m; 1510 } 1511 1512 /* 1513 * Determine the page table page required to access the VA in the pmap 1514 * and allocate it if necessary. Return a held vm_page_t for the page. 1515 * 1516 * Only used with user pmaps. 1517 */ 1518 static vm_page_t 1519 pmap_allocpte(pmap_t pmap, vm_offset_t va) 1520 { 1521 vm_pindex_t ptepindex; 1522 vm_page_t m; 1523 1524 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1525 1526 /* 1527 * Calculate pagetable page index, and return the PT page to 1528 * the caller. 1529 */ 1530 ptepindex = pmap_pt_pindex(va); 1531 m = _pmap_allocpte(pmap, ptepindex); 1532 1533 return m; 1534 } 1535 1536 /*************************************************** 1537 * Pmap allocation/deallocation routines. 1538 ***************************************************/ 1539 1540 /* 1541 * Release any resources held by the given physical map. 1542 * Called when a pmap initialized by pmap_pinit is being released. 1543 * Should only be called if the map contains no valid mappings. 1544 */ 1545 static int pmap_release_callback(struct vm_page *p, void *data); 1546 1547 void 1548 pmap_release(struct pmap *pmap) 1549 { 1550 vm_object_t object = pmap->pm_pteobj; 1551 struct rb_vm_page_scan_info info; 1552 1553 KKASSERT(pmap != &kernel_pmap); 1554 1555 #if defined(DIAGNOSTIC) 1556 if (object->ref_count != 1) 1557 panic("pmap_release: pteobj reference count != 1"); 1558 #endif 1559 1560 info.pmap = pmap; 1561 info.object = object; 1562 1563 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 1564 ("pmap %p still active! 
%016jx", 1565 pmap, 1566 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 1567 1568 vm_object_hold(object); 1569 do { 1570 info.error = 0; 1571 info.mpte = NULL; 1572 info.limit = object->generation; 1573 1574 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 1575 pmap_release_callback, &info); 1576 if (info.error == 0 && info.mpte) { 1577 if (pmap_release_free_page(pmap, info.mpte)) 1578 info.error = 1; 1579 } 1580 } while (info.error); 1581 1582 pmap->pm_ptphint = NULL; 1583 1584 KASSERT((pmap->pm_stats.wired_count == (pmap->pm_pdirm != NULL)), 1585 ("pmap_release: dangling count %p %ld", 1586 pmap, pmap->pm_stats.wired_count)); 1587 1588 vm_object_drop(object); 1589 } 1590 1591 static int 1592 pmap_release_callback(struct vm_page *p, void *data) 1593 { 1594 struct rb_vm_page_scan_info *info = data; 1595 1596 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1597 info->mpte = p; 1598 return(0); 1599 } 1600 if (pmap_release_free_page(info->pmap, p)) { 1601 info->error = 1; 1602 return(-1); 1603 } 1604 if (info->object->generation != info->limit) { 1605 info->error = 1; 1606 return(-1); 1607 } 1608 return(0); 1609 } 1610 1611 /* 1612 * Grow the number of kernel page table entries, if needed. 1613 * 1614 * kernel_map must be locked exclusively by the caller. 1615 */ 1616 void 1617 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 1618 { 1619 vm_offset_t addr; 1620 vm_paddr_t paddr; 1621 vm_offset_t ptppaddr; 1622 vm_page_t nkpg; 1623 pd_entry_t *pde, newpdir; 1624 pdp_entry_t newpdp; 1625 1626 addr = kend; 1627 1628 vm_object_hold(&kptobj); 1629 if (kernel_vm_end == 0) { 1630 kernel_vm_end = KvaStart; 1631 nkpt = 0; 1632 while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & VPTE_V) != 0) { 1633 kernel_vm_end = 1634 rounddown2(kernel_vm_end + PAGE_SIZE * NPTEPG, 1635 PAGE_SIZE * NPTEPG); 1636 nkpt++; 1637 if (kernel_vm_end - 1 >= vm_map_max(&kernel_map)) { 1638 kernel_vm_end = vm_map_max(&kernel_map); 1639 break; 1640 } 1641 } 1642 } 1643 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1644 if (addr - 1 >= vm_map_max(&kernel_map)) 1645 addr = vm_map_max(&kernel_map); 1646 while (kernel_vm_end < addr) { 1647 pde = pmap_pde(&kernel_pmap, kernel_vm_end); 1648 if (pde == NULL) { 1649 /* We need a new PDP entry */ 1650 nkpg = vm_page_alloc(&kptobj, nkpt, 1651 VM_ALLOC_NORMAL | 1652 VM_ALLOC_SYSTEM | 1653 VM_ALLOC_INTERRUPT); 1654 if (nkpg == NULL) { 1655 panic("pmap_growkernel: no memory to " 1656 "grow kernel"); 1657 } 1658 paddr = VM_PAGE_TO_PHYS(nkpg); 1659 pmap_zero_page(paddr); 1660 newpdp = (pdp_entry_t)(paddr | 1661 VPTE_V | VPTE_RW | VPTE_U | 1662 VPTE_A | VPTE_M | VPTE_WIRED); 1663 *pmap_pdpe(&kernel_pmap, kernel_vm_end) = newpdp; 1664 atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1); 1665 nkpt++; 1666 continue; /* try again */ 1667 } 1668 if ((*pde & VPTE_V) != 0) { 1669 kernel_vm_end = 1670 rounddown2(kernel_vm_end + PAGE_SIZE * NPTEPG, 1671 PAGE_SIZE * NPTEPG); 1672 if (kernel_vm_end - 1 >= vm_map_max(&kernel_map)) { 1673 kernel_vm_end = vm_map_max(&kernel_map); 1674 break; 1675 } 1676 continue; 1677 } 1678 1679 /* 1680 * This index is bogus, but out of the way 1681 */ 1682 nkpg = vm_page_alloc(&kptobj, nkpt, 1683 VM_ALLOC_NORMAL | 1684 VM_ALLOC_SYSTEM | 1685 VM_ALLOC_INTERRUPT); 1686 if (nkpg == NULL) 1687 panic("pmap_growkernel: no memory to grow kernel"); 1688 1689 vm_page_wire(nkpg); 1690 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1691 pmap_zero_page(ptppaddr); 1692 newpdir = (pd_entry_t)(ptppaddr | 1693 VPTE_V | VPTE_RW | VPTE_U | 1694 VPTE_A | VPTE_M | VPTE_WIRED); 1695 
		*pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir;
		atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1);
		nkpt++;

		kernel_vm_end =
			rounddown2(kernel_vm_end + PAGE_SIZE * NPTEPG,
				   PAGE_SIZE * NPTEPG);
		if (kernel_vm_end - 1 >= vm_map_max(&kernel_map)) {
			kernel_vm_end = vm_map_max(&kernel_map);
			break;
		}
	}
	vm_object_drop(&kptobj);
}

/*
 * Add a reference to the specified pmap.
 *
 * No requirements.
 */
void
pmap_reference(pmap_t pmap)
{
	if (pmap)
		atomic_add_int(&pmap->pm_count, 1);
}

/************************************************************************
 *			   VMSPACE MANAGEMENT				*
 ************************************************************************
 *
 * The VMSPACE management we do in our virtual kernel must be reflected
 * in the real kernel.  This is accomplished by making vmspace system
 * calls to the real kernel.
 */
void
cpu_vmspace_alloc(struct vmspace *vm)
{
	int r;
	void *rp;
	vpte_t vpte;

	/*
	 * If VMM is enabled, do nothing; we are able to use
	 * the real page tables.
	 */
	if (vmm_enabled)
		return;

#define USER_SIZE	(VM_MAX_USER_ADDRESS - VM_MIN_USER_ADDRESS)

	if (vmspace_create(&vm->vm_pmap, 0, NULL) < 0)
		panic("vmspace_create() failed");

	rp = vmspace_mmap(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE,
			  PROT_READ|PROT_WRITE|PROT_EXEC,
			  MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
			  MemImageFd, 0);
	if (rp == MAP_FAILED)
		panic("vmspace_mmap: failed");
	vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE,
			 MADV_NOSYNC, 0);
	vpte = VM_PAGE_TO_PHYS(vmspace_pmap(vm)->pm_pdirm) |
	       VPTE_RW | VPTE_V | VPTE_U;
	r = vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE,
			     MADV_SETMAP, vpte);
	if (r < 0)
		panic("vmspace_mcontrol: failed");
}

void
cpu_vmspace_free(struct vmspace *vm)
{
	/*
	 * If VMM is enabled, do nothing; we are able to use
	 * the real page tables.
	 */
	if (vmm_enabled)
		return;

	if (vmspace_destroy(&vm->vm_pmap) < 0)
		panic("vmspace_destroy() failed");
}

/***************************************************
 * page management routines.
 ***************************************************/

/*
 * free the pv_entry back to the free list.  This function may be
 * called from an interrupt.
 */
static __inline void
free_pv_entry(pv_entry_t pv)
{
	atomic_add_long(&pv_entry_count, -1);
	zfree(pvzone, pv);
}

/*
 * get a new pv_entry, allocating a block from the system
 * when needed.  This function may be called from an interrupt.
 */
static pv_entry_t
get_pv_entry(void)
{
	atomic_add_long(&pv_entry_count, 1);
	if (pv_entry_high_water &&
	    (pv_entry_count > pv_entry_high_water) &&
	    atomic_swap_int(&pmap_pagedaemon_waken, 1) == 0) {
		wakeup(&vm_pages_needed);
	}
	return zalloc(pvzone);
}

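/*
 * Example of the high-water mechanism above (numbers illustrative
 * only): with pv_entry_max == 100000, pmap_init2() sets
 * pv_entry_high_water to 90000; the first get_pv_entry() call that
 * pushes pv_entry_count past that mark atomically latches
 * pmap_pagedaemon_waken and issues the wakeup, which eventually
 * leads to pmap_collect() below reclaiming pv entries.
 */
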
1815 */ 1816 void 1817 pmap_collect(void) 1818 { 1819 int i; 1820 vm_page_t m; 1821 static int warningdone=0; 1822 1823 if (pmap_pagedaemon_waken == 0) 1824 return; 1825 pmap_pagedaemon_waken = 0; 1826 1827 if (warningdone < 5) { 1828 kprintf("pmap_collect: collecting pv entries -- " 1829 "suggest increasing PMAP_SHPGPERPROC\n"); 1830 warningdone++; 1831 } 1832 1833 for (i = 0; i < vm_page_array_size; i++) { 1834 m = &vm_page_array[i]; 1835 if (m->wire_count || m->hold_count) 1836 continue; 1837 if (vm_page_busy_try(m, TRUE) == 0) { 1838 if (m->wire_count == 0 && m->hold_count == 0) { 1839 pmap_remove_all(m); 1840 } 1841 vm_page_wakeup(m); 1842 } 1843 } 1844 } 1845 1846 1847 /* 1848 * If it is the first entry on the list, it is actually 1849 * in the header and we must copy the following entry up 1850 * to the header. Otherwise we must search the list for 1851 * the entry. In either case we free the now unused entry. 1852 * 1853 * pmap->pm_pteobj must be held and (m) must be spin-locked by the caller. 1854 */ 1855 static int 1856 pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va) 1857 { 1858 pv_entry_t pv; 1859 int rtval; 1860 1861 vm_page_spin_lock(m); 1862 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, va); 1863 1864 /* 1865 * Note that pv_ptem is NULL if the page table page itself is not 1866 * managed, even if the page being removed IS managed. 1867 */ 1868 rtval = 0; 1869 if (pv) { 1870 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1871 if (TAILQ_EMPTY(&m->md.pv_list)) 1872 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1873 m->md.pv_list_count--; 1874 KKASSERT(m->md.pv_list_count >= 0); 1875 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 1876 atomic_add_int(&pmap->pm_generation, 1); 1877 vm_page_spin_unlock(m); 1878 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); 1879 free_pv_entry(pv); 1880 } else { 1881 vm_page_spin_unlock(m); 1882 kprintf("pmap_remove_entry: could not find " 1883 "pmap=%p m=%p va=%016jx\n", 1884 pmap, m, va); 1885 } 1886 return rtval; 1887 } 1888 1889 /* 1890 * Create a pv entry for page at pa for (pmap, va). If the page table page 1891 * holding the VA is managed, mpte will be non-NULL. 1892 * 1893 * pmap->pm_pteobj must be held and (m) must be spin-locked by the caller. 1894 */ 1895 static void 1896 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m, 1897 pv_entry_t pv) 1898 { 1899 pv->pv_va = va; 1900 pv->pv_pmap = pmap; 1901 pv->pv_ptem = mpte; 1902 1903 m->md.pv_list_count++; 1904 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1905 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pv); 1906 vm_page_flag_set(m, PG_MAPPED); 1907 KKASSERT(pv == NULL); 1908 } 1909 1910 /* 1911 * pmap_remove_pte: do the things to unmap a page in a process 1912 * 1913 * Caller holds pmap->pm_pteobj and holds the associated page table 1914 * page busy to prevent races. 1915 */ 1916 static int 1917 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, pt_entry_t oldpte, 1918 vm_offset_t va) 1919 { 1920 vm_page_t m; 1921 int error; 1922 1923 if (ptq) 1924 oldpte = pmap_inval_loadandclear(ptq, pmap, va); 1925 1926 if (oldpte & VPTE_WIRED) 1927 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1928 KKASSERT(pmap->pm_stats.wired_count >= 0); 1929 1930 #if 0 1931 /* 1932 * Machines that don't support invlpg, also don't support 1933 * PG_G. XXX PG_G is disabled for SMP so don't worry about 1934 * the SMP case. 
1935 */ 1936 if (oldpte & PG_G) 1937 cpu_invlpg((void *)va); 1938 #endif 1939 KKASSERT(pmap->pm_stats.resident_count > 0); 1940 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1941 if (oldpte & VPTE_MANAGED) { 1942 m = PHYS_TO_VM_PAGE(oldpte); 1943 1944 /* 1945 * NOTE: pmap_remove_entry() will spin-lock the page 1946 */ 1947 if (oldpte & VPTE_M) { 1948 #if defined(PMAP_DIAGNOSTIC) 1949 if (pmap_nw_modified(oldpte)) { 1950 kprintf("pmap_remove: modified page not " 1951 "writable: va: 0x%lx, pte: 0x%lx\n", 1952 va, oldpte); 1953 } 1954 #endif 1955 pmap_track_modified(pmap, va); 1956 vm_page_dirty(m); 1957 } 1958 if (oldpte & VPTE_A) 1959 vm_page_flag_set(m, PG_REFERENCED); 1960 error = pmap_remove_entry(pmap, m, va); 1961 } else { 1962 error = pmap_unuse_pt(pmap, va, NULL); 1963 } 1964 return error; 1965 } 1966 1967 /* 1968 * pmap_remove_page: 1969 * 1970 * Remove a single page from a process address space. 1971 * 1972 * This function may not be called from an interrupt if the pmap is 1973 * not kernel_pmap. 1974 * 1975 * Caller holds pmap->pm_pteobj 1976 */ 1977 static void 1978 pmap_remove_page(struct pmap *pmap, vm_offset_t va) 1979 { 1980 pt_entry_t *pte; 1981 1982 pte = pmap_pte(pmap, va); 1983 if (pte == NULL) 1984 return; 1985 if ((*pte & VPTE_V) == 0) 1986 return; 1987 pmap_remove_pte(pmap, pte, 0, va); 1988 } 1989 1990 /* 1991 * Remove the given range of addresses from the specified map. 1992 * 1993 * It is assumed that the start and end are properly rounded to 1994 * the page size. 1995 * 1996 * This function may not be called from an interrupt if the pmap is 1997 * not kernel_pmap. 1998 * 1999 * No requirements. 2000 */ 2001 void 2002 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 2003 { 2004 vm_offset_t va_next; 2005 pml4_entry_t *pml4e; 2006 pdp_entry_t *pdpe; 2007 pd_entry_t ptpaddr, *pde; 2008 pt_entry_t *pte; 2009 vm_page_t pt_m; 2010 2011 if (pmap == NULL) 2012 return; 2013 2014 vm_object_hold(pmap->pm_pteobj); 2015 KKASSERT(pmap->pm_stats.resident_count >= 0); 2016 if (pmap->pm_stats.resident_count == 0) { 2017 vm_object_drop(pmap->pm_pteobj); 2018 return; 2019 } 2020 2021 /* 2022 * special handling of removing one page. a very 2023 * common operation and easy to short circuit some 2024 * code. 2025 */ 2026 if (sva + PAGE_SIZE == eva) { 2027 pde = pmap_pde(pmap, sva); 2028 if (pde && (*pde & VPTE_PS) == 0) { 2029 pmap_remove_page(pmap, sva); 2030 vm_object_drop(pmap->pm_pteobj); 2031 return; 2032 } 2033 } 2034 2035 for (; sva < eva; sva = va_next) { 2036 pml4e = pmap_pml4e(pmap, sva); 2037 if ((*pml4e & VPTE_V) == 0) { 2038 va_next = (sva + NBPML4) & ~PML4MASK; 2039 if (va_next < sva) 2040 va_next = eva; 2041 continue; 2042 } 2043 2044 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2045 if ((*pdpe & VPTE_V) == 0) { 2046 va_next = (sva + NBPDP) & ~PDPMASK; 2047 if (va_next < sva) 2048 va_next = eva; 2049 continue; 2050 } 2051 2052 /* 2053 * Calculate index for next page table. 2054 */ 2055 va_next = (sva + NBPDR) & ~PDRMASK; 2056 if (va_next < sva) 2057 va_next = eva; 2058 2059 pde = pmap_pdpe_to_pde(pdpe, sva); 2060 ptpaddr = *pde; 2061 2062 /* 2063 * Weed out invalid mappings. 2064 */ 2065 if (ptpaddr == 0) 2066 continue; 2067 2068 /* 2069 * Check for large page. 
/*
 * Remove the given range of addresses from the specified map.
 *
 * It is assumed that the start and end are properly rounded to
 * the page size.
 *
 * This function may not be called from an interrupt if the pmap is
 * not kernel_pmap.
 *
 * No requirements.
 */
void
pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
{
        vm_offset_t va_next;
        pml4_entry_t *pml4e;
        pdp_entry_t *pdpe;
        pd_entry_t ptpaddr, *pde;
        pt_entry_t *pte;
        vm_page_t pt_m;

        if (pmap == NULL)
                return;

        vm_object_hold(pmap->pm_pteobj);
        KKASSERT(pmap->pm_stats.resident_count >= 0);
        if (pmap->pm_stats.resident_count == 0) {
                vm_object_drop(pmap->pm_pteobj);
                return;
        }

        /*
         * Special handling for removing a single page: a very common
         * operation that lets us short-circuit some code.
         */
        if (sva + PAGE_SIZE == eva) {
                pde = pmap_pde(pmap, sva);
                if (pde && (*pde & VPTE_PS) == 0) {
                        pmap_remove_page(pmap, sva);
                        vm_object_drop(pmap->pm_pteobj);
                        return;
                }
        }

        for (; sva < eva; sva = va_next) {
                pml4e = pmap_pml4e(pmap, sva);
                if ((*pml4e & VPTE_V) == 0) {
                        va_next = (sva + NBPML4) & ~PML4MASK;
                        if (va_next < sva)
                                va_next = eva;
                        continue;
                }

                pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
                if ((*pdpe & VPTE_V) == 0) {
                        va_next = (sva + NBPDP) & ~PDPMASK;
                        if (va_next < sva)
                                va_next = eva;
                        continue;
                }

                /*
                 * Calculate index for next page table.
                 */
                va_next = (sva + NBPDR) & ~PDRMASK;
                if (va_next < sva)
                        va_next = eva;

                pde = pmap_pdpe_to_pde(pdpe, sva);
                ptpaddr = *pde;

                /*
                 * Weed out invalid mappings.
                 */
                if (ptpaddr == 0)
                        continue;

                /*
                 * Check for large page.
                 */
                if ((ptpaddr & VPTE_PS) != 0) {
                        /* JG FreeBSD has more complex treatment here */
                        KKASSERT(*pde != 0);
                        pmap_inval_pde(pde, pmap, sva);
                        atomic_add_long(&pmap->pm_stats.resident_count,
                                        -NBPDR / PAGE_SIZE);
                        continue;
                }

                /*
                 * Limit our scan to either the end of the va represented
                 * by the current page table page, or to the end of the
                 * range being removed.
                 */
                if (va_next > eva)
                        va_next = eva;

                /*
                 * NOTE: pmap_remove_pte() can block.
                 */
                pt_m = pmap_hold_pt_page(pde, sva);
                for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
                     sva += PAGE_SIZE) {
                        if (*pte) {
                                if (pmap_remove_pte(pmap, pte, 0, sva))
                                        break;
                        }
                }
                vm_page_unhold(pt_m);
        }
        vm_object_drop(pmap->pm_pteobj);
}
/*
 * Removes this physical page from all physical maps in which it resides.
 * Reflects back modify bits to the pager.
 *
 * This routine may not be called from an interrupt.
 *
 * No requirements.
 */
static void
pmap_remove_all(vm_page_t m)
{
        pt_entry_t *pte, tpte;
        pv_entry_t pv;
        vm_object_t pmobj;
        pmap_t pmap;

#if defined(PMAP_DIAGNOSTIC)
        /*
         * XXX this makes pmap_page_protect(NONE) illegal for non-managed
         * pages!
         */
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
                panic("pmap_page_protect: illegal for unmanaged page, "
                      "va: 0x%08llx", (long long)VM_PAGE_TO_PHYS(m));
        }
#endif

restart:
        vm_page_spin_lock(m);
        while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
                pmap = pv->pv_pmap;
                pmobj = pmap->pm_pteobj;

                /*
                 * Handle reversed lock ordering
                 */
                if (vm_object_hold_try(pmobj) == 0) {
                        refcount_acquire(&pmobj->hold_count);
                        vm_page_spin_unlock(m);
                        vm_object_lock(pmobj);
                        vm_page_spin_lock(m);
                        if (pv != TAILQ_FIRST(&m->md.pv_list) ||
                            pmap != pv->pv_pmap ||
                            pmobj != pmap->pm_pteobj) {
                                vm_page_spin_unlock(m);
                                vm_object_drop(pmobj);
                                goto restart;
                        }
                }

                KKASSERT(pmap->pm_stats.resident_count > 0);
                atomic_add_long(&pmap->pm_stats.resident_count, -1);

                pte = pmap_pte(pmap, pv->pv_va);
                KKASSERT(pte != NULL);

                tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va);
                if (tpte & VPTE_WIRED)
                        atomic_add_long(&pmap->pm_stats.wired_count, -1);
                KKASSERT(pmap->pm_stats.wired_count >= 0);

                if (tpte & VPTE_A)
                        vm_page_flag_set(m, PG_REFERENCED);

                /*
                 * Update the vm_page_t clean and reference bits.
                 */
                if (tpte & VPTE_M) {
#if defined(PMAP_DIAGNOSTIC)
                        if (pmap_nw_modified(tpte)) {
                                kprintf("pmap_remove_all: modified page "
                                        "not writable: va: 0x%lx, pte: "
                                        "0x%lx\n", pv->pv_va, tpte);
                        }
#endif
                        pmap_track_modified(pmap, pv->pv_va);
                        vm_page_dirty(m);
                }
                TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
                if (TAILQ_EMPTY(&m->md.pv_list))
                        vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                m->md.pv_list_count--;
                KKASSERT(m->md.pv_list_count >= 0);
                pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
                atomic_add_int(&pmap->pm_generation, 1);
                vm_page_spin_unlock(m);
                pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
                free_pv_entry(pv);

                vm_object_drop(pmobj);
                vm_page_spin_lock(m);
        }
        KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
        vm_page_spin_unlock(m);
}

/*
 * Removes the page from a particular pmap
 */
void
pmap_remove_specific(pmap_t pmap, vm_page_t m)
{
        pt_entry_t *pte, tpte;
        pv_entry_t pv;

        vm_object_hold(pmap->pm_pteobj);
again:
        vm_page_spin_lock(m);
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                if (pv->pv_pmap != pmap)
                        continue;

                KKASSERT(pmap->pm_stats.resident_count > 0);
                atomic_add_long(&pmap->pm_stats.resident_count, -1);

                pte = pmap_pte(pmap, pv->pv_va);
                KKASSERT(pte != NULL);

                tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va);
                if (tpte & VPTE_WIRED)
                        atomic_add_long(&pmap->pm_stats.wired_count, -1);
                KKASSERT(pmap->pm_stats.wired_count >= 0);

                if (tpte & VPTE_A)
                        vm_page_flag_set(m, PG_REFERENCED);

                /*
                 * Update the vm_page_t clean and reference bits.
                 */
                if (tpte & VPTE_M) {
                        pmap_track_modified(pmap, pv->pv_va);
                        vm_page_dirty(m);
                }
                TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
                pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
                atomic_add_int(&pmap->pm_generation, 1);
                m->md.pv_list_count--;
                KKASSERT(m->md.pv_list_count >= 0);
                if (TAILQ_EMPTY(&m->md.pv_list))
                        vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
                vm_page_spin_unlock(m);
                free_pv_entry(pv);
                goto again;
        }
        vm_page_spin_unlock(m);
        vm_object_drop(pmap->pm_pteobj);
}
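
/*
 * Illustrative sketch (not compiled in) of the reversed-lock-ordering
 * recovery used by pmap_remove_all() above: when the pmap's pm_pteobj
 * cannot be acquired while the page spin-lock is held, the spin-lock is
 * dropped, the object is acquired the hard way, and the pv state is
 * re-validated before proceeding, restarting from scratch if anything
 * changed while the locks were juggled.
 */
#if 0
        if (vm_object_hold_try(pmobj) == 0) {
                refcount_acquire(&pmobj->hold_count);   /* keep obj alive */
                vm_page_spin_unlock(m);
                vm_object_lock(pmobj);                  /* blocking acquire */
                vm_page_spin_lock(m);
                if (pv != TAILQ_FIRST(&m->md.pv_list) ||  /* state stale? */
                    pmap != pv->pv_pmap ||
                    pmobj != pmap->pm_pteobj) {
                        vm_page_spin_unlock(m);
                        vm_object_drop(pmobj);
                        goto restart;
                }
        }
#endif
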
/*
 * Set the physical protection on the specified range of this map
 * as requested.
 *
 * This function may not be called from an interrupt if the map is
 * not the kernel_pmap.
 *
 * No requirements.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
        vm_offset_t va_next;
        pml4_entry_t *pml4e;
        pdp_entry_t *pdpe;
        pd_entry_t ptpaddr, *pde;
        pt_entry_t *pte;
        vm_page_t pt_m;

        if (pmap == NULL)
                return;

        if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) {
                pmap_remove(pmap, sva, eva);
                return;
        }

        if (prot & VM_PROT_WRITE)
                return;

        vm_object_hold(pmap->pm_pteobj);

        for (; sva < eva; sva = va_next) {
                pml4e = pmap_pml4e(pmap, sva);
                if ((*pml4e & VPTE_V) == 0) {
                        va_next = (sva + NBPML4) & ~PML4MASK;
                        if (va_next < sva)
                                va_next = eva;
                        continue;
                }

                pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
                if ((*pdpe & VPTE_V) == 0) {
                        va_next = (sva + NBPDP) & ~PDPMASK;
                        if (va_next < sva)
                                va_next = eva;
                        continue;
                }

                va_next = (sva + NBPDR) & ~PDRMASK;
                if (va_next < sva)
                        va_next = eva;

                pde = pmap_pdpe_to_pde(pdpe, sva);
                ptpaddr = *pde;

#if 0
                /*
                 * Check for large page.
                 */
                if ((ptpaddr & VPTE_PS) != 0) {
                        /* JG correct? */
                        pmap_clean_pde(pde, pmap, sva);
                        atomic_add_long(&pmap->pm_stats.resident_count,
                                        -NBPDR / PAGE_SIZE);
                        continue;
                }
#endif

                /*
                 * Weed out invalid mappings.  Note: we assume that the page
                 * directory table is always allocated, and in kernel virtual.
                 */
                if (ptpaddr == 0)
                        continue;

                if (va_next > eva)
                        va_next = eva;

                pt_m = pmap_hold_pt_page(pde, sva);
                for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
                     sva += PAGE_SIZE) {
                        /*
                         * Clean managed pages and also check the accessed
                         * bit.  Just remove write perms for unmanaged
                         * pages.  Be careful of races, turning off write
                         * access will force a fault rather than setting
                         * the modified bit at an unexpected time.
                         */
                        pmap_track_modified(pmap, sva);
                        pmap_clean_pte(pte, pmap, sva, NULL);
                }
                vm_page_unhold(pt_m);
        }
        vm_object_drop(pmap->pm_pteobj);
}
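
/*
 * Illustrative sketch (not compiled in): typical pmap_protect() usage.
 * Per the logic above, requesting write access is a no-op (writes are
 * enabled lazily via vm_fault), and removing both read and execute
 * degenerates into pmap_remove().  The range variables are hypothetical.
 */
#if 0
        /* downgrade to read-only; the next write will fault */
        pmap_protect(pmap, sva, eva, VM_PROT_READ);

        /* no-op: write permission is never granted here */
        pmap_protect(pmap, sva, eva, VM_PROT_READ | VM_PROT_WRITE);

        /* equivalent to pmap_remove(pmap, sva, eva) */
        pmap_protect(pmap, sva, eva, VM_PROT_NONE);
#endif
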
/*
 * Enter a managed page into a pmap.  If the page is not wired, related
 * pmap data can be destroyed at any time for later demand-operation.
 *
 * Insert the vm_page (m) at virtual address (va) in (pmap), with the
 * specified protection, and wire the mapping if requested.
 *
 * NOTE: This routine may not lazy-evaluate or lose information.  The
 *       page must actually be inserted into the given map NOW.
 *
 * NOTE: When entering a page at a KVA address, the pmap must be the
 *       kernel_pmap.
 *
 * No requirements.
 */
void
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
           boolean_t wired, vm_map_entry_t entry __unused)
{
        vm_paddr_t pa;
        pv_entry_t pv;
        pt_entry_t *pte;
        pt_entry_t origpte, newpte;
        vm_paddr_t opa;
        vm_page_t mpte;

        if (pmap == NULL)
                return;

        va = trunc_page(va);

        vm_object_hold(pmap->pm_pteobj);

        /*
         * Get the page table page.  The kernel_pmap's page table pages
         * are preallocated and have no associated vm_page_t.
         *
         * If not NULL, mpte will be busied and we must vm_page_wakeup()
         * to cleanup.  There will already be at least one wire count from
         * it being mapped into its parent.
         */
        if (pmap == &kernel_pmap) {
                mpte = NULL;
                pte = vtopte(va);
        } else {
                mpte = pmap_allocpte(pmap, va);
                pte = (void *)PHYS_TO_DMAP(mpte->phys_addr);
                pte += pmap_pte_index(va);
        }

        /*
         * Deal with races against the kernel's real MMU by cleaning the
         * page, even if we are re-entering the same page.
         */
        pa = VM_PAGE_TO_PHYS(m);
        origpte = pmap_inval_loadandclear(pte, pmap, va);
        /*origpte = pmap_clean_pte(pte, pmap, va, NULL);*/
        opa = origpte & VPTE_FRAME;

        if (origpte & VPTE_PS)
                panic("pmap_enter: attempted pmap_enter on 2MB page");

        if ((origpte & (VPTE_MANAGED|VPTE_M)) == (VPTE_MANAGED|VPTE_M)) {
                vm_page_t om;

                pmap_track_modified(pmap, va);
                om = PHYS_TO_VM_PAGE(opa);
                vm_page_dirty(om);
        }

        /*
         * Mapping has not changed, must be protection or wiring change.
         */
        if (origpte && (opa == pa)) {
                /*
                 * Wiring change, just update stats.  We don't worry about
                 * wiring PT pages as they remain resident as long as there
                 * are valid mappings in them.  Hence, if a user page is
                 * wired, the PT page will be also.
                 */
                if (wired && ((origpte & VPTE_WIRED) == 0))
                        atomic_add_long(&pmap->pm_stats.wired_count, 1);
                else if (!wired && (origpte & VPTE_WIRED))
                        atomic_add_long(&pmap->pm_stats.wired_count, -1);

                if (origpte & VPTE_MANAGED) {
                        pa |= VPTE_MANAGED;
                        KKASSERT(m->flags & PG_MAPPED);
                        KKASSERT((m->flags & PG_FICTITIOUS) == 0);
                } else {
                        KKASSERT((m->flags & PG_FICTITIOUS));
                }
                vm_page_spin_lock(m);
                goto validate;
        }

        /*
         * Bump the wire_count for the page table page.
         */
        if (mpte)
                vm_page_wire_quick(mpte);

        /*
         * Mapping has changed, invalidate old range and fall through to
         * handle validating new mapping.  Don't inherit anything from
         * oldpte.
         */
        if (opa) {
                int err;
                err = pmap_remove_pte(pmap, NULL, origpte, va);
                origpte = 0;
                if (err)
                        panic("pmap_enter: pte vanished, va: 0x%lx", va);
        }

        /*
         * Enter on the PV list if part of our managed memory.  Note that we
         * raise IPL while manipulating pv_table since pmap_enter can be
         * called at interrupt time.
         */
        if (pmap_initialized) {
                if ((m->flags & PG_FICTITIOUS) == 0) {
                        /*
                         * WARNING!  We are using m's spin-lock as a
                         *           poor man's pte lock to interlock
                         *           against pmap_page_protect() operations.
                         *
                         *           This is a bad hack (obviously).
                         */
                        pv = get_pv_entry();
                        vm_page_spin_lock(m);
                        pmap_insert_entry(pmap, va, mpte, m, pv);
                        pa |= VPTE_MANAGED;
                        /* vm_page_spin_unlock(m); */
                } else {
                        vm_page_spin_lock(m);
                }
        } else {
                vm_page_spin_lock(m);
        }

        /*
         * Increment counters
         */
        atomic_add_long(&pmap->pm_stats.resident_count, 1);
        if (wired)
                atomic_add_long(&pmap->pm_stats.wired_count, 1);

validate:
        /*
         * Now validate mapping with desired protection/wiring.
         */
        newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | VPTE_V | VPTE_U);
        newpte |= VPTE_A;

        if (wired)
                newpte |= VPTE_WIRED;
        /* if (pmap != &kernel_pmap) */
        newpte |= VPTE_U;
        if (newpte & VPTE_RW)
                vm_page_flag_set(m, PG_WRITEABLE);
        KKASSERT((newpte & VPTE_MANAGED) == 0 || (m->flags & PG_MAPPED));

        origpte = atomic_swap_long(pte, newpte);
        if (origpte & VPTE_M) {
                kprintf("pmap [M] race @ %016jx\n", va);
                atomic_set_long(pte, VPTE_M);
        }
        vm_page_spin_unlock(m);

        if (mpte)
                vm_page_wakeup(mpte);
        vm_object_drop(pmap->pm_pteobj);
}
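
/*
 * Illustrative sketch (not compiled in): entering mappings via
 * pmap_enter() above.  For the kernel_pmap the page table pages are
 * preallocated, so no page table vm_page_t is busied; for a user pmap
 * pmap_allocpte() may block.  (kva), (uva), (upmap) and (m) are
 * hypothetical.
 */
#if 0
        /* wired kernel mapping */
        pmap_enter(&kernel_pmap, kva, m, VM_PROT_READ | VM_PROT_WRITE,
                   TRUE, NULL);

        /* unwired, read-only user mapping; write access comes via a fault */
        pmap_enter(upmap, uva, m, VM_PROT_READ, FALSE, NULL);
#endif
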
/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 *
 * The caller is responsible for calling smp_invltlb().
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, long i)
{
        pmap_kenter_quick(crashdumpmap + (i * PAGE_SIZE), pa);
        return ((void *)crashdumpmap);
}

#define MAX_INIT_PT (96)

/*
 * This routine preloads the ptes for a given object into the specified pmap.
 * This eliminates the blast of soft faults on process startup and
 * immediately after an mmap.
 *
 * No requirements.
 */
static int pmap_object_init_pt_callback(vm_page_t p, void *data);

void
pmap_object_init_pt(pmap_t pmap, vm_map_entry_t entry,
                    vm_offset_t addr, vm_size_t size, int limit)
{
        vm_prot_t prot = entry->protection;
        vm_object_t object = entry->ba.object;
        vm_pindex_t pindex = atop(entry->ba.offset + (addr - entry->ba.start));
        struct rb_vm_page_scan_info info;
        struct lwp *lp;
        vm_size_t psize;

        /*
         * We can't preinit if read access isn't set or there is no pmap
         * or object.
         */
        if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
                return;

        /*
         * We can't preinit if the pmap is not the current pmap
         */
        lp = curthread->td_lwp;
        if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
                return;

        /*
         * Misc additional checks
         */
        psize = x86_64_btop(size);

        if ((object->type != OBJT_VNODE) ||
            ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
             (object->resident_page_count > MAX_INIT_PT))) {
                return;
        }

        if (psize + pindex > object->size) {
                if (object->size < pindex)
                        return;
                psize = object->size - pindex;
        }

        if (psize == 0)
                return;

        /*
         * Use a red-black scan to traverse the requested range and load
         * any valid pages found into the pmap.
         *
         * We cannot safely scan the object's memq unless we are in a
         * critical section since interrupts can remove pages from objects.
         */
        info.start_pindex = pindex;
        info.end_pindex = pindex + psize - 1;
        info.limit = limit;
        info.mpte = NULL;
        info.addr = addr;
        info.pmap = pmap;
        info.entry = entry;

        vm_object_hold_shared(object);
        vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
                                pmap_object_init_pt_callback, &info);
        vm_object_drop(object);
}
static
int
pmap_object_init_pt_callback(vm_page_t p, void *data)
{
        struct rb_vm_page_scan_info *info = data;
        vm_pindex_t rel_index;

        /*
         * don't allow an madvise to blow away our really
         * free pages allocating pv entries.
         */
        if ((info->limit & MAP_PREFAULT_MADVISE) &&
            vmstats.v_free_count < vmstats.v_free_reserved) {
                return(-1);
        }

        /*
         * Ignore list markers and ignore pages we cannot instantly
         * busy (while holding the object token).
         */
        if (p->flags & PG_MARKER)
                return 0;
        if (vm_page_busy_try(p, TRUE))
                return 0;
        if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
            (p->flags & PG_FICTITIOUS) == 0) {
                if ((p->queue - p->pc) == PQ_CACHE)
                        vm_page_deactivate(p);
                rel_index = p->pindex - info->start_pindex;
                pmap_enter(info->pmap, info->addr + x86_64_ptob(rel_index), p,
                           VM_PROT_READ, FALSE, info->entry);
        }
        vm_page_wakeup(p);
        return(0);
}

/*
 * Return TRUE if the pmap is in shape to trivially
 * pre-fault the specified address.
 *
 * Returns FALSE if it would be non-trivial or if a
 * pte is already loaded into the slot.
 *
 * No requirements.
 */
int
pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
{
        pt_entry_t *pte;
        pd_entry_t *pde;
        int ret;

        vm_object_hold(pmap->pm_pteobj);
        pde = pmap_pde(pmap, addr);
        if (pde == NULL || *pde == 0) {
                ret = 0;
        } else {
                pte = pmap_pde_to_pte(pde, addr);
                ret = (*pte) ? 0 : 1;
        }
        vm_object_drop(pmap->pm_pteobj);

        return (ret);
}

/*
 * Change the wiring attribute for a map/virtual-address pair.
 *
 * The mapping must already exist in the pmap.
 * No other requirements.
 */
vm_page_t
pmap_unwire(pmap_t pmap, vm_offset_t va)
{
        pt_entry_t *pte;
        vm_paddr_t pa;
        vm_page_t m;

        if (pmap == NULL)
                return NULL;

        vm_object_hold(pmap->pm_pteobj);
        pte = pmap_pte(pmap, va);

        if (pte == NULL || (*pte & VPTE_V) == 0) {
                vm_object_drop(pmap->pm_pteobj);
                return NULL;
        }

        /*
         * Wiring is not a hardware characteristic so there is no need to
         * invalidate TLB.  However, in an SMP environment we must use
         * a locked bus cycle to update the pte (if we are not using
         * the pmap_inval_*() API that is)... it's ok to do this for simple
         * wiring changes.
         */
        if (pmap_pte_w(pte))
                atomic_add_long(&pmap->pm_stats.wired_count, -1);
        /* XXX else return NULL so caller doesn't unwire m ? */
        atomic_clear_long(pte, VPTE_WIRED);

        pa = *pte & VPTE_FRAME;
        m = PHYS_TO_VM_PAGE(pa);        /* held by wired count */

        vm_object_drop(pmap->pm_pteobj);

        return m;
}
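
/*
 * Illustrative sketch (not compiled in): a caller-side use of
 * pmap_unwire() above.  The vm_page_t returned is the one backing the
 * mapping; the follow-up vm_page_unwire() call is a hypothetical
 * caller-side action, not something pmap_unwire() itself requires.
 */
#if 0
        vm_page_t m;

        m = pmap_unwire(pmap, va);
        if (m)
                vm_page_unwire(m, 1);   /* drop the caller's wire count */
#endif
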
/*
 * Copy the range specified by src_addr/len
 * from the source map to the range dst_addr/len
 * in the destination map.
 *
 * This routine is only advisory and need not do anything.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
          vm_size_t len, vm_offset_t src_addr)
{
        /*
         * XXX BUGGY.  Among other things srcmpte is assumed to remain
         * valid through blocking calls, and that's just not going to
         * be the case.
         *
         * FIXME!
         */
        return;
}

/*
 * pmap_zero_page:
 *
 * Zero the specified physical page.
 *
 * This function may be called from an interrupt and no locking is
 * required.
 */
void
pmap_zero_page(vm_paddr_t phys)
{
        vm_offset_t va = PHYS_TO_DMAP(phys);

        bzero((void *)va, PAGE_SIZE);
}

/*
 * pmap_zero_page_area:
 *
 * Zero part of a physical page by mapping it into memory and clearing
 * its contents with bzero.
 *
 * off and size may not cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_paddr_t phys, int off, int size)
{
        vm_offset_t virt = PHYS_TO_DMAP(phys);

        bzero((char *)virt + off, size);
}

/*
 * pmap_copy_page:
 *
 * Copy the physical page from the source PA to the target PA.
 * This function may be called from an interrupt.  No locking
 * is required.
 */
void
pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
{
        vm_offset_t src_virt, dst_virt;

        src_virt = PHYS_TO_DMAP(src);
        dst_virt = PHYS_TO_DMAP(dst);
        bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
}

/*
 * pmap_copy_page_frag:
 *
 * Copy a fragment of a physical page from the source PA to the target
 * PA; the intra-page offsets are taken from the low bits of the
 * respective physical addresses.  This function may be called from an
 * interrupt.  No locking is required.
 */
void
pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
{
        vm_offset_t src_virt, dst_virt;

        src_virt = PHYS_TO_DMAP(src);
        dst_virt = PHYS_TO_DMAP(dst);
        bcopy((char *)src_virt + (src & PAGE_MASK),
              (char *)dst_virt + (dst & PAGE_MASK),
              bytes);
}
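
/*
 * Illustrative sketch (not compiled in): the helpers above lean on the
 * DMAP, which maps all physical memory at PHYS_TO_DMAP(pa), so no
 * temporary mappings or locks are needed.  (pa) is hypothetical.
 */
#if 0
        /* scrub the first half of a page */
        pmap_zero_page_area(pa, 0, PAGE_SIZE / 2);

        /* the equivalent direct access through the DMAP window */
        bzero((void *)PHYS_TO_DMAP(pa), PAGE_SIZE / 2);
#endif
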
/*
 * Remove all pages from the specified address space; this aids process
 * exit speed.  Also, this code is special-cased for the current process
 * only, but can have the more generic (and slightly slower) mode enabled.
 * This is much faster than pmap_remove in the case of running down an
 * entire address space.
 *
 * No other requirements.
 */
void
pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
        pmap_remove(pmap, sva, eva);
#if 0
        pt_entry_t *pte, tpte;
        pv_entry_t pv, npv;
        vm_page_t m;
        int save_generation;

        if (pmap->pm_pteobj)
                vm_object_hold(pmap->pm_pteobj);

        pmap_invalidate_range(pmap, sva, eva);

        for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
                if (pv->pv_va >= eva || pv->pv_va < sva) {
                        npv = TAILQ_NEXT(pv, pv_plist);
                        continue;
                }

                KKASSERT(pmap == pv->pv_pmap);

                pte = pmap_pte(pmap, pv->pv_va);

                /*
                 * We cannot remove wired pages from a process' mapping
                 * at this time
                 */
                if (*pte & VPTE_WIRED) {
                        npv = TAILQ_NEXT(pv, pv_plist);
                        continue;
                }
                tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va);

                m = PHYS_TO_VM_PAGE(tpte & VPTE_FRAME);
                vm_page_spin_lock(m);

                KASSERT(m < &vm_page_array[vm_page_array_size],
                        ("pmap_remove_pages: bad tpte %lx", tpte));

                KKASSERT(pmap->pm_stats.resident_count > 0);
                atomic_add_long(&pmap->pm_stats.resident_count, -1);

                /*
                 * Update the vm_page_t clean and reference bits.
                 */
                if (tpte & VPTE_M) {
                        vm_page_dirty(m);
                }

                npv = TAILQ_NEXT(pv, pv_plist);
                TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
                atomic_add_int(&pmap->pm_generation, 1);
                save_generation = pmap->pm_generation;
                m->md.pv_list_count--;
                TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
                if (TAILQ_EMPTY(&m->md.pv_list))
                        vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                vm_page_spin_unlock(m);

                pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
                free_pv_entry(pv);

                /*
                 * Restart the scan if we blocked during the unuse or free
                 * calls and other removals were made.
                 */
                if (save_generation != pmap->pm_generation) {
                        kprintf("Warning: pmap_remove_pages race-A avoided\n");
                        npv = TAILQ_FIRST(&pmap->pm_pvlist);
                }
        }
        if (pmap->pm_pteobj)
                vm_object_drop(pmap->pm_pteobj);
        pmap_remove(pmap, sva, eva);
#endif
}

/*
 * pmap_testbit tests bits in active mappings of a VM page.
 */
static boolean_t
pmap_testbit(vm_page_t m, int bit)
{
        pv_entry_t pv;
        pt_entry_t *pte;

        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return FALSE;

        if (TAILQ_FIRST(&m->md.pv_list) == NULL)
                return FALSE;

        vm_page_spin_lock(m);
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                /*
                 * If the bit being tested is the modified bit, then
                 * mark clean_map and ptes as never modified.
                 */
                if (bit & (VPTE_A|VPTE_M))
                        pmap_track_modified(pv->pv_pmap, pv->pv_va);

#if defined(PMAP_DIAGNOSTIC)
                if (pv->pv_pmap == NULL) {
                        kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va);
                        continue;
                }
#endif
                pte = pmap_pte(pv->pv_pmap, pv->pv_va);
                if (*pte & bit) {
                        vm_page_spin_unlock(m);
                        return TRUE;
                }
        }
        vm_page_spin_unlock(m);
        return (FALSE);
}
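
/*
 * Illustrative sketch (not compiled in): pmap_testbit() is the engine
 * behind the page-attribute queries; for example, pmap_is_modified()
 * further below is simply a VPTE_M scan over the page's pv list.  The
 * caller-side reaction shown here is hypothetical.
 */
#if 0
        if (pmap_testbit(m, VPTE_M))
                vm_page_dirty(m);       /* page was dirtied via some pmap */
#endif
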
/*
 * This routine is used to clear bits in ptes.  Certain bits require special
 * handling, in particular (on virtual kernels) the VPTE_M (modify) bit.
 *
 * This routine is only called with certain VPTE_* bit combinations.
 */
static __inline void
pmap_clearbit(vm_page_t m, int bit)
{
        pv_entry_t pv;
        pt_entry_t *pte;
        pt_entry_t pbits;
        vm_object_t pmobj;
        pmap_t pmap;

        if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
                if (bit == VPTE_RW)
                        vm_page_flag_clear(m, PG_WRITEABLE);
                return;
        }

        /*
         * Loop over all current mappings, setting/clearing as appropriate.
         * If setting RO do we need to clear the VAC?
         */
restart:
        vm_page_spin_lock(m);
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                /*
                 * Need the pmap object lock(?)
                 */
                pmap = pv->pv_pmap;
                pmobj = pmap->pm_pteobj;

                if (vm_object_hold_try(pmobj) == 0) {
                        refcount_acquire(&pmobj->hold_count);
                        vm_page_spin_unlock(m);
                        vm_object_lock(pmobj);
                        vm_object_drop(pmobj);
                        goto restart;
                }

                /*
                 * don't write protect pager mappings
                 */
                if (bit == VPTE_RW) {
                        pmap_track_modified(pv->pv_pmap, pv->pv_va);
                }

#if defined(PMAP_DIAGNOSTIC)
                if (pv->pv_pmap == NULL) {
                        kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va);
                        vm_object_drop(pmobj);
                        continue;
                }
#endif

                /*
                 * Careful here.  We can use a locked bus instruction to
                 * clear VPTE_A or VPTE_M safely but we need to synchronize
                 * with the target cpus when we mess with VPTE_RW.
                 *
                 * On virtual kernels we must force a new fault-on-write
                 * in the real kernel if we clear the Modify bit ourselves,
                 * otherwise the real kernel will not get a new fault and
                 * will never set our Modify bit again.
                 */
                pte = pmap_pte(pv->pv_pmap, pv->pv_va);
                if (*pte & bit) {
                        if (bit == VPTE_RW) {
                                /*
                                 * We must also clear VPTE_M when clearing
                                 * VPTE_RW and synchronize its state to
                                 * the page.
                                 */
                                pmap_track_modified(pv->pv_pmap, pv->pv_va);
                                pbits = pmap_clean_pte(pte, pv->pv_pmap,
                                                       pv->pv_va, m);
                        } else if (bit == VPTE_M) {
                                /*
                                 * We must invalidate the real-kernel pte
                                 * when clearing VPTE_M bit to force the
                                 * real-kernel to take a new fault to re-set
                                 * VPTE_M.
                                 */
                                atomic_clear_long(pte, VPTE_M);
                                if (*pte & VPTE_RW) {
                                        pmap_invalidate_range(pv->pv_pmap,
                                                pv->pv_va,
                                                pv->pv_va + PAGE_SIZE);
                                }
                        } else if ((bit & (VPTE_RW|VPTE_M)) ==
                                   (VPTE_RW|VPTE_M)) {
                                /*
                                 * We've been asked to clear W & M, I guess
                                 * the caller doesn't want us to update
                                 * the dirty status of the VM page.
                                 */
                                pmap_track_modified(pv->pv_pmap, pv->pv_va);
                                pmap_clean_pte(pte, pv->pv_pmap, pv->pv_va, m);
                                panic("shouldn't be called");
                        } else {
                                /*
                                 * We've been asked to clear bits that do
                                 * not interact with hardware.
                                 */
                                atomic_clear_long(pte, bit);
                        }
                }
                vm_object_drop(pmobj);
        }
        if (bit == VPTE_RW)
                vm_page_flag_clear(m, PG_WRITEABLE);
        vm_page_spin_unlock(m);
}

/*
 * Lower the permission for all mappings to a given page.
 *
 * No other requirements.
 */
void
pmap_page_protect(vm_page_t m, vm_prot_t prot)
{
        if ((prot & VM_PROT_WRITE) == 0) {
                if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
                        pmap_clearbit(m, VPTE_RW);
                } else {
                        pmap_remove_all(m);
                }
        }
}

vm_paddr_t
pmap_phys_address(vm_pindex_t ppn)
{
        return (x86_64_ptob(ppn));
}
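
/*
 * Illustrative sketch (not compiled in): how pmap_page_protect() above
 * maps the requested protection onto the primitives in this file.
 */
#if 0
        pmap_page_protect(m, VM_PROT_READ);     /* clear VPTE_RW everywhere */
        pmap_page_protect(m, VM_PROT_NONE);     /* pmap_remove_all(m) */
        pmap_page_protect(m, VM_PROT_ALL);      /* no-op, write retained */
#endif
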
/*
 * Return a count of reference bits for a page, clearing those bits.
 * It is not necessary for every reference bit to be cleared, but it
 * is necessary that 0 only be returned when there are truly no
 * reference bits set.
 *
 * XXX: The exact number of bits to check and clear is a matter that
 * should be tested and standardized at some point in the future for
 * optimal aging of shared pages.
 *
 * No other requirements.
 */
int
pmap_ts_referenced(vm_page_t m)
{
        pv_entry_t pv, pvf, pvn;
        pt_entry_t *pte;
        int rtval = 0;

        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return (rtval);

        vm_page_spin_lock(m);
        if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
                pvf = pv;
                do {
                        pvn = TAILQ_NEXT(pv, pv_list);
                        TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
                        TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);

                        pmap_track_modified(pv->pv_pmap, pv->pv_va);
                        pte = pmap_pte(pv->pv_pmap, pv->pv_va);

                        if (pte && (*pte & VPTE_A)) {
                                atomic_clear_long(pte, VPTE_A);
                                rtval++;
                                if (rtval > 4) {
                                        break;
                                }
                        }
                } while ((pv = pvn) != NULL && pv != pvf);
        }
        vm_page_spin_unlock(m);

        return (rtval);
}

/*
 * Return whether or not the specified physical page was modified
 * in any physical maps.
 *
 * No other requirements.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
        boolean_t res;

        res = pmap_testbit(m, VPTE_M);

        return (res);
}

/*
 * Clear the modify bits on the specified physical page.  For the vkernel
 * we really need to clean the page, which clears VPTE_RW and VPTE_M, in
 * order to ensure that we take a fault on the next write to the page.
 * Otherwise the page may become dirty without us knowing it.
 *
 * No other requirements.
 */
void
pmap_clear_modify(vm_page_t m)
{
        pmap_clearbit(m, VPTE_RW);
}

/*
 * Clear the reference bit on the specified physical page.
 *
 * No other requirements.
 */
void
pmap_clear_reference(vm_page_t m)
{
        pmap_clearbit(m, VPTE_A);
}

/*
 * Miscellaneous support routines follow
 */
static void
x86_64_protection_init(void)
{
        uint64_t *kp;
        int prot;

        kp = protection_codes;
        for (prot = 0; prot < 8; prot++) {
                if (prot & VM_PROT_READ)
                        *kp |= 0;               /* R */
                if (prot & VM_PROT_WRITE)
                        *kp |= VPTE_RW;         /* R+W */
                if (prot && (prot & VM_PROT_EXECUTE) == 0)
                        *kp |= VPTE_NX;         /* NX - !executable */
                ++kp;
        }
}
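
/*
 * Illustrative sketch (not compiled in): the table built above is indexed
 * by the low three VM_PROT_* bits via the pte_prot() macro.  For example,
 * read-only (non-executable) protection picks up VPTE_NX and no VPTE_RW,
 * while read/write/execute yields VPTE_RW with no VPTE_NX.
 */
#if 0
        uint64_t bits;

        bits = protection_codes[VM_PROT_READ];                  /* VPTE_NX */
        bits = protection_codes[VM_PROT_READ | VM_PROT_WRITE |
                                VM_PROT_EXECUTE];               /* VPTE_RW */
#endif
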
/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
        /* This is a vkernel, do nothing */
}

/*
 * Change the PAT attribute on an existing kernel memory map.  Caller
 * must ensure that the virtual memory in question is not accessed
 * during the adjustment.
 */
void
pmap_change_attr(vm_offset_t va, vm_size_t count, int mode)
{
        /* This is a vkernel, do nothing */
}

/*
 * Perform the pmap work for mincore
 *
 * No other requirements.
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr)
{
        pt_entry_t *ptep, pte;
        vm_page_t m;
        int val = 0;

        vm_object_hold(pmap->pm_pteobj);
        ptep = pmap_pte(pmap, addr);

        if (ptep && (pte = *ptep) != 0) {
                vm_paddr_t pa;

                val = MINCORE_INCORE;
                if ((pte & VPTE_MANAGED) == 0)
                        goto done;

                pa = pte & VPTE_FRAME;

                m = PHYS_TO_VM_PAGE(pa);

                /*
                 * Modified by us
                 */
                if (pte & VPTE_M)
                        val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
                /*
                 * Modified by someone
                 */
                else if (m->dirty || pmap_is_modified(m))
                        val |= MINCORE_MODIFIED_OTHER;
                /*
                 * Referenced by us
                 */
                if (pte & VPTE_A)
                        val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;

                /*
                 * Referenced by someone
                 */
                else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
                        val |= MINCORE_REFERENCED_OTHER;
                        vm_page_flag_set(m, PG_REFERENCED);
                }
        }
done:
        vm_object_drop(pmap->pm_pteobj);

        return val;
}

/*
 * Replace p->p_vmspace with a new one.  If adjrefs is non-zero the new
 * vmspace will be ref'd and the old one will be deref'd.
 *
 * Caller must hold vmspace->vm_map.token for oldvm and newvm
 */
void
pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
{
        struct vmspace *oldvm;
        struct lwp *lp;

        oldvm = p->p_vmspace;
        if (oldvm != newvm) {
                if (adjrefs)
                        vmspace_ref(newvm);
                KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0);
                p->p_vmspace = newvm;
                KKASSERT(p->p_nthreads == 1);
                lp = RB_ROOT(&p->p_lwp_tree);
                pmap_setlwpvm(lp, newvm);
                if (adjrefs)
                        vmspace_rel(oldvm);
        }
}

/*
 * Set the vmspace for a LWP.  The vmspace is almost universally set the
 * same as the process vmspace, but virtual kernels need to swap out contexts
 * on a per-lwp basis.
 */
void
pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
{
        struct vmspace *oldvm;
        struct pmap *pmap;

        oldvm = lp->lwp_vmspace;
        if (oldvm != newvm) {
                crit_enter();
                KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0);
                lp->lwp_vmspace = newvm;
                if (curthread->td_lwp == lp) {
                        pmap = vmspace_pmap(newvm);
                        ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid);
                        if (pmap->pm_active_lock & CPULOCK_EXCL)
                                pmap_interlock_wait(newvm);
#if defined(SWTCH_OPTIM_STATS)
                        tlb_flush_count++;
#endif
                        pmap = vmspace_pmap(oldvm);
                        ATOMIC_CPUMASK_NANDBIT(pmap->pm_active,
                                               mycpu->gd_cpuid);
                }
                crit_exit();
        }
}
/*
 * The swtch code tried to switch in a heavy weight process whose pmap
 * is locked by another cpu.  We have to wait for the lock to clear before
 * the pmap can be used.
 */
void
pmap_interlock_wait(struct vmspace *vm)
{
        pmap_t pmap = vmspace_pmap(vm);

        if (pmap->pm_active_lock & CPULOCK_EXCL) {
                crit_enter();
                while (pmap->pm_active_lock & CPULOCK_EXCL) {
                        cpu_ccfence();
                        vkernel_yield();
                }
                crit_exit();
        }
}

vm_offset_t
pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
{
        if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
                return addr;
        }

        addr = roundup2(addr, NBPDR);
        return addr;
}

/*
 * Used by kmalloc/kfree, page already exists at va
 */
vm_page_t
pmap_kvtom(vm_offset_t va)
{
        vpte_t *ptep;

        KKASSERT(va >= KvaStart && va < KvaEnd);
        ptep = vtopte(va);
        return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
}

void
pmap_object_init(vm_object_t object)
{
        /* empty */
}

void
pmap_object_free(vm_object_t object)
{
        /* empty */
}

void
pmap_pgscan(struct pmap_pgscan_info *pginfo)
{
        pmap_t pmap = pginfo->pmap;
        vm_offset_t sva = pginfo->beg_addr;
        vm_offset_t eva = pginfo->end_addr;
        vm_offset_t va_next;
        pml4_entry_t *pml4e;
        pdp_entry_t *pdpe;
        pd_entry_t ptpaddr, *pde;
        pt_entry_t *pte;
        vm_page_t pt_m;
        int stop = 0;

        vm_object_hold(pmap->pm_pteobj);

        for (; sva < eva; sva = va_next) {
                if (stop)
                        break;

                pml4e = pmap_pml4e(pmap, sva);
                if ((*pml4e & VPTE_V) == 0) {
                        va_next = (sva + NBPML4) & ~PML4MASK;
                        if (va_next < sva)
                                va_next = eva;
                        continue;
                }

                pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
                if ((*pdpe & VPTE_V) == 0) {
                        va_next = (sva + NBPDP) & ~PDPMASK;
                        if (va_next < sva)
                                va_next = eva;
                        continue;
                }

                va_next = (sva + NBPDR) & ~PDRMASK;
                if (va_next < sva)
                        va_next = eva;

                pde = pmap_pdpe_to_pde(pdpe, sva);
                ptpaddr = *pde;

#if 0
                /*
                 * Check for large page (ignore).
                 */
                if ((ptpaddr & VPTE_PS) != 0) {
#if 0
                        pmap_clean_pde(pde, pmap, sva);
                        pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
#endif
                        continue;
                }
#endif

                /*
                 * Weed out invalid mappings.  Note: we assume that the page
                 * directory table is always allocated, and in kernel virtual.
                 */
                if (ptpaddr == 0)
                        continue;

                if (va_next > eva)
                        va_next = eva;

                pt_m = pmap_hold_pt_page(pde, sva);
                for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
                     sva += PAGE_SIZE) {
                        vm_page_t m;

                        if (stop)
                                break;
                        if ((*pte & VPTE_MANAGED) == 0)
                                continue;

                        m = PHYS_TO_VM_PAGE(*pte & VPTE_FRAME);
                        if (vm_page_busy_try(m, TRUE) == 0) {
                                if (pginfo->callback(pginfo, sva, m) < 0)
                                        stop = 1;
                        }
                }
                vm_page_unhold(pt_m);
        }
        vm_object_drop(pmap->pm_pteobj);
}

void
pmap_maybethreaded(pmap_t pmap)
{
        /* nop */
}

/*
 * Called while page is hard-busied to clear the PG_MAPPED and PG_WRITEABLE
 * flags if able.
 *
 * vkernel code is using the old pmap style so the flags should already
 * be properly set.
 */
int
pmap_mapped_sync(vm_page_t m)
{
        return (m->flags);
}