1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008-2019 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $ 45 */ 46 47 /* 48 * Manages physical address maps. 49 */ 50 51 #include "opt_msgbuf.h" 52 53 #include <sys/param.h> 54 #include <sys/systm.h> 55 #include <sys/kernel.h> 56 #include <sys/proc.h> 57 #include <sys/msgbuf.h> 58 #include <sys/vmmeter.h> 59 #include <sys/mman.h> 60 #include <sys/vmspace.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_param.h> 64 #include <sys/sysctl.h> 65 #include <sys/lock.h> 66 #include <vm/vm_kern.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_map.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_extern.h> 71 #include <vm/vm_pageout.h> 72 #include <vm/vm_pager.h> 73 #include <vm/vm_zone.h> 74 75 #include <sys/thread2.h> 76 #include <sys/spinlock2.h> 77 #include <vm/vm_page2.h> 78 79 #include <machine/cputypes.h> 80 #include <machine/md_var.h> 81 #include <machine/specialreg.h> 82 #include <machine/smp.h> 83 #include <machine/globaldata.h> 84 #include <machine/pcb.h> 85 #include <machine/pmap.h> 86 #include <machine/pmap_inval.h> 87 88 #include <ddb/ddb.h> 89 90 #include <stdio.h> 91 #include <assert.h> 92 #include <stdlib.h> 93 #include <pthread.h> 94 95 #define PMAP_KEEP_PDIRS 96 #ifndef PMAP_SHPGPERPROC 97 #define PMAP_SHPGPERPROC 1000 98 #endif 99 100 #if defined(DIAGNOSTIC) 101 #define PMAP_DIAGNOSTIC 102 #endif 103 104 #define MINPV 2048 105 106 #if !defined(PMAP_DIAGNOSTIC) 107 #define PMAP_INLINE __inline 108 #else 109 #define PMAP_INLINE 110 #endif 111 112 /* 113 * Get PDEs and PTEs for user/kernel address space 114 */ 115 static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va); 116 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 117 118 #define pmap_pde_v(pte) ((*(pd_entry_t *)pte & VPTE_V) != 0) 119 #define pmap_pte_w(pte) ((*(pt_entry_t *)pte & VPTE_WIRED) != 0) 120 #define pmap_pte_m(pte) ((*(pt_entry_t *)pte & VPTE_M) != 0) 121 #define pmap_pte_u(pte) ((*(pt_entry_t *)pte & VPTE_A) != 0) 122 #define pmap_pte_v(pte) ((*(pt_entry_t *)pte & VPTE_V) != 0) 123 124 /* 125 * Given a map and a machine independent protection code, 126 * convert to a vax protection code. 127 */ 128 #define pte_prot(m, p) \ 129 (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 130 static uint64_t protection_codes[8]; 131 132 struct pmap kernel_pmap; 133 134 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ 135 136 static struct vm_object kptobj; 137 static int nkpt; 138 139 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 140 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 141 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 142 143 extern int vmm_enabled; 144 extern void *vkernel_stack; 145 146 /* 147 * Data for the pv entry allocation mechanism 148 */ 149 static vm_zone_t pvzone; 150 static struct vm_zone pvzone_store; 151 static vm_pindex_t pv_entry_count = 0; 152 static vm_pindex_t pv_entry_max = 0; 153 static vm_pindex_t pv_entry_high_water = 0; 154 static int pmap_pagedaemon_waken = 0; 155 static struct pv_entry *pvinit; 156 157 /* 158 * All those kernel PT submaps that BSD is so fond of 159 */ 160 pt_entry_t *CMAP1 = NULL, *ptmmap; 161 caddr_t CADDR1 = NULL; 162 static pt_entry_t *msgbufmap; 163 164 uint64_t KPTphys; 165 166 static PMAP_INLINE void free_pv_entry (pv_entry_t pv); 167 static pv_entry_t get_pv_entry (void); 168 static void x86_64_protection_init (void); 169 static __inline void pmap_clearbit (vm_page_t m, int bit); 170 171 static void pmap_remove_all (vm_page_t m); 172 static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq, 173 pt_entry_t oldpte, vm_offset_t sva); 174 static void pmap_remove_page (struct pmap *pmap, vm_offset_t va); 175 static int pmap_remove_entry (struct pmap *pmap, vm_page_t m, 176 vm_offset_t va); 177 static boolean_t pmap_testbit (vm_page_t m, int bit); 178 static void pmap_insert_entry (pmap_t pmap, vm_offset_t va, 179 vm_page_t mpte, vm_page_t m, pv_entry_t); 180 181 static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va); 182 183 static int pmap_release_free_page (pmap_t pmap, vm_page_t p); 184 static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex); 185 static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex); 186 static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t); 187 188 static int 189 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 190 { 191 if (pv1->pv_va < pv2->pv_va) 192 return(-1); 193 if (pv1->pv_va > pv2->pv_va) 194 return(1); 195 return(0); 196 } 197 198 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 199 pv_entry_compare, vm_offset_t, pv_va); 200 201 static __inline vm_pindex_t 202 pmap_pt_pindex(vm_offset_t va) 203 { 204 return va >> PDRSHIFT; 205 } 206 207 static __inline vm_pindex_t 208 pmap_pte_index(vm_offset_t va) 209 { 210 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); 211 } 212 213 static __inline vm_pindex_t 214 pmap_pde_index(vm_offset_t va) 215 { 216 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 217 } 218 219 static __inline vm_pindex_t 220 pmap_pdpe_index(vm_offset_t va) 221 { 222 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 223 } 224 225 static __inline vm_pindex_t 226 pmap_pml4e_index(vm_offset_t va) 227 { 228 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 229 } 230 231 /* Return a pointer to the PML4 slot that corresponds to a VA */ 232 static __inline pml4_entry_t * 233 pmap_pml4e(pmap_t pmap, vm_offset_t va) 234 { 235 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 236 } 237 238 /* Return a pointer to the PDP slot that corresponds to a VA */ 239 static __inline pdp_entry_t * 240 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 241 { 242 pdp_entry_t *pdpe; 243 244 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & VPTE_FRAME); 245 return (&pdpe[pmap_pdpe_index(va)]); 246 } 247 248 /* Return a pointer to the PDP slot that corresponds to a VA */ 249 static __inline pdp_entry_t * 250 pmap_pdpe(pmap_t pmap, vm_offset_t va) 251 { 252 pml4_entry_t *pml4e; 253 254 pml4e = pmap_pml4e(pmap, va); 255 if ((*pml4e & VPTE_V) == 0) 256 return NULL; 257 return (pmap_pml4e_to_pdpe(pml4e, va)); 258 } 259 260 /* Return a pointer to the PD slot that corresponds to a VA */ 261 static __inline pd_entry_t * 262 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 263 { 264 pd_entry_t *pde; 265 266 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & VPTE_FRAME); 267 return (&pde[pmap_pde_index(va)]); 268 } 269 270 /* Return a pointer to the PD slot that corresponds to a VA */ 271 static __inline pd_entry_t * 272 pmap_pde(pmap_t pmap, vm_offset_t va) 273 { 274 pdp_entry_t *pdpe; 275 276 pdpe = pmap_pdpe(pmap, va); 277 if (pdpe == NULL || (*pdpe & VPTE_V) == 0) 278 return NULL; 279 return (pmap_pdpe_to_pde(pdpe, va)); 280 } 281 282 /* Return a pointer to the PT slot that corresponds to a VA */ 283 static __inline pt_entry_t * 284 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 285 { 286 pt_entry_t *pte; 287 288 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & VPTE_FRAME); 289 return (&pte[pmap_pte_index(va)]); 290 } 291 292 /* 293 * Hold pt_m for page table scans to prevent it from getting reused out 294 * from under us across blocking conditions in the body of the loop. 295 */ 296 static __inline 297 vm_page_t 298 pmap_hold_pt_page(pd_entry_t *pde, vm_offset_t va) 299 { 300 pt_entry_t pte; 301 vm_page_t pt_m; 302 303 pte = (pt_entry_t)*pde; 304 KKASSERT(pte != 0); 305 pt_m = PHYS_TO_VM_PAGE(pte & VPTE_FRAME); 306 vm_page_hold(pt_m); 307 308 return pt_m; 309 } 310 311 /* Return a pointer to the PT slot that corresponds to a VA */ 312 static __inline pt_entry_t * 313 pmap_pte(pmap_t pmap, vm_offset_t va) 314 { 315 pd_entry_t *pde; 316 317 pde = pmap_pde(pmap, va); 318 if (pde == NULL || (*pde & VPTE_V) == 0) 319 return NULL; 320 if ((*pde & VPTE_PS) != 0) /* compat with x86 pmap_pte() */ 321 return ((pt_entry_t *)pde); 322 return (pmap_pde_to_pte(pde, va)); 323 } 324 325 static PMAP_INLINE pt_entry_t * 326 vtopte(vm_offset_t va) 327 { 328 pt_entry_t *x; 329 x = pmap_pte(&kernel_pmap, va); 330 assert(x != NULL); 331 return x; 332 } 333 334 static __inline pd_entry_t * 335 vtopde(vm_offset_t va) 336 { 337 pd_entry_t *x; 338 x = pmap_pde(&kernel_pmap, va); 339 assert(x != NULL); 340 return x; 341 } 342 343 /* 344 * Returns the physical address translation from va for a user address. 345 * (vm_paddr_t)-1 is returned on failure. 346 */ 347 vm_paddr_t 348 uservtophys(vm_offset_t va) 349 { 350 struct vmspace *vm = curproc->p_vmspace; 351 vm_page_t m; 352 vm_paddr_t pa; 353 int error; 354 int busy; 355 356 /* XXX No idea how to handle this case in a simple way, just abort */ 357 if (PAGE_SIZE - (va & PAGE_MASK) < sizeof(u_int)) 358 return ((vm_paddr_t)-1); 359 360 m = vm_fault_page(&vm->vm_map, trunc_page(va), 361 VM_PROT_READ|VM_PROT_WRITE, 362 VM_FAULT_NORMAL, 363 &error, &busy); 364 if (error) 365 return ((vm_paddr_t)-1); 366 367 pa = VM_PAGE_TO_PHYS(m) | (va & PAGE_MASK); 368 if (busy) 369 vm_page_wakeup(m); 370 else 371 vm_page_unhold(m); 372 373 return pa; 374 } 375 376 static uint64_t 377 allocpages(vm_paddr_t *firstaddr, int n) 378 { 379 uint64_t ret; 380 381 ret = *firstaddr; 382 /*bzero((void *)ret, n * PAGE_SIZE); not mapped yet */ 383 *firstaddr += n * PAGE_SIZE; 384 return (ret); 385 } 386 387 static void 388 create_dmap_vmm(vm_paddr_t *firstaddr) 389 { 390 void *stack_addr; 391 int pml4_stack_index; 392 int pdp_stack_index; 393 int pd_stack_index; 394 long i,j; 395 int regs[4]; 396 int amd_feature; 397 398 uint64_t KPDP_DMAP_phys = allocpages(firstaddr, NDMPML4E); 399 uint64_t KPDP_VSTACK_phys = allocpages(firstaddr, 1); 400 uint64_t KPD_VSTACK_phys = allocpages(firstaddr, 1); 401 402 pml4_entry_t *KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 403 pdp_entry_t *KPDP_DMAP_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_DMAP_phys); 404 pdp_entry_t *KPDP_VSTACK_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_VSTACK_phys); 405 pd_entry_t *KPD_VSTACK_virt = (pd_entry_t *)PHYS_TO_DMAP(KPD_VSTACK_phys); 406 407 bzero(KPDP_DMAP_virt, NDMPML4E * PAGE_SIZE); 408 bzero(KPDP_VSTACK_virt, 1 * PAGE_SIZE); 409 bzero(KPD_VSTACK_virt, 1 * PAGE_SIZE); 410 411 do_cpuid(0x80000001, regs); 412 amd_feature = regs[3]; 413 414 /* Build the mappings for the first 512GB */ 415 if (amd_feature & AMDID_PAGE1GB) { 416 /* In pages of 1 GB, if supported */ 417 for (i = 0; i < NPDPEPG; i++) { 418 KPDP_DMAP_virt[i] = ((uint64_t)i << PDPSHIFT); 419 KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_PS | VPTE_U; 420 } 421 } else { 422 /* In page of 2MB, otherwise */ 423 for (i = 0; i < NPDPEPG; i++) { 424 uint64_t KPD_DMAP_phys; 425 pd_entry_t *KPD_DMAP_virt; 426 427 KPD_DMAP_phys = allocpages(firstaddr, 1); 428 KPD_DMAP_virt = 429 (pd_entry_t *)PHYS_TO_DMAP(KPD_DMAP_phys); 430 431 bzero(KPD_DMAP_virt, PAGE_SIZE); 432 433 KPDP_DMAP_virt[i] = KPD_DMAP_phys; 434 KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_U; 435 436 /* For each PD, we have to allocate NPTEPG PT */ 437 for (j = 0; j < NPTEPG; j++) { 438 KPD_DMAP_virt[j] = (i << PDPSHIFT) | 439 (j << PDRSHIFT); 440 KPD_DMAP_virt[j] |= VPTE_RW | VPTE_V | 441 VPTE_PS | VPTE_U; 442 } 443 } 444 } 445 446 /* DMAP for the first 512G */ 447 KPML4virt[0] = KPDP_DMAP_phys; 448 KPML4virt[0] |= VPTE_RW | VPTE_V | VPTE_U; 449 450 /* create a 2 MB map of the new stack */ 451 pml4_stack_index = (uint64_t)&stack_addr >> PML4SHIFT; 452 KPML4virt[pml4_stack_index] = KPDP_VSTACK_phys; 453 KPML4virt[pml4_stack_index] |= VPTE_RW | VPTE_V | VPTE_U; 454 455 pdp_stack_index = ((uint64_t)&stack_addr & PML4MASK) >> PDPSHIFT; 456 KPDP_VSTACK_virt[pdp_stack_index] = KPD_VSTACK_phys; 457 KPDP_VSTACK_virt[pdp_stack_index] |= VPTE_RW | VPTE_V | VPTE_U; 458 459 pd_stack_index = ((uint64_t)&stack_addr & PDPMASK) >> PDRSHIFT; 460 KPD_VSTACK_virt[pd_stack_index] = (uint64_t) vkernel_stack; 461 KPD_VSTACK_virt[pd_stack_index] |= VPTE_RW | VPTE_V | VPTE_U | VPTE_PS; 462 } 463 464 static void 465 create_pagetables(vm_paddr_t *firstaddr, int64_t ptov_offset) 466 { 467 int i; 468 pml4_entry_t *KPML4virt; 469 pdp_entry_t *KPDPvirt; 470 pd_entry_t *KPDvirt; 471 pt_entry_t *KPTvirt; 472 int kpml4i = pmap_pml4e_index(ptov_offset); 473 int kpdpi = pmap_pdpe_index(ptov_offset); 474 int kpdi = pmap_pde_index(ptov_offset); 475 476 /* 477 * Calculate NKPT - number of kernel page tables. We have to 478 * accomodoate prealloction of the vm_page_array, dump bitmap, 479 * MSGBUF_SIZE, and other stuff. Be generous. 480 * 481 * Maxmem is in pages. 482 */ 483 nkpt = (Maxmem * (sizeof(struct vm_page) * 2) + MSGBUF_SIZE) / NBPDR; 484 /* 485 * Allocate pages 486 */ 487 KPML4phys = allocpages(firstaddr, 1); 488 KPDPphys = allocpages(firstaddr, NKPML4E); 489 KPDphys = allocpages(firstaddr, NKPDPE); 490 KPTphys = allocpages(firstaddr, nkpt); 491 492 KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 493 KPDPvirt = (pdp_entry_t *)PHYS_TO_DMAP(KPDPphys); 494 KPDvirt = (pd_entry_t *)PHYS_TO_DMAP(KPDphys); 495 KPTvirt = (pt_entry_t *)PHYS_TO_DMAP(KPTphys); 496 497 bzero(KPML4virt, 1 * PAGE_SIZE); 498 bzero(KPDPvirt, NKPML4E * PAGE_SIZE); 499 bzero(KPDvirt, NKPDPE * PAGE_SIZE); 500 bzero(KPTvirt, nkpt * PAGE_SIZE); 501 502 /* Now map the page tables at their location within PTmap */ 503 for (i = 0; i < nkpt; i++) { 504 KPDvirt[i + kpdi] = KPTphys + (i << PAGE_SHIFT); 505 KPDvirt[i + kpdi] |= VPTE_RW | VPTE_V | VPTE_U; 506 } 507 508 /* And connect up the PD to the PDP */ 509 for (i = 0; i < NKPDPE; i++) { 510 KPDPvirt[i + kpdpi] = KPDphys + (i << PAGE_SHIFT); 511 KPDPvirt[i + kpdpi] |= VPTE_RW | VPTE_V | VPTE_U; 512 } 513 514 /* And recursively map PML4 to itself in order to get PTmap */ 515 KPML4virt[PML4PML4I] = KPML4phys; 516 KPML4virt[PML4PML4I] |= VPTE_RW | VPTE_V | VPTE_U; 517 518 /* Connect the KVA slot up to the PML4 */ 519 KPML4virt[kpml4i] = KPDPphys; 520 KPML4virt[kpml4i] |= VPTE_RW | VPTE_V | VPTE_U; 521 } 522 523 /* 524 * Typically used to initialize a fictitious page by vm/device_pager.c 525 */ 526 void 527 pmap_page_init(struct vm_page *m) 528 { 529 vm_page_init(m); 530 TAILQ_INIT(&m->md.pv_list); 531 } 532 533 /* 534 * Bootstrap the system enough to run with virtual memory. 535 * 536 * On x86_64 this is called after mapping has already been enabled 537 * and just syncs the pmap module with what has already been done. 538 * [We can't call it easily with mapping off since the kernel is not 539 * mapped with PA == VA, hence we would have to relocate every address 540 * from the linked base (virtual) address "KERNBASE" to the actual 541 * (physical) address starting relative to 0] 542 */ 543 void 544 pmap_bootstrap(vm_paddr_t *firstaddr, int64_t ptov_offset) 545 { 546 vm_offset_t va; 547 pt_entry_t *pte; 548 549 /* 550 * Create an initial set of page tables to run the kernel in. 551 */ 552 create_pagetables(firstaddr, ptov_offset); 553 554 /* Create the DMAP for the VMM */ 555 if (vmm_enabled) { 556 create_dmap_vmm(firstaddr); 557 } 558 559 virtual_start = KvaStart; 560 virtual_end = KvaEnd; 561 562 /* 563 * Initialize protection array. 564 */ 565 x86_64_protection_init(); 566 567 /* 568 * The kernel's pmap is statically allocated so we don't have to use 569 * pmap_create, which is unlikely to work correctly at this part of 570 * the boot sequence (XXX and which no longer exists). 571 * 572 * The kernel_pmap's pm_pteobj is used only for locking and not 573 * for mmu pages. 574 */ 575 kernel_pmap.pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 576 kernel_pmap.pm_count = 1; 577 /* don't allow deactivation */ 578 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 579 kernel_pmap.pm_pteobj = NULL; /* see pmap_init */ 580 RB_INIT(&kernel_pmap.pm_pvroot); 581 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 582 583 /* 584 * Reserve some special page table entries/VA space for temporary 585 * mapping of pages. 586 */ 587 #define SYSMAP(c, p, v, n) \ 588 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 589 590 va = virtual_start; 591 pte = pmap_pte(&kernel_pmap, va); 592 /* 593 * CMAP1/CMAP2 are used for zeroing and copying pages. 594 */ 595 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 596 597 #if 0 /* JGV */ 598 /* 599 * Crashdump maps. 600 */ 601 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 602 #endif 603 604 /* 605 * ptvmmap is used for reading arbitrary physical pages via 606 * /dev/mem. 607 */ 608 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 609 610 /* 611 * msgbufp is used to map the system message buffer. 612 * XXX msgbufmap is not used. 613 */ 614 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 615 atop(round_page(MSGBUF_SIZE))) 616 617 virtual_start = va; 618 619 *CMAP1 = 0; 620 /* Not ready to do an invltlb yet for VMM*/ 621 if (!vmm_enabled) 622 cpu_invltlb(); 623 624 } 625 626 /* 627 * Initialize the pmap module. 628 * Called by vm_init, to initialize any structures that the pmap 629 * system needs to map virtual memory. 630 * pmap_init has been enhanced to support in a fairly consistant 631 * way, discontiguous physical memory. 632 */ 633 void 634 pmap_init(void) 635 { 636 vm_pindex_t i; 637 vm_pindex_t initial_pvs; 638 639 /* 640 * object for kernel page table pages 641 */ 642 /* JG I think the number can be arbitrary */ 643 vm_object_init(&kptobj, 5); 644 kernel_pmap.pm_pteobj = &kptobj; 645 646 /* 647 * Allocate memory for random pmap data structures. Includes the 648 * pv_head_table. 649 */ 650 for (i = 0; i < vm_page_array_size; i++) { 651 vm_page_t m; 652 653 m = &vm_page_array[i]; 654 TAILQ_INIT(&m->md.pv_list); 655 m->md.pv_list_count = 0; 656 } 657 658 /* 659 * init the pv free list 660 */ 661 initial_pvs = vm_page_array_size; 662 if (initial_pvs < MINPV) 663 initial_pvs = MINPV; 664 pvzone = &pvzone_store; 665 pvinit = (struct pv_entry *) 666 kmem_alloc(&kernel_map, 667 initial_pvs * sizeof (struct pv_entry), 668 VM_SUBSYS_PVENTRY); 669 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, 670 initial_pvs); 671 672 /* 673 * Now it is safe to enable pv_table recording. 674 */ 675 pmap_initialized = TRUE; 676 } 677 678 /* 679 * Initialize the address space (zone) for the pv_entries. Set a 680 * high water mark so that the system can recover from excessive 681 * numbers of pv entries. 682 */ 683 void 684 pmap_init2(void) 685 { 686 vm_pindex_t shpgperproc = PMAP_SHPGPERPROC; 687 688 TUNABLE_LONG_FETCH("vm.pmap.shpgperproc", &shpgperproc); 689 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 690 TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &pv_entry_max); 691 pv_entry_high_water = 9 * (pv_entry_max / 10); 692 zinitna(pvzone, NULL, 0, pv_entry_max, ZONE_INTERRUPT); 693 } 694 695 696 /*************************************************** 697 * Low level helper routines..... 698 ***************************************************/ 699 700 /* 701 * The modification bit is not tracked for any pages in this range. XXX 702 * such pages in this maps should always use pmap_k*() functions and not 703 * be managed anyhow. 704 * 705 * XXX User and kernel address spaces are independant for virtual kernels, 706 * this function only applies to the kernel pmap. 707 */ 708 static void 709 pmap_track_modified(pmap_t pmap, vm_offset_t va) 710 { 711 KKASSERT(pmap != &kernel_pmap || 712 va < clean_sva || va >= clean_eva); 713 } 714 715 /* 716 * Extract the physical page address associated with the map/VA pair. 717 * 718 * No requirements. 719 */ 720 vm_paddr_t 721 pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep) 722 { 723 vm_paddr_t rtval; 724 pt_entry_t *pte; 725 pd_entry_t pde, *pdep; 726 727 vm_object_hold(pmap->pm_pteobj); 728 rtval = 0; 729 pdep = pmap_pde(pmap, va); 730 if (pdep != NULL) { 731 pde = *pdep; 732 if (pde) { 733 if ((pde & VPTE_PS) != 0) { 734 /* JGV */ 735 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 736 } else { 737 pte = pmap_pde_to_pte(pdep, va); 738 rtval = (*pte & VPTE_FRAME) | (va & PAGE_MASK); 739 } 740 } 741 } 742 if (handlep) 743 *handlep = NULL; /* XXX */ 744 vm_object_drop(pmap->pm_pteobj); 745 746 return rtval; 747 } 748 749 void 750 pmap_extract_done(void *handle) 751 { 752 pmap_t pmap; 753 754 if (handle) { 755 pmap = handle; 756 vm_object_drop(pmap->pm_pteobj); 757 } 758 } 759 760 /* 761 * Similar to extract but checks protections, SMP-friendly short-cut for 762 * vm_fault_page[_quick](). 763 * 764 * WARNING! THE RETURNED PAGE IS ONLY HELD AND NEITHER IT NOR ITS TARGET 765 * DATA IS SUITABLE FOR WRITING. Writing can interfere with 766 * pageouts flushes, msync, etc. The hold_count is not enough 767 * to avoid races against pageouts and other flush code doesn't 768 * care about hold_count. 769 */ 770 vm_page_t 771 pmap_fault_page_quick(pmap_t pmap __unused, vm_offset_t vaddr __unused, 772 vm_prot_t prot __unused, int *busyp __unused) 773 { 774 return(NULL); 775 } 776 777 /* 778 * Routine: pmap_kextract 779 * Function: 780 * Extract the physical page address associated 781 * kernel virtual address. 782 */ 783 vm_paddr_t 784 pmap_kextract(vm_offset_t va) 785 { 786 pd_entry_t pde; 787 vm_paddr_t pa; 788 789 KKASSERT(va >= KvaStart && va < KvaEnd); 790 791 /* 792 * The DMAP region is not included in [KvaStart, KvaEnd) 793 */ 794 #if 0 795 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 796 pa = DMAP_TO_PHYS(va); 797 } else { 798 #endif 799 pde = *vtopde(va); 800 if (pde & VPTE_PS) { 801 /* JGV */ 802 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 803 } else { 804 /* 805 * Beware of a concurrent promotion that changes the 806 * PDE at this point! For example, vtopte() must not 807 * be used to access the PTE because it would use the 808 * new PDE. It is, however, safe to use the old PDE 809 * because the page table page is preserved by the 810 * promotion. 811 */ 812 pa = *pmap_pde_to_pte(&pde, va); 813 pa = (pa & VPTE_FRAME) | (va & PAGE_MASK); 814 } 815 #if 0 816 } 817 #endif 818 return pa; 819 } 820 821 /*************************************************** 822 * Low level mapping routines..... 823 ***************************************************/ 824 825 /* 826 * Enter a mapping into kernel_pmap. Mappings created in this fashion 827 * are not managed. Mappings must be immediately accessible on all cpus. 828 * 829 * Call pmap_inval_pte() to invalidate the virtual pte and clean out the 830 * real pmap and handle related races before storing the new vpte. The 831 * new semantics for kenter require use to do an UNCONDITIONAL invalidation, 832 * because the entry may have previously been cleared without an invalidation. 833 */ 834 void 835 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 836 { 837 pt_entry_t *ptep; 838 pt_entry_t npte; 839 840 KKASSERT(va >= KvaStart && va < KvaEnd); 841 npte = pa | VPTE_RW | VPTE_V | VPTE_U; 842 ptep = vtopte(va); 843 844 #if 1 845 pmap_inval_pte(ptep, &kernel_pmap, va); 846 #else 847 if (*pte & VPTE_V) 848 pmap_inval_pte(ptep, &kernel_pmap, va); 849 #endif 850 atomic_swap_long(ptep, npte); 851 } 852 853 /* 854 * Enter an unmanaged KVA mapping for the private use of the current 855 * cpu only. 856 * 857 * It is illegal for the mapping to be accessed by other cpus without 858 * proper invalidation. 859 */ 860 int 861 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 862 { 863 pt_entry_t *ptep; 864 pt_entry_t npte; 865 int res; 866 867 KKASSERT(va >= KvaStart && va < KvaEnd); 868 869 npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; 870 ptep = vtopte(va); 871 872 #if 1 873 pmap_inval_pte_quick(ptep, &kernel_pmap, va); 874 res = 1; 875 #else 876 /* FUTURE */ 877 res = (*ptep != 0); 878 if (*pte & VPTE_V) 879 pmap_inval_pte(pte, &kernel_pmap, va); 880 #endif 881 atomic_swap_long(ptep, npte); 882 883 return res; 884 } 885 886 /* 887 * Invalidation will occur later, ok to be lazy here. 888 */ 889 int 890 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 891 { 892 pt_entry_t *ptep; 893 pt_entry_t npte; 894 int res; 895 896 KKASSERT(va >= KvaStart && va < KvaEnd); 897 898 npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; 899 ptep = vtopte(va); 900 #if 1 901 res = 1; 902 #else 903 /* FUTURE */ 904 res = (*ptep != 0); 905 #endif 906 atomic_swap_long(ptep, npte); 907 908 return res; 909 } 910 911 /* 912 * Remove an unmanaged mapping created with pmap_kenter*(). 913 */ 914 void 915 pmap_kremove(vm_offset_t va) 916 { 917 pt_entry_t *ptep; 918 919 KKASSERT(va >= KvaStart && va < KvaEnd); 920 921 ptep = vtopte(va); 922 atomic_swap_long(ptep, 0); 923 pmap_inval_pte(ptep, &kernel_pmap, va); 924 } 925 926 /* 927 * Remove an unmanaged mapping created with pmap_kenter*() but synchronize 928 * only with this cpu. 929 * 930 * Unfortunately because we optimize new entries by testing VPTE_V later 931 * on, we actually still have to synchronize with all the cpus. XXX maybe 932 * store a junk value and test against 0 in the other places instead? 933 */ 934 void 935 pmap_kremove_quick(vm_offset_t va) 936 { 937 pt_entry_t *ptep; 938 939 KKASSERT(va >= KvaStart && va < KvaEnd); 940 941 ptep = vtopte(va); 942 atomic_swap_long(ptep, 0); 943 pmap_inval_pte(ptep, &kernel_pmap, va); /* NOT _quick */ 944 } 945 946 /* 947 * Invalidation will occur later, ok to be lazy here. 948 */ 949 void 950 pmap_kremove_noinval(vm_offset_t va) 951 { 952 pt_entry_t *ptep; 953 954 KKASSERT(va >= KvaStart && va < KvaEnd); 955 956 ptep = vtopte(va); 957 atomic_swap_long(ptep, 0); 958 } 959 960 /* 961 * Used to map a range of physical addresses into kernel 962 * virtual address space. 963 * 964 * For now, VM is already on, we only need to map the 965 * specified memory. 966 */ 967 vm_offset_t 968 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 969 { 970 return PHYS_TO_DMAP(start); 971 } 972 973 /* 974 * Map a set of unmanaged VM pages into KVM. 975 */ 976 static __inline void 977 _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval) 978 { 979 vm_offset_t end_va; 980 vm_offset_t va; 981 982 end_va = beg_va + count * PAGE_SIZE; 983 KKASSERT(beg_va >= KvaStart && end_va <= KvaEnd); 984 985 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 986 pt_entry_t *ptep; 987 988 ptep = vtopte(va); 989 atomic_swap_long(ptep, VM_PAGE_TO_PHYS(*m) | 990 VPTE_RW | VPTE_V | VPTE_U); 991 ++m; 992 } 993 if (doinval) 994 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 995 /* pmap_inval_pte(pte, &kernel_pmap, va); */ 996 } 997 998 void 999 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 1000 { 1001 _pmap_qenter(beg_va, m, count, 1); 1002 } 1003 1004 void 1005 pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count) 1006 { 1007 _pmap_qenter(beg_va, m, count, 0); 1008 } 1009 1010 /* 1011 * Undo the effects of pmap_qenter*(). 1012 */ 1013 void 1014 pmap_qremove(vm_offset_t beg_va, int count) 1015 { 1016 vm_offset_t end_va; 1017 vm_offset_t va; 1018 1019 end_va = beg_va + count * PAGE_SIZE; 1020 KKASSERT(beg_va >= KvaStart && end_va < KvaEnd); 1021 1022 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1023 pt_entry_t *ptep; 1024 1025 ptep = vtopte(va); 1026 atomic_swap_long(ptep, 0); 1027 } 1028 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1029 } 1030 1031 /* 1032 * Unlike the real pmap code, we can't avoid calling the real-kernel. 1033 */ 1034 void 1035 pmap_qremove_quick(vm_offset_t va, int count) 1036 { 1037 pmap_qremove(va, count); 1038 } 1039 1040 void 1041 pmap_qremove_noinval(vm_offset_t va, int count) 1042 { 1043 pmap_qremove(va, count); 1044 } 1045 1046 /* 1047 * This routine works like vm_page_lookup() but also blocks as long as the 1048 * page is busy. This routine does not busy the page it returns. 1049 * 1050 * Unless the caller is managing objects whos pages are in a known state, 1051 * the call should be made with a critical section held so the page's object 1052 * association remains valid on return. 1053 */ 1054 static vm_page_t 1055 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) 1056 { 1057 vm_page_t m; 1058 1059 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 1060 m = vm_page_lookup_busy_wait(object, pindex, TRUE, "pplookp"); 1061 1062 return(m); 1063 } 1064 1065 /* 1066 * Create a new thread and optionally associate it with a (new) process. 1067 * NOTE! the new thread's cpu may not equal the current cpu. 1068 */ 1069 void 1070 pmap_init_thread(thread_t td) 1071 { 1072 /* enforce pcb placement */ 1073 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 1074 td->td_savefpu = &td->td_pcb->pcb_save; 1075 td->td_sp = (char *)td->td_pcb - 16; /* JG is -16 needed on x86_64? */ 1076 } 1077 1078 /* 1079 * This routine directly affects the fork perf for a process. 1080 */ 1081 void 1082 pmap_init_proc(struct proc *p) 1083 { 1084 } 1085 1086 /* 1087 * Unwire a page table which has been removed from the pmap. We own the 1088 * wire_count, so the page cannot go away. The page representing the page 1089 * table is passed in unbusied and must be busied if we cannot trivially 1090 * unwire it. 1091 * 1092 * XXX NOTE! This code is not usually run because we do not currently 1093 * implement dynamic page table page removal. The page in 1094 * its parent assumes at least 1 wire count, so no call to this 1095 * function ever sees a wire count less than 2. 1096 */ 1097 static int 1098 pmap_unwire_pgtable(pmap_t pmap, vm_offset_t va, vm_page_t m) 1099 { 1100 /* 1101 * Try to unwire optimally. If non-zero is returned the wire_count 1102 * is 1 and we must busy the page to unwire it. 1103 */ 1104 if (vm_page_unwire_quick(m) == 0) 1105 return 0; 1106 1107 vm_page_busy_wait(m, TRUE, "pmuwpt"); 1108 KASSERT(m->queue == PQ_NONE, 1109 ("_pmap_unwire_pgtable: %p->queue != PQ_NONE", m)); 1110 1111 if (m->wire_count == 1) { 1112 /* 1113 * Unmap the page table page. 1114 */ 1115 /* pmap_inval_add(info, pmap, -1); */ 1116 1117 if (m->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1118 /* PDP page */ 1119 pml4_entry_t *pml4; 1120 pml4 = pmap_pml4e(pmap, va); 1121 *pml4 = 0; 1122 } else if (m->pindex >= NUPT_TOTAL) { 1123 /* PD page */ 1124 pdp_entry_t *pdp; 1125 pdp = pmap_pdpe(pmap, va); 1126 *pdp = 0; 1127 } else { 1128 /* PT page */ 1129 pd_entry_t *pd; 1130 pd = pmap_pde(pmap, va); 1131 *pd = 0; 1132 } 1133 1134 KKASSERT(pmap->pm_stats.resident_count > 0); 1135 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1136 1137 if (pmap->pm_ptphint == m) 1138 pmap->pm_ptphint = NULL; 1139 1140 if (m->pindex < NUPT_TOTAL) { 1141 /* We just released a PT, unhold the matching PD */ 1142 vm_page_t pdpg; 1143 1144 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & 1145 VPTE_FRAME); 1146 pmap_unwire_pgtable(pmap, va, pdpg); 1147 } 1148 if (m->pindex >= NUPT_TOTAL && 1149 m->pindex < (NUPT_TOTAL + NUPD_TOTAL)) { 1150 /* We just released a PD, unhold the matching PDP */ 1151 vm_page_t pdppg; 1152 1153 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & 1154 VPTE_FRAME); 1155 pmap_unwire_pgtable(pmap, va, pdppg); 1156 } 1157 1158 /* 1159 * This was our last wire, the page had better be unwired 1160 * after we decrement wire_count. 1161 * 1162 * FUTURE NOTE: shared page directory page could result in 1163 * multiple wire counts. 1164 */ 1165 vm_page_unwire(m, 0); 1166 KKASSERT(m->wire_count == 0); 1167 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1168 vm_page_flash(m); 1169 vm_page_free(m); 1170 return 1; 1171 } else { 1172 /* XXX SMP race to 1 if not holding vmobj */ 1173 vm_page_unwire(m, 0); 1174 vm_page_wakeup(m); 1175 return 0; 1176 } 1177 } 1178 1179 /* 1180 * After removing a page table entry, this routine is used to 1181 * conditionally free the page, and manage the hold/wire counts. 1182 * 1183 * If not NULL the caller owns a wire_count on mpte, so it can't disappear. 1184 * If NULL the caller owns a wire_count on what would be the mpte, we must 1185 * look it up. 1186 */ 1187 static int 1188 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 1189 { 1190 vm_pindex_t ptepindex; 1191 1192 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1193 1194 if (mpte == NULL) { 1195 /* 1196 * page table pages in the kernel_pmap are not managed. 1197 */ 1198 if (pmap == &kernel_pmap) 1199 return(0); 1200 ptepindex = pmap_pt_pindex(va); 1201 if (pmap->pm_ptphint && 1202 (pmap->pm_ptphint->pindex == ptepindex)) { 1203 mpte = pmap->pm_ptphint; 1204 } else { 1205 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1206 pmap->pm_ptphint = mpte; 1207 vm_page_wakeup(mpte); 1208 } 1209 } 1210 return pmap_unwire_pgtable(pmap, va, mpte); 1211 } 1212 1213 /* 1214 * Initialize pmap0/vmspace0 . Since process 0 never enters user mode we 1215 * just dummy it up so it works well enough for fork(). 1216 * 1217 * In DragonFly, process pmaps may only be used to manipulate user address 1218 * space, never kernel address space. 1219 */ 1220 void 1221 pmap_pinit0(struct pmap *pmap) 1222 { 1223 pmap_pinit(pmap); 1224 } 1225 1226 /* 1227 * Initialize a preallocated and zeroed pmap structure, 1228 * such as one in a vmspace structure. 1229 */ 1230 void 1231 pmap_pinit(struct pmap *pmap) 1232 { 1233 vm_page_t ptdpg; 1234 1235 /* 1236 * No need to allocate page table space yet but we do need a valid 1237 * page directory table. 1238 */ 1239 if (pmap->pm_pml4 == NULL) { 1240 pmap->pm_pml4 = (pml4_entry_t *) 1241 kmem_alloc_pageable(&kernel_map, PAGE_SIZE, 1242 VM_SUBSYS_PML4); 1243 } 1244 1245 /* 1246 * Allocate an object for the ptes 1247 */ 1248 if (pmap->pm_pteobj == NULL) 1249 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL + 1); 1250 1251 /* 1252 * Allocate the page directory page, unless we already have 1253 * one cached. If we used the cached page the wire_count will 1254 * already be set appropriately. 1255 */ 1256 if ((ptdpg = pmap->pm_pdirm) == NULL) { 1257 ptdpg = vm_page_grab(pmap->pm_pteobj, 1258 NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL, 1259 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | 1260 VM_ALLOC_ZERO); 1261 pmap->pm_pdirm = ptdpg; 1262 vm_page_flag_clear(ptdpg, PG_MAPPED | PG_WRITEABLE); 1263 vm_page_wire(ptdpg); 1264 vm_page_wakeup(ptdpg); 1265 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); 1266 } 1267 pmap->pm_count = 1; 1268 CPUMASK_ASSZERO(pmap->pm_active); 1269 pmap->pm_ptphint = NULL; 1270 RB_INIT(&pmap->pm_pvroot); 1271 spin_init(&pmap->pm_spin, "pmapinit"); 1272 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1273 pmap->pm_stats.resident_count = 1; 1274 pmap->pm_stats.wired_count = 1; 1275 } 1276 1277 /* 1278 * Clean up a pmap structure so it can be physically freed. This routine 1279 * is called by the vmspace dtor function. A great deal of pmap data is 1280 * left passively mapped to improve vmspace management so we have a bit 1281 * of cleanup work to do here. 1282 * 1283 * No requirements. 1284 */ 1285 void 1286 pmap_puninit(pmap_t pmap) 1287 { 1288 vm_page_t p; 1289 1290 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 1291 if ((p = pmap->pm_pdirm) != NULL) { 1292 KKASSERT(pmap->pm_pml4 != NULL); 1293 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1294 vm_page_busy_wait(p, TRUE, "pgpun"); 1295 vm_page_unwire(p, 0); 1296 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1297 vm_page_free(p); 1298 pmap->pm_pdirm = NULL; 1299 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1300 KKASSERT(pmap->pm_stats.wired_count == 0); 1301 } 1302 if (pmap->pm_pml4) { 1303 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1304 pmap->pm_pml4 = NULL; 1305 } 1306 if (pmap->pm_pteobj) { 1307 vm_object_deallocate(pmap->pm_pteobj); 1308 pmap->pm_pteobj = NULL; 1309 } 1310 } 1311 1312 /* 1313 * This function is now unused (used to add the pmap to the pmap_list) 1314 */ 1315 void 1316 pmap_pinit2(struct pmap *pmap) 1317 { 1318 } 1319 1320 /* 1321 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1322 * 0 on failure (if the procedure had to sleep). 1323 * 1324 * When asked to remove the page directory page itself, we actually just 1325 * leave it cached so we do not have to incur the SMP inval overhead of 1326 * removing the kernel mapping. pmap_puninit() will take care of it. 1327 */ 1328 static int 1329 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1330 { 1331 /* 1332 * This code optimizes the case of freeing non-busy 1333 * page-table pages. Those pages are zero now, and 1334 * might as well be placed directly into the zero queue. 1335 */ 1336 if (vm_page_busy_try(p, TRUE)) { 1337 vm_page_sleep_busy(p, TRUE, "pmaprl"); 1338 return 1; 1339 } 1340 1341 /* 1342 * Remove the page table page from the processes address space. 1343 */ 1344 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1345 /* 1346 * We are the pml4 table itself. 1347 */ 1348 /* XXX anything to do here? */ 1349 } else if (p->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1350 /* 1351 * We are a PDP page. 1352 * We look for the PML4 entry that points to us. 1353 */ 1354 vm_page_t m4; 1355 pml4_entry_t *pml4; 1356 int idx; 1357 1358 m4 = vm_page_lookup(pmap->pm_pteobj, 1359 NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 1360 KKASSERT(m4 != NULL); 1361 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1362 idx = (p->pindex - (NUPT_TOTAL + NUPD_TOTAL)) % NPML4EPG; 1363 KKASSERT(pml4[idx] != 0); 1364 if (pml4[idx] == 0) 1365 kprintf("pmap_release: Unmapped PML4\n"); 1366 pml4[idx] = 0; 1367 vm_page_unwire_quick(m4); 1368 } else if (p->pindex >= NUPT_TOTAL) { 1369 /* 1370 * We are a PD page. 1371 * We look for the PDP entry that points to us. 1372 */ 1373 vm_page_t m3; 1374 pdp_entry_t *pdp; 1375 int idx; 1376 1377 m3 = vm_page_lookup(pmap->pm_pteobj, 1378 NUPT_TOTAL + NUPD_TOTAL + 1379 (p->pindex - NUPT_TOTAL) / NPDPEPG); 1380 KKASSERT(m3 != NULL); 1381 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); 1382 idx = (p->pindex - NUPT_TOTAL) % NPDPEPG; 1383 KKASSERT(pdp[idx] != 0); 1384 if (pdp[idx] == 0) 1385 kprintf("pmap_release: Unmapped PDP %d\n", idx); 1386 pdp[idx] = 0; 1387 vm_page_unwire_quick(m3); 1388 } else { 1389 /* We are a PT page. 1390 * We look for the PD entry that points to us. 1391 */ 1392 vm_page_t m2; 1393 pd_entry_t *pd; 1394 int idx; 1395 1396 m2 = vm_page_lookup(pmap->pm_pteobj, 1397 NUPT_TOTAL + p->pindex / NPDEPG); 1398 KKASSERT(m2 != NULL); 1399 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); 1400 idx = p->pindex % NPDEPG; 1401 if (pd[idx] == 0) 1402 kprintf("pmap_release: Unmapped PD %d\n", idx); 1403 pd[idx] = 0; 1404 vm_page_unwire_quick(m2); 1405 } 1406 KKASSERT(pmap->pm_stats.resident_count > 0); 1407 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1408 1409 if (p->wire_count > 1) { 1410 panic("pmap_release: freeing held pt page " 1411 "pmap=%p pg=%p dmap=%p pi=%ld {%ld,%ld,%ld}", 1412 pmap, p, (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(p)), 1413 p->pindex, NUPT_TOTAL, NUPD_TOTAL, NUPDP_TOTAL); 1414 } 1415 1416 if (pmap->pm_ptphint == p) 1417 pmap->pm_ptphint = NULL; 1418 1419 /* 1420 * We leave the top-level page table page cached, wired, and mapped in 1421 * the pmap until the dtor function (pmap_puninit()) gets called. 1422 * However, still clean it up. 1423 */ 1424 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1425 bzero(pmap->pm_pml4, PAGE_SIZE); 1426 vm_page_wakeup(p); 1427 } else { 1428 vm_page_unwire(p, 0); 1429 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1430 vm_page_free(p); 1431 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1432 } 1433 return 0; 1434 } 1435 1436 /* 1437 * Locate the requested PT, PD, or PDP page table page. 1438 * 1439 * Returns a busied page, caller must vm_page_wakeup() when done. 1440 */ 1441 static vm_page_t 1442 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) 1443 { 1444 vm_page_t m; 1445 vm_page_t pm; 1446 vm_pindex_t pindex; 1447 pt_entry_t *ptep; 1448 pt_entry_t data; 1449 1450 /* 1451 * Find or fabricate a new pagetable page. A non-zero wire_count 1452 * indicates that the page has already been mapped into its parent. 1453 */ 1454 m = vm_page_grab(pmap->pm_pteobj, ptepindex, 1455 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); 1456 if (m->wire_count != 0) 1457 return m; 1458 1459 /* 1460 * Map the page table page into its parent, giving it 1 wire count. 1461 */ 1462 vm_page_wire(m); 1463 vm_page_unqueue(m); 1464 atomic_add_long(&pmap->pm_stats.resident_count, 1); 1465 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 1466 1467 data = VM_PAGE_TO_PHYS(m) | 1468 VPTE_RW | VPTE_V | VPTE_U | VPTE_A | VPTE_M | VPTE_WIRED; 1469 atomic_add_long(&pmap->pm_stats.wired_count, 1); 1470 1471 if (ptepindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1472 /* 1473 * Map PDP into the PML4 1474 */ 1475 pindex = ptepindex - (NUPT_TOTAL + NUPD_TOTAL); 1476 pindex &= (NUPDP_TOTAL - 1); 1477 ptep = (pt_entry_t *)pmap->pm_pml4; 1478 pm = NULL; 1479 } else if (ptepindex >= NUPT_TOTAL) { 1480 /* 1481 * Map PD into its PDP 1482 */ 1483 pindex = (ptepindex - NUPT_TOTAL) >> NPDPEPGSHIFT; 1484 pindex += NUPT_TOTAL + NUPD_TOTAL; 1485 pm = _pmap_allocpte(pmap, pindex); 1486 pindex = (ptepindex - NUPT_TOTAL) & (NPDPEPG - 1); 1487 ptep = (void *)PHYS_TO_DMAP(pm->phys_addr); 1488 } else { 1489 /* 1490 * Map PT into its PD 1491 */ 1492 pindex = ptepindex >> NPDPEPGSHIFT; 1493 pindex += NUPT_TOTAL; 1494 pm = _pmap_allocpte(pmap, pindex); 1495 pindex = ptepindex & (NPTEPG - 1); 1496 ptep = (void *)PHYS_TO_DMAP(pm->phys_addr); 1497 } 1498 1499 /* 1500 * Install the pte in (pm). (m) prevents races. 1501 */ 1502 ptep += pindex; 1503 data = atomic_swap_long(ptep, data); 1504 if (pm) { 1505 vm_page_wire_quick(pm); 1506 vm_page_wakeup(pm); 1507 } 1508 pmap->pm_ptphint = pm; 1509 1510 return m; 1511 } 1512 1513 /* 1514 * Determine the page table page required to access the VA in the pmap 1515 * and allocate it if necessary. Return a held vm_page_t for the page. 1516 * 1517 * Only used with user pmaps. 1518 */ 1519 static vm_page_t 1520 pmap_allocpte(pmap_t pmap, vm_offset_t va) 1521 { 1522 vm_pindex_t ptepindex; 1523 vm_page_t m; 1524 1525 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1526 1527 /* 1528 * Calculate pagetable page index, and return the PT page to 1529 * the caller. 1530 */ 1531 ptepindex = pmap_pt_pindex(va); 1532 m = _pmap_allocpte(pmap, ptepindex); 1533 1534 return m; 1535 } 1536 1537 /*************************************************** 1538 * Pmap allocation/deallocation routines. 1539 ***************************************************/ 1540 1541 /* 1542 * Release any resources held by the given physical map. 1543 * Called when a pmap initialized by pmap_pinit is being released. 1544 * Should only be called if the map contains no valid mappings. 1545 */ 1546 static int pmap_release_callback(struct vm_page *p, void *data); 1547 1548 void 1549 pmap_release(struct pmap *pmap) 1550 { 1551 vm_object_t object = pmap->pm_pteobj; 1552 struct rb_vm_page_scan_info info; 1553 1554 KKASSERT(pmap != &kernel_pmap); 1555 1556 #if defined(DIAGNOSTIC) 1557 if (object->ref_count != 1) 1558 panic("pmap_release: pteobj reference count != 1"); 1559 #endif 1560 1561 info.pmap = pmap; 1562 info.object = object; 1563 1564 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 1565 ("pmap %p still active! %016jx", 1566 pmap, 1567 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 1568 1569 vm_object_hold(object); 1570 do { 1571 info.error = 0; 1572 info.mpte = NULL; 1573 info.limit = object->generation; 1574 1575 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 1576 pmap_release_callback, &info); 1577 if (info.error == 0 && info.mpte) { 1578 if (pmap_release_free_page(pmap, info.mpte)) 1579 info.error = 1; 1580 } 1581 } while (info.error); 1582 1583 pmap->pm_ptphint = NULL; 1584 1585 KASSERT((pmap->pm_stats.wired_count == (pmap->pm_pdirm != NULL)), 1586 ("pmap_release: dangling count %p %ld", 1587 pmap, pmap->pm_stats.wired_count)); 1588 1589 vm_object_drop(object); 1590 } 1591 1592 static int 1593 pmap_release_callback(struct vm_page *p, void *data) 1594 { 1595 struct rb_vm_page_scan_info *info = data; 1596 1597 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1598 info->mpte = p; 1599 return(0); 1600 } 1601 if (pmap_release_free_page(info->pmap, p)) { 1602 info->error = 1; 1603 return(-1); 1604 } 1605 if (info->object->generation != info->limit) { 1606 info->error = 1; 1607 return(-1); 1608 } 1609 return(0); 1610 } 1611 1612 /* 1613 * Grow the number of kernel page table entries, if needed. 1614 * 1615 * kernel_map must be locked exclusively by the caller. 1616 */ 1617 void 1618 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 1619 { 1620 vm_offset_t addr; 1621 vm_paddr_t paddr; 1622 vm_offset_t ptppaddr; 1623 vm_page_t nkpg; 1624 pd_entry_t *pde, newpdir; 1625 pdp_entry_t newpdp; 1626 1627 addr = kend; 1628 1629 vm_object_hold(&kptobj); 1630 if (kernel_vm_end == 0) { 1631 kernel_vm_end = KvaStart; 1632 nkpt = 0; 1633 while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & VPTE_V) != 0) { 1634 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1635 nkpt++; 1636 if (kernel_vm_end - 1 >= vm_map_max(&kernel_map)) { 1637 kernel_vm_end = vm_map_max(&kernel_map); 1638 break; 1639 } 1640 } 1641 } 1642 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1643 if (addr - 1 >= vm_map_max(&kernel_map)) 1644 addr = vm_map_max(&kernel_map); 1645 while (kernel_vm_end < addr) { 1646 pde = pmap_pde(&kernel_pmap, kernel_vm_end); 1647 if (pde == NULL) { 1648 /* We need a new PDP entry */ 1649 nkpg = vm_page_alloc(&kptobj, nkpt, 1650 VM_ALLOC_NORMAL | 1651 VM_ALLOC_SYSTEM | 1652 VM_ALLOC_INTERRUPT); 1653 if (nkpg == NULL) { 1654 panic("pmap_growkernel: no memory to " 1655 "grow kernel"); 1656 } 1657 paddr = VM_PAGE_TO_PHYS(nkpg); 1658 pmap_zero_page(paddr); 1659 newpdp = (pdp_entry_t)(paddr | 1660 VPTE_V | VPTE_RW | VPTE_U | 1661 VPTE_A | VPTE_M | VPTE_WIRED); 1662 *pmap_pdpe(&kernel_pmap, kernel_vm_end) = newpdp; 1663 atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1); 1664 nkpt++; 1665 continue; /* try again */ 1666 } 1667 if ((*pde & VPTE_V) != 0) { 1668 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 1669 ~(PAGE_SIZE * NPTEPG - 1); 1670 if (kernel_vm_end - 1 >= vm_map_max(&kernel_map)) { 1671 kernel_vm_end = vm_map_max(&kernel_map); 1672 break; 1673 } 1674 continue; 1675 } 1676 1677 /* 1678 * This index is bogus, but out of the way 1679 */ 1680 nkpg = vm_page_alloc(&kptobj, nkpt, 1681 VM_ALLOC_NORMAL | 1682 VM_ALLOC_SYSTEM | 1683 VM_ALLOC_INTERRUPT); 1684 if (nkpg == NULL) 1685 panic("pmap_growkernel: no memory to grow kernel"); 1686 1687 vm_page_wire(nkpg); 1688 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1689 pmap_zero_page(ptppaddr); 1690 newpdir = (pd_entry_t)(ptppaddr | 1691 VPTE_V | VPTE_RW | VPTE_U | 1692 VPTE_A | VPTE_M | VPTE_WIRED); 1693 *pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir; 1694 atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1); 1695 nkpt++; 1696 1697 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 1698 ~(PAGE_SIZE * NPTEPG - 1); 1699 if (kernel_vm_end - 1 >= vm_map_max(&kernel_map)) { 1700 kernel_vm_end = vm_map_max(&kernel_map); 1701 break; 1702 } 1703 } 1704 vm_object_drop(&kptobj); 1705 } 1706 1707 /* 1708 * Add a reference to the specified pmap. 1709 * 1710 * No requirements. 1711 */ 1712 void 1713 pmap_reference(pmap_t pmap) 1714 { 1715 if (pmap) 1716 atomic_add_int(&pmap->pm_count, 1); 1717 } 1718 1719 /************************************************************************ 1720 * VMSPACE MANAGEMENT * 1721 ************************************************************************ 1722 * 1723 * The VMSPACE management we do in our virtual kernel must be reflected 1724 * in the real kernel. This is accomplished by making vmspace system 1725 * calls to the real kernel. 1726 */ 1727 void 1728 cpu_vmspace_alloc(struct vmspace *vm) 1729 { 1730 int r; 1731 void *rp; 1732 vpte_t vpte; 1733 1734 /* 1735 * If VMM enable, don't do nothing, we 1736 * are able to use real page tables 1737 */ 1738 if (vmm_enabled) 1739 return; 1740 1741 #define USER_SIZE (VM_MAX_USER_ADDRESS - VM_MIN_USER_ADDRESS) 1742 1743 if (vmspace_create(&vm->vm_pmap, 0, NULL) < 0) 1744 panic("vmspace_create() failed"); 1745 1746 rp = vmspace_mmap(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1747 PROT_READ|PROT_WRITE|PROT_EXEC, 1748 MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED, 1749 MemImageFd, 0); 1750 if (rp == MAP_FAILED) 1751 panic("vmspace_mmap: failed"); 1752 vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1753 MADV_NOSYNC, 0); 1754 vpte = VM_PAGE_TO_PHYS(vmspace_pmap(vm)->pm_pdirm) | 1755 VPTE_RW | VPTE_V | VPTE_U; 1756 r = vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1757 MADV_SETMAP, vpte); 1758 if (r < 0) 1759 panic("vmspace_mcontrol: failed"); 1760 } 1761 1762 void 1763 cpu_vmspace_free(struct vmspace *vm) 1764 { 1765 /* 1766 * If VMM enable, don't do nothing, we 1767 * are able to use real page tables 1768 */ 1769 if (vmm_enabled) 1770 return; 1771 1772 if (vmspace_destroy(&vm->vm_pmap) < 0) 1773 panic("vmspace_destroy() failed"); 1774 } 1775 1776 /*************************************************** 1777 * page management routines. 1778 ***************************************************/ 1779 1780 /* 1781 * free the pv_entry back to the free list. This function may be 1782 * called from an interrupt. 1783 */ 1784 static __inline void 1785 free_pv_entry(pv_entry_t pv) 1786 { 1787 atomic_add_long(&pv_entry_count, -1); 1788 zfree(pvzone, pv); 1789 } 1790 1791 /* 1792 * get a new pv_entry, allocating a block from the system 1793 * when needed. This function may be called from an interrupt. 1794 */ 1795 static pv_entry_t 1796 get_pv_entry(void) 1797 { 1798 atomic_add_long(&pv_entry_count, 1); 1799 if (pv_entry_high_water && 1800 (pv_entry_count > pv_entry_high_water) && 1801 atomic_swap_int(&pmap_pagedaemon_waken, 1) == 0) { 1802 wakeup(&vm_pages_needed); 1803 } 1804 return zalloc(pvzone); 1805 } 1806 1807 /* 1808 * This routine is very drastic, but can save the system 1809 * in a pinch. 1810 * 1811 * No requirements. 1812 */ 1813 void 1814 pmap_collect(void) 1815 { 1816 int i; 1817 vm_page_t m; 1818 static int warningdone=0; 1819 1820 if (pmap_pagedaemon_waken == 0) 1821 return; 1822 pmap_pagedaemon_waken = 0; 1823 1824 if (warningdone < 5) { 1825 kprintf("pmap_collect: collecting pv entries -- " 1826 "suggest increasing PMAP_SHPGPERPROC\n"); 1827 warningdone++; 1828 } 1829 1830 for (i = 0; i < vm_page_array_size; i++) { 1831 m = &vm_page_array[i]; 1832 if (m->wire_count || m->hold_count) 1833 continue; 1834 if (vm_page_busy_try(m, TRUE) == 0) { 1835 if (m->wire_count == 0 && m->hold_count == 0) { 1836 pmap_remove_all(m); 1837 } 1838 vm_page_wakeup(m); 1839 } 1840 } 1841 } 1842 1843 1844 /* 1845 * If it is the first entry on the list, it is actually 1846 * in the header and we must copy the following entry up 1847 * to the header. Otherwise we must search the list for 1848 * the entry. In either case we free the now unused entry. 1849 * 1850 * pmap->pm_pteobj must be held and (m) must be spin-locked by the caller. 1851 */ 1852 static int 1853 pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va) 1854 { 1855 pv_entry_t pv; 1856 int rtval; 1857 1858 vm_page_spin_lock(m); 1859 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, va); 1860 1861 /* 1862 * Note that pv_ptem is NULL if the page table page itself is not 1863 * managed, even if the page being removed IS managed. 1864 */ 1865 rtval = 0; 1866 if (pv) { 1867 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1868 if (TAILQ_EMPTY(&m->md.pv_list)) 1869 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1870 m->md.pv_list_count--; 1871 KKASSERT(m->md.pv_list_count >= 0); 1872 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 1873 atomic_add_int(&pmap->pm_generation, 1); 1874 vm_page_spin_unlock(m); 1875 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); 1876 free_pv_entry(pv); 1877 } else { 1878 vm_page_spin_unlock(m); 1879 kprintf("pmap_remove_entry: could not find " 1880 "pmap=%p m=%p va=%016jx\n", 1881 pmap, m, va); 1882 } 1883 return rtval; 1884 } 1885 1886 /* 1887 * Create a pv entry for page at pa for (pmap, va). If the page table page 1888 * holding the VA is managed, mpte will be non-NULL. 1889 * 1890 * pmap->pm_pteobj must be held and (m) must be spin-locked by the caller. 1891 */ 1892 static void 1893 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m, 1894 pv_entry_t pv) 1895 { 1896 pv->pv_va = va; 1897 pv->pv_pmap = pmap; 1898 pv->pv_ptem = mpte; 1899 1900 m->md.pv_list_count++; 1901 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1902 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pv); 1903 vm_page_flag_set(m, PG_MAPPED); 1904 KKASSERT(pv == NULL); 1905 } 1906 1907 /* 1908 * pmap_remove_pte: do the things to unmap a page in a process 1909 * 1910 * Caller holds pmap->pm_pteobj and holds the associated page table 1911 * page busy to prevent races. 1912 */ 1913 static int 1914 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, pt_entry_t oldpte, 1915 vm_offset_t va) 1916 { 1917 vm_page_t m; 1918 int error; 1919 1920 if (ptq) 1921 oldpte = pmap_inval_loadandclear(ptq, pmap, va); 1922 1923 if (oldpte & VPTE_WIRED) 1924 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1925 KKASSERT(pmap->pm_stats.wired_count >= 0); 1926 1927 #if 0 1928 /* 1929 * Machines that don't support invlpg, also don't support 1930 * PG_G. XXX PG_G is disabled for SMP so don't worry about 1931 * the SMP case. 1932 */ 1933 if (oldpte & PG_G) 1934 cpu_invlpg((void *)va); 1935 #endif 1936 KKASSERT(pmap->pm_stats.resident_count > 0); 1937 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1938 if (oldpte & VPTE_MANAGED) { 1939 m = PHYS_TO_VM_PAGE(oldpte); 1940 1941 /* 1942 * NOTE: pmap_remove_entry() will spin-lock the page 1943 */ 1944 if (oldpte & VPTE_M) { 1945 #if defined(PMAP_DIAGNOSTIC) 1946 if (pmap_nw_modified(oldpte)) { 1947 kprintf("pmap_remove: modified page not " 1948 "writable: va: 0x%lx, pte: 0x%lx\n", 1949 va, oldpte); 1950 } 1951 #endif 1952 pmap_track_modified(pmap, va); 1953 vm_page_dirty(m); 1954 } 1955 if (oldpte & VPTE_A) 1956 vm_page_flag_set(m, PG_REFERENCED); 1957 error = pmap_remove_entry(pmap, m, va); 1958 } else { 1959 error = pmap_unuse_pt(pmap, va, NULL); 1960 } 1961 return error; 1962 } 1963 1964 /* 1965 * pmap_remove_page: 1966 * 1967 * Remove a single page from a process address space. 1968 * 1969 * This function may not be called from an interrupt if the pmap is 1970 * not kernel_pmap. 1971 * 1972 * Caller holds pmap->pm_pteobj 1973 */ 1974 static void 1975 pmap_remove_page(struct pmap *pmap, vm_offset_t va) 1976 { 1977 pt_entry_t *pte; 1978 1979 pte = pmap_pte(pmap, va); 1980 if (pte == NULL) 1981 return; 1982 if ((*pte & VPTE_V) == 0) 1983 return; 1984 pmap_remove_pte(pmap, pte, 0, va); 1985 } 1986 1987 /* 1988 * Remove the given range of addresses from the specified map. 1989 * 1990 * It is assumed that the start and end are properly rounded to 1991 * the page size. 1992 * 1993 * This function may not be called from an interrupt if the pmap is 1994 * not kernel_pmap. 1995 * 1996 * No requirements. 1997 */ 1998 void 1999 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 2000 { 2001 vm_offset_t va_next; 2002 pml4_entry_t *pml4e; 2003 pdp_entry_t *pdpe; 2004 pd_entry_t ptpaddr, *pde; 2005 pt_entry_t *pte; 2006 vm_page_t pt_m; 2007 2008 if (pmap == NULL) 2009 return; 2010 2011 vm_object_hold(pmap->pm_pteobj); 2012 KKASSERT(pmap->pm_stats.resident_count >= 0); 2013 if (pmap->pm_stats.resident_count == 0) { 2014 vm_object_drop(pmap->pm_pteobj); 2015 return; 2016 } 2017 2018 /* 2019 * special handling of removing one page. a very 2020 * common operation and easy to short circuit some 2021 * code. 2022 */ 2023 if (sva + PAGE_SIZE == eva) { 2024 pde = pmap_pde(pmap, sva); 2025 if (pde && (*pde & VPTE_PS) == 0) { 2026 pmap_remove_page(pmap, sva); 2027 vm_object_drop(pmap->pm_pteobj); 2028 return; 2029 } 2030 } 2031 2032 for (; sva < eva; sva = va_next) { 2033 pml4e = pmap_pml4e(pmap, sva); 2034 if ((*pml4e & VPTE_V) == 0) { 2035 va_next = (sva + NBPML4) & ~PML4MASK; 2036 if (va_next < sva) 2037 va_next = eva; 2038 continue; 2039 } 2040 2041 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2042 if ((*pdpe & VPTE_V) == 0) { 2043 va_next = (sva + NBPDP) & ~PDPMASK; 2044 if (va_next < sva) 2045 va_next = eva; 2046 continue; 2047 } 2048 2049 /* 2050 * Calculate index for next page table. 2051 */ 2052 va_next = (sva + NBPDR) & ~PDRMASK; 2053 if (va_next < sva) 2054 va_next = eva; 2055 2056 pde = pmap_pdpe_to_pde(pdpe, sva); 2057 ptpaddr = *pde; 2058 2059 /* 2060 * Weed out invalid mappings. 2061 */ 2062 if (ptpaddr == 0) 2063 continue; 2064 2065 /* 2066 * Check for large page. 2067 */ 2068 if ((ptpaddr & VPTE_PS) != 0) { 2069 /* JG FreeBSD has more complex treatment here */ 2070 KKASSERT(*pde != 0); 2071 pmap_inval_pde(pde, pmap, sva); 2072 atomic_add_long(&pmap->pm_stats.resident_count, 2073 -NBPDR / PAGE_SIZE); 2074 continue; 2075 } 2076 2077 /* 2078 * Limit our scan to either the end of the va represented 2079 * by the current page table page, or to the end of the 2080 * range being removed. 2081 */ 2082 if (va_next > eva) 2083 va_next = eva; 2084 2085 /* 2086 * NOTE: pmap_remove_pte() can block. 2087 */ 2088 pt_m = pmap_hold_pt_page(pde, sva); 2089 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2090 sva += PAGE_SIZE) { 2091 if (*pte) { 2092 if (pmap_remove_pte(pmap, pte, 0, sva)) 2093 break; 2094 } 2095 } 2096 vm_page_unhold(pt_m); 2097 } 2098 vm_object_drop(pmap->pm_pteobj); 2099 } 2100 2101 /* 2102 * Removes this physical page from all physical maps in which it resides. 2103 * Reflects back modify bits to the pager. 2104 * 2105 * This routine may not be called from an interrupt. 2106 * 2107 * No requirements. 2108 */ 2109 static void 2110 pmap_remove_all(vm_page_t m) 2111 { 2112 pt_entry_t *pte, tpte; 2113 pv_entry_t pv; 2114 vm_object_t pmobj; 2115 pmap_t pmap; 2116 2117 #if defined(PMAP_DIAGNOSTIC) 2118 /* 2119 * XXX this makes pmap_page_protect(NONE) illegal for non-managed 2120 * pages! 2121 */ 2122 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 2123 panic("pmap_page_protect: illegal for unmanaged page, va: 0x%08llx", (long long)VM_PAGE_TO_PHYS(m)); 2124 } 2125 #endif 2126 2127 restart: 2128 vm_page_spin_lock(m); 2129 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2130 pmap = pv->pv_pmap; 2131 pmobj = pmap->pm_pteobj; 2132 2133 /* 2134 * Handle reversed lock ordering 2135 */ 2136 if (vm_object_hold_try(pmobj) == 0) { 2137 refcount_acquire(&pmobj->hold_count); 2138 vm_page_spin_unlock(m); 2139 vm_object_lock(pmobj); 2140 vm_page_spin_lock(m); 2141 if (pv != TAILQ_FIRST(&m->md.pv_list) || 2142 pmap != pv->pv_pmap || 2143 pmobj != pmap->pm_pteobj) { 2144 vm_page_spin_unlock(m); 2145 vm_object_drop(pmobj); 2146 goto restart; 2147 } 2148 } 2149 2150 KKASSERT(pmap->pm_stats.resident_count > 0); 2151 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2152 2153 pte = pmap_pte(pmap, pv->pv_va); 2154 KKASSERT(pte != NULL); 2155 2156 tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va); 2157 if (tpte & VPTE_WIRED) 2158 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2159 KKASSERT(pmap->pm_stats.wired_count >= 0); 2160 2161 if (tpte & VPTE_A) 2162 vm_page_flag_set(m, PG_REFERENCED); 2163 2164 /* 2165 * Update the vm_page_t clean and reference bits. 2166 */ 2167 if (tpte & VPTE_M) { 2168 #if defined(PMAP_DIAGNOSTIC) 2169 if (pmap_nw_modified(tpte)) { 2170 kprintf( 2171 "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2172 pv->pv_va, tpte); 2173 } 2174 #endif 2175 pmap_track_modified(pmap, pv->pv_va); 2176 vm_page_dirty(m); 2177 } 2178 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2179 if (TAILQ_EMPTY(&m->md.pv_list)) 2180 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2181 m->md.pv_list_count--; 2182 KKASSERT(m->md.pv_list_count >= 0); 2183 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 2184 atomic_add_int(&pmap->pm_generation, 1); 2185 vm_page_spin_unlock(m); 2186 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); 2187 free_pv_entry(pv); 2188 2189 vm_object_drop(pmobj); 2190 vm_page_spin_lock(m); 2191 } 2192 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 2193 vm_page_spin_unlock(m); 2194 } 2195 2196 /* 2197 * Removes the page from a particular pmap 2198 */ 2199 void 2200 pmap_remove_specific(pmap_t pmap, vm_page_t m) 2201 { 2202 pt_entry_t *pte, tpte; 2203 pv_entry_t pv; 2204 2205 vm_object_hold(pmap->pm_pteobj); 2206 again: 2207 vm_page_spin_lock(m); 2208 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2209 if (pv->pv_pmap != pmap) 2210 continue; 2211 2212 KKASSERT(pmap->pm_stats.resident_count > 0); 2213 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2214 2215 pte = pmap_pte(pmap, pv->pv_va); 2216 KKASSERT(pte != NULL); 2217 2218 tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va); 2219 if (tpte & VPTE_WIRED) 2220 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2221 KKASSERT(pmap->pm_stats.wired_count >= 0); 2222 2223 if (tpte & VPTE_A) 2224 vm_page_flag_set(m, PG_REFERENCED); 2225 2226 /* 2227 * Update the vm_page_t clean and reference bits. 2228 */ 2229 if (tpte & VPTE_M) { 2230 pmap_track_modified(pmap, pv->pv_va); 2231 vm_page_dirty(m); 2232 } 2233 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2234 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 2235 atomic_add_int(&pmap->pm_generation, 1); 2236 m->md.pv_list_count--; 2237 KKASSERT(m->md.pv_list_count >= 0); 2238 if (TAILQ_EMPTY(&m->md.pv_list)) 2239 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2240 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); 2241 vm_page_spin_unlock(m); 2242 free_pv_entry(pv); 2243 goto again; 2244 } 2245 vm_page_spin_unlock(m); 2246 vm_object_drop(pmap->pm_pteobj); 2247 } 2248 2249 /* 2250 * Set the physical protection on the specified range of this map 2251 * as requested. 2252 * 2253 * This function may not be called from an interrupt if the map is 2254 * not the kernel_pmap. 2255 * 2256 * No requirements. 2257 */ 2258 void 2259 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2260 { 2261 vm_offset_t va_next; 2262 pml4_entry_t *pml4e; 2263 pdp_entry_t *pdpe; 2264 pd_entry_t ptpaddr, *pde; 2265 pt_entry_t *pte; 2266 vm_page_t pt_m; 2267 2268 if (pmap == NULL) 2269 return; 2270 2271 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) { 2272 pmap_remove(pmap, sva, eva); 2273 return; 2274 } 2275 2276 if (prot & VM_PROT_WRITE) 2277 return; 2278 2279 vm_object_hold(pmap->pm_pteobj); 2280 2281 for (; sva < eva; sva = va_next) { 2282 pml4e = pmap_pml4e(pmap, sva); 2283 if ((*pml4e & VPTE_V) == 0) { 2284 va_next = (sva + NBPML4) & ~PML4MASK; 2285 if (va_next < sva) 2286 va_next = eva; 2287 continue; 2288 } 2289 2290 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2291 if ((*pdpe & VPTE_V) == 0) { 2292 va_next = (sva + NBPDP) & ~PDPMASK; 2293 if (va_next < sva) 2294 va_next = eva; 2295 continue; 2296 } 2297 2298 va_next = (sva + NBPDR) & ~PDRMASK; 2299 if (va_next < sva) 2300 va_next = eva; 2301 2302 pde = pmap_pdpe_to_pde(pdpe, sva); 2303 ptpaddr = *pde; 2304 2305 #if 0 2306 /* 2307 * Check for large page. 2308 */ 2309 if ((ptpaddr & VPTE_PS) != 0) { 2310 /* JG correct? */ 2311 pmap_clean_pde(pde, pmap, sva); 2312 atomic_add_long(&pmap->pm_stats.resident_count, 2313 -NBPDR / PAGE_SIZE); 2314 continue; 2315 } 2316 #endif 2317 2318 /* 2319 * Weed out invalid mappings. Note: we assume that the page 2320 * directory table is always allocated, and in kernel virtual. 2321 */ 2322 if (ptpaddr == 0) 2323 continue; 2324 2325 if (va_next > eva) 2326 va_next = eva; 2327 2328 pt_m = pmap_hold_pt_page(pde, sva); 2329 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2330 sva += PAGE_SIZE) { 2331 /* 2332 * Clean managed pages and also check the accessed 2333 * bit. Just remove write perms for unmanaged 2334 * pages. Be careful of races, turning off write 2335 * access will force a fault rather then setting 2336 * the modified bit at an unexpected time. 2337 */ 2338 pmap_track_modified(pmap, sva); 2339 pmap_clean_pte(pte, pmap, sva, NULL); 2340 } 2341 vm_page_unhold(pt_m); 2342 } 2343 vm_object_drop(pmap->pm_pteobj); 2344 } 2345 2346 /* 2347 * Enter a managed page into a pmap. If the page is not wired related pmap 2348 * data can be destroyed at any time for later demand-operation. 2349 * 2350 * Insert the vm_page (m) at virtual address (v) in (pmap), with the 2351 * specified protection, and wire the mapping if requested. 2352 * 2353 * NOTE: This routine may not lazy-evaluate or lose information. The 2354 * page must actually be inserted into the given map NOW. 2355 * 2356 * NOTE: When entering a page at a KVA address, the pmap must be the 2357 * kernel_pmap. 2358 * 2359 * No requirements. 2360 */ 2361 void 2362 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2363 boolean_t wired, vm_map_entry_t entry __unused) 2364 { 2365 vm_paddr_t pa; 2366 pv_entry_t pv; 2367 pt_entry_t *pte; 2368 pt_entry_t origpte, newpte; 2369 vm_paddr_t opa; 2370 vm_page_t mpte; 2371 2372 if (pmap == NULL) 2373 return; 2374 2375 va = trunc_page(va); 2376 2377 vm_object_hold(pmap->pm_pteobj); 2378 2379 /* 2380 * Get the page table page. The kernel_pmap's page table pages 2381 * are preallocated and have no associated vm_page_t. 2382 * 2383 * If not NULL, mpte will be busied and we must vm_page_wakeup() 2384 * to cleanup. There will already be at least one wire count from 2385 * it being mapped into its parent. 2386 */ 2387 if (pmap == &kernel_pmap) { 2388 mpte = NULL; 2389 pte = vtopte(va); 2390 } else { 2391 mpte = pmap_allocpte(pmap, va); 2392 pte = (void *)PHYS_TO_DMAP(mpte->phys_addr); 2393 pte += pmap_pte_index(va); 2394 } 2395 2396 /* 2397 * Deal with races against the kernel's real MMU by cleaning the 2398 * page, even if we are re-entering the same page. 2399 */ 2400 pa = VM_PAGE_TO_PHYS(m); 2401 origpte = pmap_inval_loadandclear(pte, pmap, va); 2402 /*origpte = pmap_clean_pte(pte, pmap, va, NULL);*/ 2403 opa = origpte & VPTE_FRAME; 2404 2405 if (origpte & VPTE_PS) 2406 panic("pmap_enter: attempted pmap_enter on 2MB page"); 2407 2408 if ((origpte & (VPTE_MANAGED|VPTE_M)) == (VPTE_MANAGED|VPTE_M)) { 2409 vm_page_t om; 2410 2411 pmap_track_modified(pmap, va); 2412 om = PHYS_TO_VM_PAGE(opa); 2413 vm_page_dirty(om); 2414 } 2415 2416 /* 2417 * Mapping has not changed, must be protection or wiring change. 2418 */ 2419 if (origpte && (opa == pa)) { 2420 /* 2421 * Wiring change, just update stats. We don't worry about 2422 * wiring PT pages as they remain resident as long as there 2423 * are valid mappings in them. Hence, if a user page is wired, 2424 * the PT page will be also. 2425 */ 2426 if (wired && ((origpte & VPTE_WIRED) == 0)) 2427 atomic_add_long(&pmap->pm_stats.wired_count, 1); 2428 else if (!wired && (origpte & VPTE_WIRED)) 2429 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2430 2431 if (origpte & VPTE_MANAGED) { 2432 pa |= VPTE_MANAGED; 2433 KKASSERT(m->flags & PG_MAPPED); 2434 KKASSERT((m->flags & PG_FICTITIOUS) == 0); 2435 } else { 2436 KKASSERT((m->flags & PG_FICTITIOUS)); 2437 } 2438 vm_page_spin_lock(m); 2439 goto validate; 2440 } 2441 2442 /* 2443 * Bump the wire_count for the page table page. 2444 */ 2445 if (mpte) 2446 vm_page_wire_quick(mpte); 2447 2448 /* 2449 * Mapping has changed, invalidate old range and fall through to 2450 * handle validating new mapping. Don't inherit anything from 2451 * oldpte. 2452 */ 2453 if (opa) { 2454 int err; 2455 err = pmap_remove_pte(pmap, NULL, origpte, va); 2456 origpte = 0; 2457 if (err) 2458 panic("pmap_enter: pte vanished, va: 0x%lx", va); 2459 } 2460 2461 /* 2462 * Enter on the PV list if part of our managed memory. Note that we 2463 * raise IPL while manipulating pv_table since pmap_enter can be 2464 * called at interrupt time. 2465 */ 2466 if (pmap_initialized) { 2467 if ((m->flags & PG_FICTITIOUS) == 0) { 2468 /* 2469 * WARNING! We are using m's spin-lock as a 2470 * man's pte lock to interlock against 2471 * pmap_page_protect() operations. 2472 * 2473 * This is a bad hack (obviously). 2474 */ 2475 pv = get_pv_entry(); 2476 vm_page_spin_lock(m); 2477 pmap_insert_entry(pmap, va, mpte, m, pv); 2478 pa |= VPTE_MANAGED; 2479 /* vm_page_spin_unlock(m); */ 2480 } else { 2481 vm_page_spin_lock(m); 2482 } 2483 } else { 2484 vm_page_spin_lock(m); 2485 } 2486 2487 /* 2488 * Increment counters 2489 */ 2490 atomic_add_long(&pmap->pm_stats.resident_count, 1); 2491 if (wired) 2492 atomic_add_long(&pmap->pm_stats.wired_count, 1); 2493 2494 validate: 2495 /* 2496 * Now validate mapping with desired protection/wiring. 2497 */ 2498 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | VPTE_V | VPTE_U); 2499 newpte |= VPTE_A; 2500 2501 if (wired) 2502 newpte |= VPTE_WIRED; 2503 // if (pmap != &kernel_pmap) 2504 newpte |= VPTE_U; 2505 if (newpte & VPTE_RW) 2506 vm_page_flag_set(m, PG_WRITEABLE); 2507 KKASSERT((newpte & VPTE_MANAGED) == 0 || (m->flags & PG_MAPPED)); 2508 2509 origpte = atomic_swap_long(pte, newpte); 2510 if (origpte & VPTE_M) { 2511 kprintf("pmap [M] race @ %016jx\n", va); 2512 atomic_set_long(pte, VPTE_M); 2513 } 2514 vm_page_spin_unlock(m); 2515 2516 if (mpte) 2517 vm_page_wakeup(mpte); 2518 vm_object_drop(pmap->pm_pteobj); 2519 } 2520 2521 /* 2522 * Make a temporary mapping for a physical address. This is only intended 2523 * to be used for panic dumps. 2524 * 2525 * The caller is responsible for calling smp_invltlb(). 2526 */ 2527 void * 2528 pmap_kenter_temporary(vm_paddr_t pa, long i) 2529 { 2530 pmap_kenter_quick(crashdumpmap + (i * PAGE_SIZE), pa); 2531 return ((void *)crashdumpmap); 2532 } 2533 2534 #define MAX_INIT_PT (96) 2535 2536 /* 2537 * This routine preloads the ptes for a given object into the specified pmap. 2538 * This eliminates the blast of soft faults on process startup and 2539 * immediately after an mmap. 2540 * 2541 * No requirements. 2542 */ 2543 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 2544 2545 void 2546 pmap_object_init_pt(pmap_t pmap, vm_map_entry_t entry, 2547 vm_offset_t addr, vm_size_t size, int limit) 2548 { 2549 vm_prot_t prot = entry->protection; 2550 vm_object_t object = entry->ba.object; 2551 vm_pindex_t pindex = atop(entry->ba.offset + (addr - entry->ba.start)); 2552 struct rb_vm_page_scan_info info; 2553 struct lwp *lp; 2554 vm_size_t psize; 2555 2556 /* 2557 * We can't preinit if read access isn't set or there is no pmap 2558 * or object. 2559 */ 2560 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 2561 return; 2562 2563 /* 2564 * We can't preinit if the pmap is not the current pmap 2565 */ 2566 lp = curthread->td_lwp; 2567 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 2568 return; 2569 2570 /* 2571 * Misc additional checks 2572 */ 2573 psize = x86_64_btop(size); 2574 2575 if ((object->type != OBJT_VNODE) || 2576 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 2577 (object->resident_page_count > MAX_INIT_PT))) { 2578 return; 2579 } 2580 2581 if (psize + pindex > object->size) { 2582 if (object->size < pindex) 2583 return; 2584 psize = object->size - pindex; 2585 } 2586 2587 if (psize == 0) 2588 return; 2589 2590 /* 2591 * Use a red-black scan to traverse the requested range and load 2592 * any valid pages found into the pmap. 2593 * 2594 * We cannot safely scan the object's memq unless we are in a 2595 * critical section since interrupts can remove pages from objects. 2596 */ 2597 info.start_pindex = pindex; 2598 info.end_pindex = pindex + psize - 1; 2599 info.limit = limit; 2600 info.mpte = NULL; 2601 info.addr = addr; 2602 info.pmap = pmap; 2603 info.entry = entry; 2604 2605 vm_object_hold_shared(object); 2606 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 2607 pmap_object_init_pt_callback, &info); 2608 vm_object_drop(object); 2609 } 2610 2611 static 2612 int 2613 pmap_object_init_pt_callback(vm_page_t p, void *data) 2614 { 2615 struct rb_vm_page_scan_info *info = data; 2616 vm_pindex_t rel_index; 2617 /* 2618 * don't allow an madvise to blow away our really 2619 * free pages allocating pv entries. 2620 */ 2621 if ((info->limit & MAP_PREFAULT_MADVISE) && 2622 vmstats.v_free_count < vmstats.v_free_reserved) { 2623 return(-1); 2624 } 2625 2626 /* 2627 * Ignore list markers and ignore pages we cannot instantly 2628 * busy (while holding the object token). 2629 */ 2630 if (p->flags & PG_MARKER) 2631 return 0; 2632 if (vm_page_busy_try(p, TRUE)) 2633 return 0; 2634 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 2635 (p->flags & PG_FICTITIOUS) == 0) { 2636 if ((p->queue - p->pc) == PQ_CACHE) 2637 vm_page_deactivate(p); 2638 rel_index = p->pindex - info->start_pindex; 2639 pmap_enter(info->pmap, info->addr + x86_64_ptob(rel_index), p, 2640 VM_PROT_READ, FALSE, info->entry); 2641 } 2642 vm_page_wakeup(p); 2643 return(0); 2644 } 2645 2646 /* 2647 * Return TRUE if the pmap is in shape to trivially 2648 * pre-fault the specified address. 2649 * 2650 * Returns FALSE if it would be non-trivial or if a 2651 * pte is already loaded into the slot. 2652 * 2653 * No requirements. 2654 */ 2655 int 2656 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 2657 { 2658 pt_entry_t *pte; 2659 pd_entry_t *pde; 2660 int ret; 2661 2662 vm_object_hold(pmap->pm_pteobj); 2663 pde = pmap_pde(pmap, addr); 2664 if (pde == NULL || *pde == 0) { 2665 ret = 0; 2666 } else { 2667 pte = pmap_pde_to_pte(pde, addr); 2668 ret = (*pte) ? 0 : 1; 2669 } 2670 vm_object_drop(pmap->pm_pteobj); 2671 2672 return (ret); 2673 } 2674 2675 /* 2676 * Change the wiring attribute for a map/virtual-address pair. 2677 * 2678 * The mapping must already exist in the pmap. 2679 * No other requirements. 2680 */ 2681 vm_page_t 2682 pmap_unwire(pmap_t pmap, vm_offset_t va) 2683 { 2684 pt_entry_t *pte; 2685 vm_paddr_t pa; 2686 vm_page_t m; 2687 2688 if (pmap == NULL) 2689 return NULL; 2690 2691 vm_object_hold(pmap->pm_pteobj); 2692 pte = pmap_pte(pmap, va); 2693 2694 if (pte == NULL || (*pte & VPTE_V) == 0) { 2695 vm_object_drop(pmap->pm_pteobj); 2696 return NULL; 2697 } 2698 2699 /* 2700 * Wiring is not a hardware characteristic so there is no need to 2701 * invalidate TLB. However, in an SMP environment we must use 2702 * a locked bus cycle to update the pte (if we are not using 2703 * the pmap_inval_*() API that is)... it's ok to do this for simple 2704 * wiring changes. 2705 */ 2706 if (pmap_pte_w(pte)) 2707 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2708 /* XXX else return NULL so caller doesn't unwire m ? */ 2709 atomic_clear_long(pte, VPTE_WIRED); 2710 2711 pa = *pte & VPTE_FRAME; 2712 m = PHYS_TO_VM_PAGE(pa); /* held by wired count */ 2713 2714 vm_object_drop(pmap->pm_pteobj); 2715 2716 return m; 2717 } 2718 2719 /* 2720 * Copy the range specified by src_addr/len 2721 * from the source map to the range dst_addr/len 2722 * in the destination map. 2723 * 2724 * This routine is only advisory and need not do anything. 2725 */ 2726 void 2727 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 2728 vm_size_t len, vm_offset_t src_addr) 2729 { 2730 /* 2731 * XXX BUGGY. Amoung other things srcmpte is assumed to remain 2732 * valid through blocking calls, and that's just not going to 2733 * be the case. 2734 * 2735 * FIXME! 2736 */ 2737 return; 2738 } 2739 2740 /* 2741 * pmap_zero_page: 2742 * 2743 * Zero the specified physical page. 2744 * 2745 * This function may be called from an interrupt and no locking is 2746 * required. 2747 */ 2748 void 2749 pmap_zero_page(vm_paddr_t phys) 2750 { 2751 vm_offset_t va = PHYS_TO_DMAP(phys); 2752 2753 bzero((void *)va, PAGE_SIZE); 2754 } 2755 2756 /* 2757 * pmap_zero_page: 2758 * 2759 * Zero part of a physical page by mapping it into memory and clearing 2760 * its contents with bzero. 2761 * 2762 * off and size may not cover an area beyond a single hardware page. 2763 */ 2764 void 2765 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 2766 { 2767 vm_offset_t virt = PHYS_TO_DMAP(phys); 2768 2769 bzero((char *)virt + off, size); 2770 } 2771 2772 /* 2773 * pmap_copy_page: 2774 * 2775 * Copy the physical page from the source PA to the target PA. 2776 * This function may be called from an interrupt. No locking 2777 * is required. 2778 */ 2779 void 2780 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 2781 { 2782 vm_offset_t src_virt, dst_virt; 2783 2784 src_virt = PHYS_TO_DMAP(src); 2785 dst_virt = PHYS_TO_DMAP(dst); 2786 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 2787 } 2788 2789 /* 2790 * pmap_copy_page_frag: 2791 * 2792 * Copy the physical page from the source PA to the target PA. 2793 * This function may be called from an interrupt. No locking 2794 * is required. 2795 */ 2796 void 2797 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 2798 { 2799 vm_offset_t src_virt, dst_virt; 2800 2801 src_virt = PHYS_TO_DMAP(src); 2802 dst_virt = PHYS_TO_DMAP(dst); 2803 bcopy((char *)src_virt + (src & PAGE_MASK), 2804 (char *)dst_virt + (dst & PAGE_MASK), 2805 bytes); 2806 } 2807 2808 /* 2809 * Remove all pages from specified address space this aids process 2810 * exit speeds. Also, this code is special cased for current 2811 * process only, but can have the more generic (and slightly slower) 2812 * mode enabled. This is much faster than pmap_remove in the case 2813 * of running down an entire address space. 2814 * 2815 * No other requirements. 2816 */ 2817 void 2818 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2819 { 2820 pmap_remove(pmap, sva, eva); 2821 #if 0 2822 pt_entry_t *pte, tpte; 2823 pv_entry_t pv, npv; 2824 vm_page_t m; 2825 int save_generation; 2826 2827 if (pmap->pm_pteobj) 2828 vm_object_hold(pmap->pm_pteobj); 2829 2830 pmap_invalidate_range(pmap, sva, eva); 2831 2832 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 2833 if (pv->pv_va >= eva || pv->pv_va < sva) { 2834 npv = TAILQ_NEXT(pv, pv_plist); 2835 continue; 2836 } 2837 2838 KKASSERT(pmap == pv->pv_pmap); 2839 2840 pte = pmap_pte(pmap, pv->pv_va); 2841 2842 /* 2843 * We cannot remove wired pages from a process' mapping 2844 * at this time 2845 */ 2846 if (*pte & VPTE_WIRED) { 2847 npv = TAILQ_NEXT(pv, pv_plist); 2848 continue; 2849 } 2850 tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va); 2851 2852 m = PHYS_TO_VM_PAGE(tpte & VPTE_FRAME); 2853 vm_page_spin_lock(m); 2854 2855 KASSERT(m < &vm_page_array[vm_page_array_size], 2856 ("pmap_remove_pages: bad tpte %lx", tpte)); 2857 2858 KKASSERT(pmap->pm_stats.resident_count > 0); 2859 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2860 2861 /* 2862 * Update the vm_page_t clean and reference bits. 2863 */ 2864 if (tpte & VPTE_M) { 2865 vm_page_dirty(m); 2866 } 2867 2868 npv = TAILQ_NEXT(pv, pv_plist); 2869 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 2870 atomic_add_int(&pmap->pm_generation, 1); 2871 save_generation = pmap->pm_generation; 2872 m->md.pv_list_count--; 2873 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2874 if (TAILQ_EMPTY(&m->md.pv_list)) 2875 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2876 vm_page_spin_unlock(m); 2877 2878 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); 2879 free_pv_entry(pv); 2880 2881 /* 2882 * Restart the scan if we blocked during the unuse or free 2883 * calls and other removals were made. 2884 */ 2885 if (save_generation != pmap->pm_generation) { 2886 kprintf("Warning: pmap_remove_pages race-A avoided\n"); 2887 npv = TAILQ_FIRST(&pmap->pm_pvlist); 2888 } 2889 } 2890 if (pmap->pm_pteobj) 2891 vm_object_drop(pmap->pm_pteobj); 2892 pmap_remove(pmap, sva, eva); 2893 #endif 2894 } 2895 2896 /* 2897 * pmap_testbit tests bits in active mappings of a VM page. 2898 */ 2899 static boolean_t 2900 pmap_testbit(vm_page_t m, int bit) 2901 { 2902 pv_entry_t pv; 2903 pt_entry_t *pte; 2904 2905 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2906 return FALSE; 2907 2908 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 2909 return FALSE; 2910 2911 vm_page_spin_lock(m); 2912 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2913 /* 2914 * if the bit being tested is the modified bit, then 2915 * mark clean_map and ptes as never 2916 * modified. 2917 */ 2918 if (bit & (VPTE_A|VPTE_M)) 2919 pmap_track_modified(pv->pv_pmap, pv->pv_va); 2920 2921 #if defined(PMAP_DIAGNOSTIC) 2922 if (pv->pv_pmap == NULL) { 2923 kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); 2924 continue; 2925 } 2926 #endif 2927 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 2928 if (*pte & bit) { 2929 vm_page_spin_unlock(m); 2930 return TRUE; 2931 } 2932 } 2933 vm_page_spin_unlock(m); 2934 return (FALSE); 2935 } 2936 2937 /* 2938 * This routine is used to clear bits in ptes. Certain bits require special 2939 * handling, in particular (on virtual kernels) the VPTE_M (modify) bit. 2940 * 2941 * This routine is only called with certain VPTE_* bit combinations. 2942 */ 2943 static __inline void 2944 pmap_clearbit(vm_page_t m, int bit) 2945 { 2946 pv_entry_t pv; 2947 pt_entry_t *pte; 2948 pt_entry_t pbits; 2949 vm_object_t pmobj; 2950 pmap_t pmap; 2951 2952 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 2953 if (bit == VPTE_RW) 2954 vm_page_flag_clear(m, PG_WRITEABLE); 2955 return; 2956 } 2957 2958 /* 2959 * Loop over all current mappings setting/clearing as appropos If 2960 * setting RO do we need to clear the VAC? 2961 */ 2962 restart: 2963 vm_page_spin_lock(m); 2964 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2965 /* 2966 * Need the pmap object lock(?) 2967 */ 2968 pmap = pv->pv_pmap; 2969 pmobj = pmap->pm_pteobj; 2970 2971 if (vm_object_hold_try(pmobj) == 0) { 2972 refcount_acquire(&pmobj->hold_count); 2973 vm_page_spin_unlock(m); 2974 vm_object_lock(pmobj); 2975 vm_object_drop(pmobj); 2976 goto restart; 2977 } 2978 2979 /* 2980 * don't write protect pager mappings 2981 */ 2982 if (bit == VPTE_RW) { 2983 pmap_track_modified(pv->pv_pmap, pv->pv_va); 2984 } 2985 2986 #if defined(PMAP_DIAGNOSTIC) 2987 if (pv->pv_pmap == NULL) { 2988 kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); 2989 vm_object_drop(pmobj); 2990 continue; 2991 } 2992 #endif 2993 2994 /* 2995 * Careful here. We can use a locked bus instruction to 2996 * clear VPTE_A or VPTE_M safely but we need to synchronize 2997 * with the target cpus when we mess with VPTE_RW. 2998 * 2999 * On virtual kernels we must force a new fault-on-write 3000 * in the real kernel if we clear the Modify bit ourselves, 3001 * otherwise the real kernel will not get a new fault and 3002 * will never set our Modify bit again. 3003 */ 3004 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 3005 if (*pte & bit) { 3006 if (bit == VPTE_RW) { 3007 /* 3008 * We must also clear VPTE_M when clearing 3009 * VPTE_RW and synchronize its state to 3010 * the page. 3011 */ 3012 pmap_track_modified(pv->pv_pmap, pv->pv_va); 3013 pbits = pmap_clean_pte(pte, pv->pv_pmap, 3014 pv->pv_va, m); 3015 } else if (bit == VPTE_M) { 3016 /* 3017 * We must invalidate the real-kernel pte 3018 * when clearing VPTE_M bit to force the 3019 * real-kernel to take a new fault to re-set 3020 * VPTE_M. 3021 */ 3022 atomic_clear_long(pte, VPTE_M); 3023 if (*pte & VPTE_RW) { 3024 pmap_invalidate_range(pv->pv_pmap, 3025 pv->pv_va, 3026 pv->pv_va + PAGE_SIZE); 3027 } 3028 } else if ((bit & (VPTE_RW|VPTE_M)) == 3029 (VPTE_RW|VPTE_M)) { 3030 /* 3031 * We've been asked to clear W & M, I guess 3032 * the caller doesn't want us to update 3033 * the dirty status of the VM page. 3034 */ 3035 pmap_track_modified(pv->pv_pmap, pv->pv_va); 3036 pmap_clean_pte(pte, pv->pv_pmap, pv->pv_va, m); 3037 panic("shouldn't be called"); 3038 } else { 3039 /* 3040 * We've been asked to clear bits that do 3041 * not interact with hardware. 3042 */ 3043 atomic_clear_long(pte, bit); 3044 } 3045 } 3046 vm_object_drop(pmobj); 3047 } 3048 if (bit == VPTE_RW) 3049 vm_page_flag_clear(m, PG_WRITEABLE); 3050 vm_page_spin_unlock(m); 3051 } 3052 3053 /* 3054 * Lower the permission for all mappings to a given page. 3055 * 3056 * No other requirements. 3057 */ 3058 void 3059 pmap_page_protect(vm_page_t m, vm_prot_t prot) 3060 { 3061 if ((prot & VM_PROT_WRITE) == 0) { 3062 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 3063 pmap_clearbit(m, VPTE_RW); 3064 } else { 3065 pmap_remove_all(m); 3066 } 3067 } 3068 } 3069 3070 vm_paddr_t 3071 pmap_phys_address(vm_pindex_t ppn) 3072 { 3073 return (x86_64_ptob(ppn)); 3074 } 3075 3076 /* 3077 * Return a count of reference bits for a page, clearing those bits. 3078 * It is not necessary for every reference bit to be cleared, but it 3079 * is necessary that 0 only be returned when there are truly no 3080 * reference bits set. 3081 * 3082 * XXX: The exact number of bits to check and clear is a matter that 3083 * should be tested and standardized at some point in the future for 3084 * optimal aging of shared pages. 3085 * 3086 * No other requirements. 3087 */ 3088 int 3089 pmap_ts_referenced(vm_page_t m) 3090 { 3091 pv_entry_t pv, pvf, pvn; 3092 pt_entry_t *pte; 3093 int rtval = 0; 3094 3095 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3096 return (rtval); 3097 3098 vm_page_spin_lock(m); 3099 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3100 pvf = pv; 3101 do { 3102 pvn = TAILQ_NEXT(pv, pv_list); 3103 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3104 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3105 3106 pmap_track_modified(pv->pv_pmap, pv->pv_va); 3107 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 3108 3109 if (pte && (*pte & VPTE_A)) { 3110 atomic_clear_long(pte, VPTE_A); 3111 rtval++; 3112 if (rtval > 4) { 3113 break; 3114 } 3115 } 3116 } while ((pv = pvn) != NULL && pv != pvf); 3117 } 3118 vm_page_spin_unlock(m); 3119 3120 return (rtval); 3121 } 3122 3123 /* 3124 * Return whether or not the specified physical page was modified 3125 * in any physical maps. 3126 * 3127 * No other requirements. 3128 */ 3129 boolean_t 3130 pmap_is_modified(vm_page_t m) 3131 { 3132 boolean_t res; 3133 3134 res = pmap_testbit(m, VPTE_M); 3135 3136 return (res); 3137 } 3138 3139 /* 3140 * Clear the modify bits on the specified physical page. For the vkernel 3141 * we really need to clean the page, which clears VPTE_RW and VPTE_M, in 3142 * order to ensure that we take a fault on the next write to the page. 3143 * Otherwise the page may become dirty without us knowing it. 3144 * 3145 * No other requirements. 3146 */ 3147 void 3148 pmap_clear_modify(vm_page_t m) 3149 { 3150 pmap_clearbit(m, VPTE_RW); 3151 } 3152 3153 /* 3154 * Clear the reference bit on the specified physical page. 3155 * 3156 * No other requirements. 3157 */ 3158 void 3159 pmap_clear_reference(vm_page_t m) 3160 { 3161 pmap_clearbit(m, VPTE_A); 3162 } 3163 3164 /* 3165 * Miscellaneous support routines follow 3166 */ 3167 static void 3168 x86_64_protection_init(void) 3169 { 3170 uint64_t *kp; 3171 int prot; 3172 3173 kp = protection_codes; 3174 for (prot = 0; prot < 8; prot++) { 3175 if (prot & VM_PROT_READ) 3176 *kp |= 0; /* R */ 3177 if (prot & VM_PROT_WRITE) 3178 *kp |= VPTE_RW; /* R+W */ 3179 if (prot && (prot & VM_PROT_EXECUTE) == 0) 3180 *kp |= VPTE_NX; /* NX - !executable */ 3181 ++kp; 3182 } 3183 } 3184 3185 /* 3186 * Sets the memory attribute for the specified page. 3187 */ 3188 void 3189 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 3190 { 3191 /* This is a vkernel, do nothing */ 3192 } 3193 3194 /* 3195 * Change the PAT attribute on an existing kernel memory map. Caller 3196 * must ensure that the virtual memory in question is not accessed 3197 * during the adjustment. 3198 */ 3199 void 3200 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 3201 { 3202 /* This is a vkernel, do nothing */ 3203 } 3204 3205 /* 3206 * Perform the pmap work for mincore 3207 * 3208 * No other requirements. 3209 */ 3210 int 3211 pmap_mincore(pmap_t pmap, vm_offset_t addr) 3212 { 3213 pt_entry_t *ptep, pte; 3214 vm_page_t m; 3215 int val = 0; 3216 3217 vm_object_hold(pmap->pm_pteobj); 3218 ptep = pmap_pte(pmap, addr); 3219 3220 if (ptep && (pte = *ptep) != 0) { 3221 vm_paddr_t pa; 3222 3223 val = MINCORE_INCORE; 3224 if ((pte & VPTE_MANAGED) == 0) 3225 goto done; 3226 3227 pa = pte & VPTE_FRAME; 3228 3229 m = PHYS_TO_VM_PAGE(pa); 3230 3231 /* 3232 * Modified by us 3233 */ 3234 if (pte & VPTE_M) 3235 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 3236 /* 3237 * Modified by someone 3238 */ 3239 else if (m->dirty || pmap_is_modified(m)) 3240 val |= MINCORE_MODIFIED_OTHER; 3241 /* 3242 * Referenced by us 3243 */ 3244 if (pte & VPTE_A) 3245 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 3246 3247 /* 3248 * Referenced by someone 3249 */ 3250 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { 3251 val |= MINCORE_REFERENCED_OTHER; 3252 vm_page_flag_set(m, PG_REFERENCED); 3253 } 3254 } 3255 done: 3256 vm_object_drop(pmap->pm_pteobj); 3257 3258 return val; 3259 } 3260 3261 /* 3262 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 3263 * vmspace will be ref'd and the old one will be deref'd. 3264 * 3265 * Caller must hold vmspace->vm_map.token for oldvm and newvm 3266 */ 3267 void 3268 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 3269 { 3270 struct vmspace *oldvm; 3271 struct lwp *lp; 3272 3273 oldvm = p->p_vmspace; 3274 if (oldvm != newvm) { 3275 if (adjrefs) 3276 vmspace_ref(newvm); 3277 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 3278 p->p_vmspace = newvm; 3279 KKASSERT(p->p_nthreads == 1); 3280 lp = RB_ROOT(&p->p_lwp_tree); 3281 pmap_setlwpvm(lp, newvm); 3282 if (adjrefs) 3283 vmspace_rel(oldvm); 3284 } 3285 } 3286 3287 /* 3288 * Set the vmspace for a LWP. The vmspace is almost universally set the 3289 * same as the process vmspace, but virtual kernels need to swap out contexts 3290 * on a per-lwp basis. 3291 */ 3292 void 3293 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 3294 { 3295 struct vmspace *oldvm; 3296 struct pmap *pmap; 3297 3298 oldvm = lp->lwp_vmspace; 3299 if (oldvm != newvm) { 3300 crit_enter(); 3301 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 3302 lp->lwp_vmspace = newvm; 3303 if (curthread->td_lwp == lp) { 3304 pmap = vmspace_pmap(newvm); 3305 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 3306 if (pmap->pm_active_lock & CPULOCK_EXCL) 3307 pmap_interlock_wait(newvm); 3308 #if defined(SWTCH_OPTIM_STATS) 3309 tlb_flush_count++; 3310 #endif 3311 pmap = vmspace_pmap(oldvm); 3312 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 3313 mycpu->gd_cpuid); 3314 } 3315 crit_exit(); 3316 } 3317 } 3318 3319 /* 3320 * The swtch code tried to switch in a heavy weight process whos pmap 3321 * is locked by another cpu. We have to wait for the lock to clear before 3322 * the pmap can be used. 3323 */ 3324 void 3325 pmap_interlock_wait (struct vmspace *vm) 3326 { 3327 pmap_t pmap = vmspace_pmap(vm); 3328 3329 if (pmap->pm_active_lock & CPULOCK_EXCL) { 3330 crit_enter(); 3331 while (pmap->pm_active_lock & CPULOCK_EXCL) { 3332 cpu_ccfence(); 3333 pthread_yield(); 3334 } 3335 crit_exit(); 3336 } 3337 } 3338 3339 vm_offset_t 3340 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 3341 { 3342 3343 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 3344 return addr; 3345 } 3346 3347 addr = roundup2(addr, NBPDR); 3348 return addr; 3349 } 3350 3351 /* 3352 * Used by kmalloc/kfree, page already exists at va 3353 */ 3354 vm_page_t 3355 pmap_kvtom(vm_offset_t va) 3356 { 3357 vpte_t *ptep; 3358 3359 KKASSERT(va >= KvaStart && va < KvaEnd); 3360 ptep = vtopte(va); 3361 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); 3362 } 3363 3364 void 3365 pmap_object_init(vm_object_t object) 3366 { 3367 /* empty */ 3368 } 3369 3370 void 3371 pmap_object_free(vm_object_t object) 3372 { 3373 /* empty */ 3374 } 3375 3376 void 3377 pmap_pgscan(struct pmap_pgscan_info *pginfo) 3378 { 3379 pmap_t pmap = pginfo->pmap; 3380 vm_offset_t sva = pginfo->beg_addr; 3381 vm_offset_t eva = pginfo->end_addr; 3382 vm_offset_t va_next; 3383 pml4_entry_t *pml4e; 3384 pdp_entry_t *pdpe; 3385 pd_entry_t ptpaddr, *pde; 3386 pt_entry_t *pte; 3387 vm_page_t pt_m; 3388 int stop = 0; 3389 3390 vm_object_hold(pmap->pm_pteobj); 3391 3392 for (; sva < eva; sva = va_next) { 3393 if (stop) 3394 break; 3395 3396 pml4e = pmap_pml4e(pmap, sva); 3397 if ((*pml4e & VPTE_V) == 0) { 3398 va_next = (sva + NBPML4) & ~PML4MASK; 3399 if (va_next < sva) 3400 va_next = eva; 3401 continue; 3402 } 3403 3404 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3405 if ((*pdpe & VPTE_V) == 0) { 3406 va_next = (sva + NBPDP) & ~PDPMASK; 3407 if (va_next < sva) 3408 va_next = eva; 3409 continue; 3410 } 3411 3412 va_next = (sva + NBPDR) & ~PDRMASK; 3413 if (va_next < sva) 3414 va_next = eva; 3415 3416 pde = pmap_pdpe_to_pde(pdpe, sva); 3417 ptpaddr = *pde; 3418 3419 #if 0 3420 /* 3421 * Check for large page (ignore). 3422 */ 3423 if ((ptpaddr & VPTE_PS) != 0) { 3424 #if 0 3425 pmap_clean_pde(pde, pmap, sva); 3426 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 3427 #endif 3428 continue; 3429 } 3430 #endif 3431 3432 /* 3433 * Weed out invalid mappings. Note: we assume that the page 3434 * directory table is always allocated, and in kernel virtual. 3435 */ 3436 if (ptpaddr == 0) 3437 continue; 3438 3439 if (va_next > eva) 3440 va_next = eva; 3441 3442 pt_m = pmap_hold_pt_page(pde, sva); 3443 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3444 sva += PAGE_SIZE) { 3445 vm_page_t m; 3446 3447 if (stop) 3448 break; 3449 if ((*pte & VPTE_MANAGED) == 0) 3450 continue; 3451 3452 m = PHYS_TO_VM_PAGE(*pte & VPTE_FRAME); 3453 if (vm_page_busy_try(m, TRUE) == 0) { 3454 if (pginfo->callback(pginfo, sva, m) < 0) 3455 stop = 1; 3456 } 3457 } 3458 vm_page_unhold(pt_m); 3459 } 3460 vm_object_drop(pmap->pm_pteobj); 3461 } 3462 3463 void 3464 pmap_maybethreaded(pmap_t pmap) 3465 { 3466 /* nop */ 3467 } 3468 3469 /* 3470 * Called while page is hard-busied to clear the PG_MAPPED and PG_WRITEABLE 3471 * flags if able. 3472 * 3473 * vkernel code is using the old pmap style so the flags should already 3474 * be properly set. 3475 */ 3476 int 3477 pmap_mapped_sync(vm_page_t m) 3478 { 3479 return (m->flags); 3480 } 3481