1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008, 2009 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $ 45 */ 46 47 /* 48 * Manages physical address maps. 
49 */ 50 51 #include "opt_msgbuf.h" 52 53 #include <sys/param.h> 54 #include <sys/systm.h> 55 #include <sys/kernel.h> 56 #include <sys/proc.h> 57 #include <sys/msgbuf.h> 58 #include <sys/vmmeter.h> 59 #include <sys/mman.h> 60 #include <sys/vmspace.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_param.h> 64 #include <sys/sysctl.h> 65 #include <sys/lock.h> 66 #include <vm/vm_kern.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_map.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_extern.h> 71 #include <vm/vm_pageout.h> 72 #include <vm/vm_pager.h> 73 #include <vm/vm_zone.h> 74 75 #include <sys/user.h> 76 #include <sys/thread2.h> 77 #include <sys/spinlock2.h> 78 #include <vm/vm_page2.h> 79 80 #include <machine/cputypes.h> 81 #include <machine/md_var.h> 82 #include <machine/specialreg.h> 83 #include <machine/smp.h> 84 #include <machine/globaldata.h> 85 #include <machine/pmap.h> 86 #include <machine/pmap_inval.h> 87 88 #include <ddb/ddb.h> 89 90 #include <stdio.h> 91 #include <assert.h> 92 #include <stdlib.h> 93 #include <pthread.h> 94 95 #define PMAP_KEEP_PDIRS 96 #ifndef PMAP_SHPGPERPROC 97 #define PMAP_SHPGPERPROC 1000 98 #endif 99 100 #if defined(DIAGNOSTIC) 101 #define PMAP_DIAGNOSTIC 102 #endif 103 104 #define MINPV 2048 105 106 #if !defined(PMAP_DIAGNOSTIC) 107 #define PMAP_INLINE __inline 108 #else 109 #define PMAP_INLINE 110 #endif 111 112 /* 113 * Get PDEs and PTEs for user/kernel address space 114 */ 115 static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va); 116 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 117 118 #define pmap_pde_v(pte) ((*(pd_entry_t *)pte & VPTE_V) != 0) 119 #define pmap_pte_w(pte) ((*(pt_entry_t *)pte & VPTE_WIRED) != 0) 120 #define pmap_pte_m(pte) ((*(pt_entry_t *)pte & VPTE_M) != 0) 121 #define pmap_pte_u(pte) ((*(pt_entry_t *)pte & VPTE_A) != 0) 122 #define pmap_pte_v(pte) ((*(pt_entry_t *)pte & VPTE_V) != 0) 123 124 /* 125 * Given a map and a machine independent protection code, 126 * convert to a vax protection code. 127 */ 128 #define pte_prot(m, p) \ 129 (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 130 static uint64_t protection_codes[8]; 131 132 struct pmap kernel_pmap; 133 134 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ 135 136 static struct vm_object kptobj; 137 static int nkpt; 138 139 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 140 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 141 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 142 143 extern int vmm_enabled; 144 extern void *vkernel_stack; 145 146 /* 147 * Data for the pv entry allocation mechanism 148 */ 149 static vm_zone_t pvzone; 150 static struct vm_zone pvzone_store; 151 static vm_pindex_t pv_entry_count = 0; 152 static vm_pindex_t pv_entry_max = 0; 153 static vm_pindex_t pv_entry_high_water = 0; 154 static int pmap_pagedaemon_waken = 0; 155 static struct pv_entry *pvinit; 156 157 /* 158 * All those kernel PT submaps that BSD is so fond of 159 */ 160 pt_entry_t *CMAP1 = NULL, *ptmmap; 161 caddr_t CADDR1 = NULL; 162 static pt_entry_t *msgbufmap; 163 164 uint64_t KPTphys; 165 166 static PMAP_INLINE void free_pv_entry (pv_entry_t pv); 167 static pv_entry_t get_pv_entry (void); 168 static void x86_64_protection_init (void); 169 static __inline void pmap_clearbit (vm_page_t m, int bit); 170 171 static void pmap_remove_all (vm_page_t m); 172 static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq, 173 pt_entry_t oldpte, vm_offset_t sva); 174 static void pmap_remove_page (struct pmap *pmap, vm_offset_t va); 175 static int pmap_remove_entry (struct pmap *pmap, vm_page_t m, 176 vm_offset_t va); 177 static boolean_t pmap_testbit (vm_page_t m, int bit); 178 static void pmap_insert_entry (pmap_t pmap, vm_offset_t va, 179 vm_page_t mpte, vm_page_t m, pv_entry_t); 180 181 static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va); 182 183 static int pmap_release_free_page (pmap_t pmap, vm_page_t p); 184 static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex); 185 static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex); 186 static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t); 187 188 static int 189 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 190 { 191 if (pv1->pv_va < pv2->pv_va) 192 return(-1); 193 if (pv1->pv_va > pv2->pv_va) 194 return(1); 195 return(0); 196 } 197 198 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 199 pv_entry_compare, vm_offset_t, pv_va); 200 201 static __inline vm_pindex_t 202 pmap_pt_pindex(vm_offset_t va) 203 { 204 return va >> PDRSHIFT; 205 } 206 207 static __inline vm_pindex_t 208 pmap_pte_index(vm_offset_t va) 209 { 210 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); 211 } 212 213 static __inline vm_pindex_t 214 pmap_pde_index(vm_offset_t va) 215 { 216 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 217 } 218 219 static __inline vm_pindex_t 220 pmap_pdpe_index(vm_offset_t va) 221 { 222 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 223 } 224 225 static __inline vm_pindex_t 226 pmap_pml4e_index(vm_offset_t va) 227 { 228 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 229 } 230 231 /* Return a pointer to the PML4 slot that corresponds to a VA */ 232 static __inline pml4_entry_t * 233 pmap_pml4e(pmap_t pmap, vm_offset_t va) 234 { 235 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 236 } 237 238 /* Return a pointer to the PDP slot that corresponds to a VA */ 239 static __inline pdp_entry_t * 240 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 241 { 242 pdp_entry_t *pdpe; 243 244 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & VPTE_FRAME); 245 return (&pdpe[pmap_pdpe_index(va)]); 246 } 247 248 /* Return a pointer to the PDP slot that corresponds to a VA */ 249 static __inline pdp_entry_t * 250 pmap_pdpe(pmap_t 
pmap, vm_offset_t va) 251 { 252 pml4_entry_t *pml4e; 253 254 pml4e = pmap_pml4e(pmap, va); 255 if ((*pml4e & VPTE_V) == 0) 256 return NULL; 257 return (pmap_pml4e_to_pdpe(pml4e, va)); 258 } 259 260 /* Return a pointer to the PD slot that corresponds to a VA */ 261 static __inline pd_entry_t * 262 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 263 { 264 pd_entry_t *pde; 265 266 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & VPTE_FRAME); 267 return (&pde[pmap_pde_index(va)]); 268 } 269 270 /* Return a pointer to the PD slot that corresponds to a VA */ 271 static __inline pd_entry_t * 272 pmap_pde(pmap_t pmap, vm_offset_t va) 273 { 274 pdp_entry_t *pdpe; 275 276 pdpe = pmap_pdpe(pmap, va); 277 if (pdpe == NULL || (*pdpe & VPTE_V) == 0) 278 return NULL; 279 return (pmap_pdpe_to_pde(pdpe, va)); 280 } 281 282 /* Return a pointer to the PT slot that corresponds to a VA */ 283 static __inline pt_entry_t * 284 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 285 { 286 pt_entry_t *pte; 287 288 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & VPTE_FRAME); 289 return (&pte[pmap_pte_index(va)]); 290 } 291 292 /* 293 * Hold pt_m for page table scans to prevent it from getting reused out 294 * from under us across blocking conditions in the body of the loop. 295 */ 296 static __inline 297 vm_page_t 298 pmap_hold_pt_page(pd_entry_t *pde, vm_offset_t va) 299 { 300 pt_entry_t pte; 301 vm_page_t pt_m; 302 303 pte = (pt_entry_t)*pde; 304 KKASSERT(pte != 0); 305 pt_m = PHYS_TO_VM_PAGE(pte & VPTE_FRAME); 306 vm_page_hold(pt_m); 307 308 return pt_m; 309 } 310 311 /* Return a pointer to the PT slot that corresponds to a VA */ 312 static __inline pt_entry_t * 313 pmap_pte(pmap_t pmap, vm_offset_t va) 314 { 315 pd_entry_t *pde; 316 317 pde = pmap_pde(pmap, va); 318 if (pde == NULL || (*pde & VPTE_V) == 0) 319 return NULL; 320 if ((*pde & VPTE_PS) != 0) /* compat with x86 pmap_pte() */ 321 return ((pt_entry_t *)pde); 322 return (pmap_pde_to_pte(pde, va)); 323 } 324 325 static PMAP_INLINE pt_entry_t * 326 vtopte(vm_offset_t va) 327 { 328 pt_entry_t *x; 329 x = pmap_pte(&kernel_pmap, va); 330 assert(x != NULL); 331 return x; 332 } 333 334 static __inline pd_entry_t * 335 vtopde(vm_offset_t va) 336 { 337 pd_entry_t *x; 338 x = pmap_pde(&kernel_pmap, va); 339 assert(x != NULL); 340 return x; 341 } 342 343 /* 344 * Returns the physical address translation from va for a user address. 345 * (vm_paddr_t)-1 is returned on failure. 
346 */ 347 vm_paddr_t 348 uservtophys(vm_offset_t va) 349 { 350 struct vmspace *vm = curproc->p_vmspace; 351 vm_page_t m; 352 vm_paddr_t pa; 353 int error; 354 int busy; 355 356 /* XXX No idea how to handle this case in a simple way, just abort */ 357 if (PAGE_SIZE - (va & PAGE_MASK) < sizeof(u_int)) 358 return ((vm_paddr_t)-1); 359 360 m = vm_fault_page(&vm->vm_map, trunc_page(va), 361 VM_PROT_READ|VM_PROT_WRITE, 362 VM_FAULT_NORMAL, 363 &error, &busy); 364 if (error) 365 return ((vm_paddr_t)-1); 366 367 pa = VM_PAGE_TO_PHYS(m) | (va & PAGE_MASK); 368 if (busy) 369 vm_page_wakeup(m); 370 else 371 vm_page_unhold(m); 372 373 return pa; 374 } 375 376 static uint64_t 377 allocpages(vm_paddr_t *firstaddr, int n) 378 { 379 uint64_t ret; 380 381 ret = *firstaddr; 382 /*bzero((void *)ret, n * PAGE_SIZE); not mapped yet */ 383 *firstaddr += n * PAGE_SIZE; 384 return (ret); 385 } 386 387 static void 388 create_dmap_vmm(vm_paddr_t *firstaddr) 389 { 390 void *stack_addr; 391 int pml4_stack_index; 392 int pdp_stack_index; 393 int pd_stack_index; 394 long i,j; 395 int regs[4]; 396 int amd_feature; 397 398 uint64_t KPDP_DMAP_phys = allocpages(firstaddr, NDMPML4E); 399 uint64_t KPDP_VSTACK_phys = allocpages(firstaddr, 1); 400 uint64_t KPD_VSTACK_phys = allocpages(firstaddr, 1); 401 402 pml4_entry_t *KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 403 pdp_entry_t *KPDP_DMAP_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_DMAP_phys); 404 pdp_entry_t *KPDP_VSTACK_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_VSTACK_phys); 405 pd_entry_t *KPD_VSTACK_virt = (pd_entry_t *)PHYS_TO_DMAP(KPD_VSTACK_phys); 406 407 bzero(KPDP_DMAP_virt, NDMPML4E * PAGE_SIZE); 408 bzero(KPDP_VSTACK_virt, 1 * PAGE_SIZE); 409 bzero(KPD_VSTACK_virt, 1 * PAGE_SIZE); 410 411 do_cpuid(0x80000001, regs); 412 amd_feature = regs[3]; 413 414 /* Build the mappings for the first 512GB */ 415 if (amd_feature & AMDID_PAGE1GB) { 416 /* In pages of 1 GB, if supported */ 417 for (i = 0; i < NPDPEPG; i++) { 418 KPDP_DMAP_virt[i] = ((uint64_t)i << PDPSHIFT); 419 KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_PS | VPTE_U; 420 } 421 } else { 422 /* In page of 2MB, otherwise */ 423 for (i = 0; i < NPDPEPG; i++) { 424 uint64_t KPD_DMAP_phys; 425 pd_entry_t *KPD_DMAP_virt; 426 427 KPD_DMAP_phys = allocpages(firstaddr, 1); 428 KPD_DMAP_virt = 429 (pd_entry_t *)PHYS_TO_DMAP(KPD_DMAP_phys); 430 431 bzero(KPD_DMAP_virt, PAGE_SIZE); 432 433 KPDP_DMAP_virt[i] = KPD_DMAP_phys; 434 KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_U; 435 436 /* For each PD, we have to allocate NPTEPG PT */ 437 for (j = 0; j < NPTEPG; j++) { 438 KPD_DMAP_virt[j] = (i << PDPSHIFT) | 439 (j << PDRSHIFT); 440 KPD_DMAP_virt[j] |= VPTE_RW | VPTE_V | 441 VPTE_PS | VPTE_U; 442 } 443 } 444 } 445 446 /* DMAP for the first 512G */ 447 KPML4virt[0] = KPDP_DMAP_phys; 448 KPML4virt[0] |= VPTE_RW | VPTE_V | VPTE_U; 449 450 /* create a 2 MB map of the new stack */ 451 pml4_stack_index = (uint64_t)&stack_addr >> PML4SHIFT; 452 KPML4virt[pml4_stack_index] = KPDP_VSTACK_phys; 453 KPML4virt[pml4_stack_index] |= VPTE_RW | VPTE_V | VPTE_U; 454 455 pdp_stack_index = ((uint64_t)&stack_addr & PML4MASK) >> PDPSHIFT; 456 KPDP_VSTACK_virt[pdp_stack_index] = KPD_VSTACK_phys; 457 KPDP_VSTACK_virt[pdp_stack_index] |= VPTE_RW | VPTE_V | VPTE_U; 458 459 pd_stack_index = ((uint64_t)&stack_addr & PDPMASK) >> PDRSHIFT; 460 KPD_VSTACK_virt[pd_stack_index] = (uint64_t) vkernel_stack; 461 KPD_VSTACK_virt[pd_stack_index] |= VPTE_RW | VPTE_V | VPTE_U | VPTE_PS; 462 } 463 464 static void 465 create_pagetables(vm_paddr_t *firstaddr, 
int64_t ptov_offset) 466 { 467 int i; 468 pml4_entry_t *KPML4virt; 469 pdp_entry_t *KPDPvirt; 470 pd_entry_t *KPDvirt; 471 pt_entry_t *KPTvirt; 472 int kpml4i = pmap_pml4e_index(ptov_offset); 473 int kpdpi = pmap_pdpe_index(ptov_offset); 474 int kpdi = pmap_pde_index(ptov_offset); 475 476 /* 477 * Calculate NKPT - number of kernel page tables. We have to 478 * accomodoate prealloction of the vm_page_array, dump bitmap, 479 * MSGBUF_SIZE, and other stuff. Be generous. 480 * 481 * Maxmem is in pages. 482 */ 483 nkpt = (Maxmem * (sizeof(struct vm_page) * 2) + MSGBUF_SIZE) / NBPDR; 484 /* 485 * Allocate pages 486 */ 487 KPML4phys = allocpages(firstaddr, 1); 488 KPDPphys = allocpages(firstaddr, NKPML4E); 489 KPDphys = allocpages(firstaddr, NKPDPE); 490 KPTphys = allocpages(firstaddr, nkpt); 491 492 KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 493 KPDPvirt = (pdp_entry_t *)PHYS_TO_DMAP(KPDPphys); 494 KPDvirt = (pd_entry_t *)PHYS_TO_DMAP(KPDphys); 495 KPTvirt = (pt_entry_t *)PHYS_TO_DMAP(KPTphys); 496 497 bzero(KPML4virt, 1 * PAGE_SIZE); 498 bzero(KPDPvirt, NKPML4E * PAGE_SIZE); 499 bzero(KPDvirt, NKPDPE * PAGE_SIZE); 500 bzero(KPTvirt, nkpt * PAGE_SIZE); 501 502 /* Now map the page tables at their location within PTmap */ 503 for (i = 0; i < nkpt; i++) { 504 KPDvirt[i + kpdi] = KPTphys + (i << PAGE_SHIFT); 505 KPDvirt[i + kpdi] |= VPTE_RW | VPTE_V | VPTE_U; 506 } 507 508 /* And connect up the PD to the PDP */ 509 for (i = 0; i < NKPDPE; i++) { 510 KPDPvirt[i + kpdpi] = KPDphys + (i << PAGE_SHIFT); 511 KPDPvirt[i + kpdpi] |= VPTE_RW | VPTE_V | VPTE_U; 512 } 513 514 /* And recursively map PML4 to itself in order to get PTmap */ 515 KPML4virt[PML4PML4I] = KPML4phys; 516 KPML4virt[PML4PML4I] |= VPTE_RW | VPTE_V | VPTE_U; 517 518 /* Connect the KVA slot up to the PML4 */ 519 KPML4virt[kpml4i] = KPDPphys; 520 KPML4virt[kpml4i] |= VPTE_RW | VPTE_V | VPTE_U; 521 } 522 523 /* 524 * Typically used to initialize a fictitious page by vm/device_pager.c 525 */ 526 void 527 pmap_page_init(struct vm_page *m) 528 { 529 vm_page_init(m); 530 TAILQ_INIT(&m->md.pv_list); 531 } 532 533 /* 534 * Bootstrap the system enough to run with virtual memory. 535 * 536 * On x86_64 this is called after mapping has already been enabled 537 * and just syncs the pmap module with what has already been done. 538 * [We can't call it easily with mapping off since the kernel is not 539 * mapped with PA == VA, hence we would have to relocate every address 540 * from the linked base (virtual) address "KERNBASE" to the actual 541 * (physical) address starting relative to 0] 542 */ 543 void 544 pmap_bootstrap(vm_paddr_t *firstaddr, int64_t ptov_offset) 545 { 546 vm_offset_t va; 547 pt_entry_t *pte; 548 549 /* 550 * Create an initial set of page tables to run the kernel in. 551 */ 552 create_pagetables(firstaddr, ptov_offset); 553 554 /* Create the DMAP for the VMM */ 555 if (vmm_enabled) { 556 create_dmap_vmm(firstaddr); 557 } 558 559 virtual_start = KvaStart; 560 virtual_end = KvaEnd; 561 562 /* 563 * Initialize protection array. 564 */ 565 x86_64_protection_init(); 566 567 /* 568 * The kernel's pmap is statically allocated so we don't have to use 569 * pmap_create, which is unlikely to work correctly at this part of 570 * the boot sequence (XXX and which no longer exists). 571 * 572 * The kernel_pmap's pm_pteobj is used only for locking and not 573 * for mmu pages. 
574 */ 575 kernel_pmap.pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 576 kernel_pmap.pm_count = 1; 577 /* don't allow deactivation */ 578 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 579 kernel_pmap.pm_pteobj = NULL; /* see pmap_init */ 580 RB_INIT(&kernel_pmap.pm_pvroot); 581 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 582 583 /* 584 * Reserve some special page table entries/VA space for temporary 585 * mapping of pages. 586 */ 587 #define SYSMAP(c, p, v, n) \ 588 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 589 590 va = virtual_start; 591 pte = pmap_pte(&kernel_pmap, va); 592 /* 593 * CMAP1/CMAP2 are used for zeroing and copying pages. 594 */ 595 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 596 597 #if JGV 598 /* 599 * Crashdump maps. 600 */ 601 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 602 #endif 603 604 /* 605 * ptvmmap is used for reading arbitrary physical pages via 606 * /dev/mem. 607 */ 608 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 609 610 /* 611 * msgbufp is used to map the system message buffer. 612 * XXX msgbufmap is not used. 613 */ 614 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 615 atop(round_page(MSGBUF_SIZE))) 616 617 virtual_start = va; 618 619 *CMAP1 = 0; 620 /* Not ready to do an invltlb yet for VMM*/ 621 if (!vmm_enabled) 622 cpu_invltlb(); 623 624 } 625 626 /* 627 * Initialize the pmap module. 628 * Called by vm_init, to initialize any structures that the pmap 629 * system needs to map virtual memory. 630 * pmap_init has been enhanced to support in a fairly consistant 631 * way, discontiguous physical memory. 632 */ 633 void 634 pmap_init(void) 635 { 636 vm_pindex_t i; 637 vm_pindex_t initial_pvs; 638 639 /* 640 * object for kernel page table pages 641 */ 642 /* JG I think the number can be arbitrary */ 643 vm_object_init(&kptobj, 5); 644 kernel_pmap.pm_pteobj = &kptobj; 645 646 /* 647 * Allocate memory for random pmap data structures. Includes the 648 * pv_head_table. 649 */ 650 for (i = 0; i < vm_page_array_size; i++) { 651 vm_page_t m; 652 653 m = &vm_page_array[i]; 654 TAILQ_INIT(&m->md.pv_list); 655 m->md.pv_list_count = 0; 656 } 657 658 /* 659 * init the pv free list 660 */ 661 initial_pvs = vm_page_array_size; 662 if (initial_pvs < MINPV) 663 initial_pvs = MINPV; 664 pvzone = &pvzone_store; 665 pvinit = (struct pv_entry *) 666 kmem_alloc(&kernel_map, 667 initial_pvs * sizeof (struct pv_entry), 668 VM_SUBSYS_PVENTRY); 669 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, 670 initial_pvs); 671 672 /* 673 * Now it is safe to enable pv_table recording. 674 */ 675 pmap_initialized = TRUE; 676 } 677 678 /* 679 * Initialize the address space (zone) for the pv_entries. Set a 680 * high water mark so that the system can recover from excessive 681 * numbers of pv entries. 682 */ 683 void 684 pmap_init2(void) 685 { 686 vm_pindex_t shpgperproc = PMAP_SHPGPERPROC; 687 688 TUNABLE_LONG_FETCH("vm.pmap.shpgperproc", &shpgperproc); 689 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 690 TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &pv_entry_max); 691 pv_entry_high_water = 9 * (pv_entry_max / 10); 692 zinitna(pvzone, NULL, 0, pv_entry_max, ZONE_INTERRUPT); 693 } 694 695 696 /*************************************************** 697 * Low level helper routines..... 698 ***************************************************/ 699 700 /* 701 * The modification bit is not tracked for any pages in this range. XXX 702 * such pages in this maps should always use pmap_k*() functions and not 703 * be managed anyhow. 
704 * 705 * XXX User and kernel address spaces are independant for virtual kernels, 706 * this function only applies to the kernel pmap. 707 */ 708 int 709 pmap_track_modified(pmap_t pmap, vm_offset_t va) 710 { 711 if (pmap != &kernel_pmap) 712 return 1; 713 if ((va < clean_sva) || (va >= clean_eva)) 714 return 1; 715 else 716 return 0; 717 } 718 719 /* 720 * Extract the physical page address associated with the map/VA pair. 721 * 722 * No requirements. 723 */ 724 vm_paddr_t 725 pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep) 726 { 727 vm_paddr_t rtval; 728 pt_entry_t *pte; 729 pd_entry_t pde, *pdep; 730 731 vm_object_hold(pmap->pm_pteobj); 732 rtval = 0; 733 pdep = pmap_pde(pmap, va); 734 if (pdep != NULL) { 735 pde = *pdep; 736 if (pde) { 737 if ((pde & VPTE_PS) != 0) { 738 /* JGV */ 739 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 740 } else { 741 pte = pmap_pde_to_pte(pdep, va); 742 rtval = (*pte & VPTE_FRAME) | (va & PAGE_MASK); 743 } 744 } 745 } 746 if (handlep) 747 *handlep = NULL; /* XXX */ 748 vm_object_drop(pmap->pm_pteobj); 749 750 return rtval; 751 } 752 753 void 754 pmap_extract_done(void *handle) 755 { 756 pmap_t pmap; 757 758 if (handle) { 759 pmap = handle; 760 vm_object_drop(pmap->pm_pteobj); 761 } 762 } 763 764 /* 765 * Similar to extract but checks protections, SMP-friendly short-cut for 766 * vm_fault_page[_quick](). 767 * 768 * WARNING! THE RETURNED PAGE IS ONLY HELD AND NEITHER IT NOR ITS TARGET 769 * DATA IS SUITABLE FOR WRITING. Writing can interfere with 770 * pageouts flushes, msync, etc. The hold_count is not enough 771 * to avoid races against pageouts and other flush code doesn't 772 * care about hold_count. 773 */ 774 vm_page_t 775 pmap_fault_page_quick(pmap_t pmap __unused, vm_offset_t vaddr __unused, 776 vm_prot_t prot __unused, int *busyp __unused) 777 { 778 return(NULL); 779 } 780 781 /* 782 * Routine: pmap_kextract 783 * Function: 784 * Extract the physical page address associated 785 * kernel virtual address. 786 */ 787 vm_paddr_t 788 pmap_kextract(vm_offset_t va) 789 { 790 pd_entry_t pde; 791 vm_paddr_t pa; 792 793 KKASSERT(va >= KvaStart && va < KvaEnd); 794 795 /* 796 * The DMAP region is not included in [KvaStart, KvaEnd) 797 */ 798 #if 0 799 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 800 pa = DMAP_TO_PHYS(va); 801 } else { 802 #endif 803 pde = *vtopde(va); 804 if (pde & VPTE_PS) { 805 /* JGV */ 806 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 807 } else { 808 /* 809 * Beware of a concurrent promotion that changes the 810 * PDE at this point! For example, vtopte() must not 811 * be used to access the PTE because it would use the 812 * new PDE. It is, however, safe to use the old PDE 813 * because the page table page is preserved by the 814 * promotion. 815 */ 816 pa = *pmap_pde_to_pte(&pde, va); 817 pa = (pa & VPTE_FRAME) | (va & PAGE_MASK); 818 } 819 #if 0 820 } 821 #endif 822 return pa; 823 } 824 825 /*************************************************** 826 * Low level mapping routines..... 827 ***************************************************/ 828 829 /* 830 * Enter a mapping into kernel_pmap. Mappings created in this fashion 831 * are not managed. Mappings must be immediately accessible on all cpus. 832 * 833 * Call pmap_inval_pte() to invalidate the virtual pte and clean out the 834 * real pmap and handle related races before storing the new vpte. The 835 * new semantics for kenter require use to do an UNCONDITIONAL invalidation, 836 * because the entry may have previously been cleared without an invalidation. 
837 */ 838 void 839 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 840 { 841 pt_entry_t *ptep; 842 pt_entry_t npte; 843 844 KKASSERT(va >= KvaStart && va < KvaEnd); 845 npte = pa | VPTE_RW | VPTE_V | VPTE_U; 846 ptep = vtopte(va); 847 848 #if 1 849 pmap_inval_pte(ptep, &kernel_pmap, va); 850 #else 851 if (*pte & VPTE_V) 852 pmap_inval_pte(ptep, &kernel_pmap, va); 853 #endif 854 atomic_swap_long(ptep, npte); 855 } 856 857 /* 858 * Enter an unmanaged KVA mapping for the private use of the current 859 * cpu only. 860 * 861 * It is illegal for the mapping to be accessed by other cpus without 862 * proper invalidation. 863 */ 864 int 865 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 866 { 867 pt_entry_t *ptep; 868 pt_entry_t npte; 869 int res; 870 871 KKASSERT(va >= KvaStart && va < KvaEnd); 872 873 npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; 874 ptep = vtopte(va); 875 876 #if 1 877 pmap_inval_pte_quick(ptep, &kernel_pmap, va); 878 res = 1; 879 #else 880 /* FUTURE */ 881 res = (*ptep != 0); 882 if (*pte & VPTE_V) 883 pmap_inval_pte(pte, &kernel_pmap, va); 884 #endif 885 atomic_swap_long(ptep, npte); 886 887 return res; 888 } 889 890 /* 891 * Invalidation will occur later, ok to be lazy here. 892 */ 893 int 894 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 895 { 896 pt_entry_t *ptep; 897 pt_entry_t npte; 898 int res; 899 900 KKASSERT(va >= KvaStart && va < KvaEnd); 901 902 npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; 903 ptep = vtopte(va); 904 #if 1 905 res = 1; 906 #else 907 /* FUTURE */ 908 res = (*ptep != 0); 909 #endif 910 atomic_swap_long(ptep, npte); 911 912 return res; 913 } 914 915 /* 916 * Remove an unmanaged mapping created with pmap_kenter*(). 917 */ 918 void 919 pmap_kremove(vm_offset_t va) 920 { 921 pt_entry_t *ptep; 922 923 KKASSERT(va >= KvaStart && va < KvaEnd); 924 925 ptep = vtopte(va); 926 atomic_swap_long(ptep, 0); 927 pmap_inval_pte(ptep, &kernel_pmap, va); 928 } 929 930 /* 931 * Remove an unmanaged mapping created with pmap_kenter*() but synchronize 932 * only with this cpu. 933 * 934 * Unfortunately because we optimize new entries by testing VPTE_V later 935 * on, we actually still have to synchronize with all the cpus. XXX maybe 936 * store a junk value and test against 0 in the other places instead? 937 */ 938 void 939 pmap_kremove_quick(vm_offset_t va) 940 { 941 pt_entry_t *ptep; 942 943 KKASSERT(va >= KvaStart && va < KvaEnd); 944 945 ptep = vtopte(va); 946 atomic_swap_long(ptep, 0); 947 pmap_inval_pte(ptep, &kernel_pmap, va); /* NOT _quick */ 948 } 949 950 /* 951 * Invalidation will occur later, ok to be lazy here. 952 */ 953 void 954 pmap_kremove_noinval(vm_offset_t va) 955 { 956 pt_entry_t *ptep; 957 958 KKASSERT(va >= KvaStart && va < KvaEnd); 959 960 ptep = vtopte(va); 961 atomic_swap_long(ptep, 0); 962 } 963 964 /* 965 * Used to map a range of physical addresses into kernel 966 * virtual address space. 967 * 968 * For now, VM is already on, we only need to map the 969 * specified memory. 970 */ 971 vm_offset_t 972 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 973 { 974 return PHYS_TO_DMAP(start); 975 } 976 977 /* 978 * Map a set of unmanaged VM pages into KVM. 
979 */ 980 static __inline void 981 _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval) 982 { 983 vm_offset_t end_va; 984 vm_offset_t va; 985 986 end_va = beg_va + count * PAGE_SIZE; 987 KKASSERT(beg_va >= KvaStart && end_va <= KvaEnd); 988 989 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 990 pt_entry_t *ptep; 991 992 ptep = vtopte(va); 993 atomic_swap_long(ptep, VM_PAGE_TO_PHYS(*m) | 994 VPTE_RW | VPTE_V | VPTE_U); 995 ++m; 996 } 997 if (doinval) 998 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 999 /* pmap_inval_pte(pte, &kernel_pmap, va); */ 1000 } 1001 1002 void 1003 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 1004 { 1005 _pmap_qenter(beg_va, m, count, 1); 1006 } 1007 1008 void 1009 pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count) 1010 { 1011 _pmap_qenter(beg_va, m, count, 0); 1012 } 1013 1014 /* 1015 * Undo the effects of pmap_qenter*(). 1016 */ 1017 void 1018 pmap_qremove(vm_offset_t beg_va, int count) 1019 { 1020 vm_offset_t end_va; 1021 vm_offset_t va; 1022 1023 end_va = beg_va + count * PAGE_SIZE; 1024 KKASSERT(beg_va >= KvaStart && end_va < KvaEnd); 1025 1026 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 1027 pt_entry_t *ptep; 1028 1029 ptep = vtopte(va); 1030 atomic_swap_long(ptep, 0); 1031 } 1032 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 1033 } 1034 1035 /* 1036 * Unlike the real pmap code, we can't avoid calling the real-kernel. 1037 */ 1038 void 1039 pmap_qremove_quick(vm_offset_t va, int count) 1040 { 1041 pmap_qremove(va, count); 1042 } 1043 1044 void 1045 pmap_qremove_noinval(vm_offset_t va, int count) 1046 { 1047 pmap_qremove(va, count); 1048 } 1049 1050 /* 1051 * This routine works like vm_page_lookup() but also blocks as long as the 1052 * page is busy. This routine does not busy the page it returns. 1053 * 1054 * Unless the caller is managing objects whos pages are in a known state, 1055 * the call should be made with a critical section held so the page's object 1056 * association remains valid on return. 1057 */ 1058 static vm_page_t 1059 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) 1060 { 1061 vm_page_t m; 1062 1063 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 1064 m = vm_page_lookup_busy_wait(object, pindex, TRUE, "pplookp"); 1065 1066 return(m); 1067 } 1068 1069 /* 1070 * Create a new thread and optionally associate it with a (new) process. 1071 * NOTE! the new thread's cpu may not equal the current cpu. 1072 */ 1073 void 1074 pmap_init_thread(thread_t td) 1075 { 1076 /* enforce pcb placement */ 1077 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 1078 td->td_savefpu = &td->td_pcb->pcb_save; 1079 td->td_sp = (char *)td->td_pcb - 16; /* JG is -16 needed on x86_64? */ 1080 } 1081 1082 /* 1083 * This routine directly affects the fork perf for a process. 1084 */ 1085 void 1086 pmap_init_proc(struct proc *p) 1087 { 1088 } 1089 1090 /* 1091 * Unwire a page table which has been removed from the pmap. We own the 1092 * wire_count, so the page cannot go away. The page representing the page 1093 * table is passed in unbusied and must be busied if we cannot trivially 1094 * unwire it. 1095 * 1096 * XXX NOTE! This code is not usually run because we do not currently 1097 * implement dynamic page table page removal. The page in 1098 * its parent assumes at least 1 wire count, so no call to this 1099 * function ever sees a wire count less than 2. 
1100 */ 1101 static int 1102 pmap_unwire_pgtable(pmap_t pmap, vm_offset_t va, vm_page_t m) 1103 { 1104 /* 1105 * Try to unwire optimally. If non-zero is returned the wire_count 1106 * is 1 and we must busy the page to unwire it. 1107 */ 1108 if (vm_page_unwire_quick(m) == 0) 1109 return 0; 1110 1111 vm_page_busy_wait(m, TRUE, "pmuwpt"); 1112 KASSERT(m->queue == PQ_NONE, 1113 ("_pmap_unwire_pgtable: %p->queue != PQ_NONE", m)); 1114 1115 if (m->wire_count == 1) { 1116 /* 1117 * Unmap the page table page. 1118 */ 1119 /* pmap_inval_add(info, pmap, -1); */ 1120 1121 if (m->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1122 /* PDP page */ 1123 pml4_entry_t *pml4; 1124 pml4 = pmap_pml4e(pmap, va); 1125 *pml4 = 0; 1126 } else if (m->pindex >= NUPT_TOTAL) { 1127 /* PD page */ 1128 pdp_entry_t *pdp; 1129 pdp = pmap_pdpe(pmap, va); 1130 *pdp = 0; 1131 } else { 1132 /* PT page */ 1133 pd_entry_t *pd; 1134 pd = pmap_pde(pmap, va); 1135 *pd = 0; 1136 } 1137 1138 KKASSERT(pmap->pm_stats.resident_count > 0); 1139 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1140 1141 if (pmap->pm_ptphint == m) 1142 pmap->pm_ptphint = NULL; 1143 1144 if (m->pindex < NUPT_TOTAL) { 1145 /* We just released a PT, unhold the matching PD */ 1146 vm_page_t pdpg; 1147 1148 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & 1149 VPTE_FRAME); 1150 pmap_unwire_pgtable(pmap, va, pdpg); 1151 } 1152 if (m->pindex >= NUPT_TOTAL && 1153 m->pindex < (NUPT_TOTAL + NUPD_TOTAL)) { 1154 /* We just released a PD, unhold the matching PDP */ 1155 vm_page_t pdppg; 1156 1157 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & 1158 VPTE_FRAME); 1159 pmap_unwire_pgtable(pmap, va, pdppg); 1160 } 1161 1162 /* 1163 * This was our last wire, the page had better be unwired 1164 * after we decrement wire_count. 1165 * 1166 * FUTURE NOTE: shared page directory page could result in 1167 * multiple wire counts. 1168 */ 1169 vm_page_unwire(m, 0); 1170 KKASSERT(m->wire_count == 0); 1171 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1172 vm_page_flash(m); 1173 vm_page_free(m); 1174 return 1; 1175 } else { 1176 /* XXX SMP race to 1 if not holding vmobj */ 1177 vm_page_unwire(m, 0); 1178 vm_page_wakeup(m); 1179 return 0; 1180 } 1181 } 1182 1183 /* 1184 * After removing a page table entry, this routine is used to 1185 * conditionally free the page, and manage the hold/wire counts. 1186 * 1187 * If not NULL the caller owns a wire_count on mpte, so it can't disappear. 1188 * If NULL the caller owns a wire_count on what would be the mpte, we must 1189 * look it up. 1190 */ 1191 static int 1192 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 1193 { 1194 vm_pindex_t ptepindex; 1195 1196 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1197 1198 if (mpte == NULL) { 1199 /* 1200 * page table pages in the kernel_pmap are not managed. 1201 */ 1202 if (pmap == &kernel_pmap) 1203 return(0); 1204 ptepindex = pmap_pt_pindex(va); 1205 if (pmap->pm_ptphint && 1206 (pmap->pm_ptphint->pindex == ptepindex)) { 1207 mpte = pmap->pm_ptphint; 1208 } else { 1209 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1210 pmap->pm_ptphint = mpte; 1211 vm_page_wakeup(mpte); 1212 } 1213 } 1214 return pmap_unwire_pgtable(pmap, va, mpte); 1215 } 1216 1217 /* 1218 * Initialize pmap0/vmspace0 . Since process 0 never enters user mode we 1219 * just dummy it up so it works well enough for fork(). 1220 * 1221 * In DragonFly, process pmaps may only be used to manipulate user address 1222 * space, never kernel address space. 
1223 */ 1224 void 1225 pmap_pinit0(struct pmap *pmap) 1226 { 1227 pmap_pinit(pmap); 1228 } 1229 1230 /* 1231 * Initialize a preallocated and zeroed pmap structure, 1232 * such as one in a vmspace structure. 1233 */ 1234 void 1235 pmap_pinit(struct pmap *pmap) 1236 { 1237 vm_page_t ptdpg; 1238 1239 /* 1240 * No need to allocate page table space yet but we do need a valid 1241 * page directory table. 1242 */ 1243 if (pmap->pm_pml4 == NULL) { 1244 pmap->pm_pml4 = (pml4_entry_t *) 1245 kmem_alloc_pageable(&kernel_map, PAGE_SIZE, 1246 VM_SUBSYS_PML4); 1247 } 1248 1249 /* 1250 * Allocate an object for the ptes 1251 */ 1252 if (pmap->pm_pteobj == NULL) 1253 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL + 1); 1254 1255 /* 1256 * Allocate the page directory page, unless we already have 1257 * one cached. If we used the cached page the wire_count will 1258 * already be set appropriately. 1259 */ 1260 if ((ptdpg = pmap->pm_pdirm) == NULL) { 1261 ptdpg = vm_page_grab(pmap->pm_pteobj, 1262 NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL, 1263 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | 1264 VM_ALLOC_ZERO); 1265 pmap->pm_pdirm = ptdpg; 1266 vm_page_flag_clear(ptdpg, PG_MAPPED | PG_WRITEABLE); 1267 vm_page_wire(ptdpg); 1268 vm_page_wakeup(ptdpg); 1269 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); 1270 } 1271 pmap->pm_count = 1; 1272 CPUMASK_ASSZERO(pmap->pm_active); 1273 pmap->pm_ptphint = NULL; 1274 RB_INIT(&pmap->pm_pvroot); 1275 spin_init(&pmap->pm_spin, "pmapinit"); 1276 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1277 pmap->pm_stats.resident_count = 1; 1278 pmap->pm_stats.wired_count = 1; 1279 } 1280 1281 /* 1282 * Clean up a pmap structure so it can be physically freed. This routine 1283 * is called by the vmspace dtor function. A great deal of pmap data is 1284 * left passively mapped to improve vmspace management so we have a bit 1285 * of cleanup work to do here. 1286 * 1287 * No requirements. 1288 */ 1289 void 1290 pmap_puninit(pmap_t pmap) 1291 { 1292 vm_page_t p; 1293 1294 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 1295 if ((p = pmap->pm_pdirm) != NULL) { 1296 KKASSERT(pmap->pm_pml4 != NULL); 1297 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1298 vm_page_busy_wait(p, TRUE, "pgpun"); 1299 vm_page_unwire(p, 0); 1300 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1301 vm_page_free(p); 1302 pmap->pm_pdirm = NULL; 1303 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1304 KKASSERT(pmap->pm_stats.wired_count == 0); 1305 } 1306 if (pmap->pm_pml4) { 1307 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1308 pmap->pm_pml4 = NULL; 1309 } 1310 if (pmap->pm_pteobj) { 1311 vm_object_deallocate(pmap->pm_pteobj); 1312 pmap->pm_pteobj = NULL; 1313 } 1314 } 1315 1316 /* 1317 * This function is now unused (used to add the pmap to the pmap_list) 1318 */ 1319 void 1320 pmap_pinit2(struct pmap *pmap) 1321 { 1322 } 1323 1324 /* 1325 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1326 * 0 on failure (if the procedure had to sleep). 1327 * 1328 * When asked to remove the page directory page itself, we actually just 1329 * leave it cached so we do not have to incur the SMP inval overhead of 1330 * removing the kernel mapping. pmap_puninit() will take care of it. 1331 */ 1332 static int 1333 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1334 { 1335 /* 1336 * This code optimizes the case of freeing non-busy 1337 * page-table pages. Those pages are zero now, and 1338 * might as well be placed directly into the zero queue. 
1339 */ 1340 if (vm_page_busy_try(p, TRUE)) { 1341 vm_page_sleep_busy(p, TRUE, "pmaprl"); 1342 return 1; 1343 } 1344 1345 /* 1346 * Remove the page table page from the processes address space. 1347 */ 1348 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1349 /* 1350 * We are the pml4 table itself. 1351 */ 1352 /* XXX anything to do here? */ 1353 } else if (p->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1354 /* 1355 * We are a PDP page. 1356 * We look for the PML4 entry that points to us. 1357 */ 1358 vm_page_t m4; 1359 pml4_entry_t *pml4; 1360 int idx; 1361 1362 m4 = vm_page_lookup(pmap->pm_pteobj, 1363 NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 1364 KKASSERT(m4 != NULL); 1365 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1366 idx = (p->pindex - (NUPT_TOTAL + NUPD_TOTAL)) % NPML4EPG; 1367 KKASSERT(pml4[idx] != 0); 1368 if (pml4[idx] == 0) 1369 kprintf("pmap_release: Unmapped PML4\n"); 1370 pml4[idx] = 0; 1371 vm_page_unwire_quick(m4); 1372 } else if (p->pindex >= NUPT_TOTAL) { 1373 /* 1374 * We are a PD page. 1375 * We look for the PDP entry that points to us. 1376 */ 1377 vm_page_t m3; 1378 pdp_entry_t *pdp; 1379 int idx; 1380 1381 m3 = vm_page_lookup(pmap->pm_pteobj, 1382 NUPT_TOTAL + NUPD_TOTAL + 1383 (p->pindex - NUPT_TOTAL) / NPDPEPG); 1384 KKASSERT(m3 != NULL); 1385 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); 1386 idx = (p->pindex - NUPT_TOTAL) % NPDPEPG; 1387 KKASSERT(pdp[idx] != 0); 1388 if (pdp[idx] == 0) 1389 kprintf("pmap_release: Unmapped PDP %d\n", idx); 1390 pdp[idx] = 0; 1391 vm_page_unwire_quick(m3); 1392 } else { 1393 /* We are a PT page. 1394 * We look for the PD entry that points to us. 1395 */ 1396 vm_page_t m2; 1397 pd_entry_t *pd; 1398 int idx; 1399 1400 m2 = vm_page_lookup(pmap->pm_pteobj, 1401 NUPT_TOTAL + p->pindex / NPDEPG); 1402 KKASSERT(m2 != NULL); 1403 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); 1404 idx = p->pindex % NPDEPG; 1405 if (pd[idx] == 0) 1406 kprintf("pmap_release: Unmapped PD %d\n", idx); 1407 pd[idx] = 0; 1408 vm_page_unwire_quick(m2); 1409 } 1410 KKASSERT(pmap->pm_stats.resident_count > 0); 1411 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1412 1413 if (p->wire_count > 1) { 1414 panic("pmap_release: freeing held pt page " 1415 "pmap=%p pg=%p dmap=%p pi=%ld {%ld,%ld,%ld}", 1416 pmap, p, (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(p)), 1417 p->pindex, NUPT_TOTAL, NUPD_TOTAL, NUPDP_TOTAL); 1418 } 1419 1420 if (pmap->pm_ptphint == p) 1421 pmap->pm_ptphint = NULL; 1422 1423 /* 1424 * We leave the top-level page table page cached, wired, and mapped in 1425 * the pmap until the dtor function (pmap_puninit()) gets called. 1426 * However, still clean it up. 1427 */ 1428 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1429 bzero(pmap->pm_pml4, PAGE_SIZE); 1430 vm_page_wakeup(p); 1431 } else { 1432 vm_page_unwire(p, 0); 1433 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 1434 vm_page_free(p); 1435 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1436 } 1437 return 0; 1438 } 1439 1440 /* 1441 * Locate the requested PT, PD, or PDP page table page. 1442 * 1443 * Returns a busied page, caller must vm_page_wakeup() when done. 1444 */ 1445 static vm_page_t 1446 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) 1447 { 1448 vm_page_t m; 1449 vm_page_t pm; 1450 vm_pindex_t pindex; 1451 pt_entry_t *ptep; 1452 pt_entry_t data; 1453 1454 /* 1455 * Find or fabricate a new pagetable page. A non-zero wire_count 1456 * indicates that the page has already been mapped into its parent. 
1457 */ 1458 m = vm_page_grab(pmap->pm_pteobj, ptepindex, 1459 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); 1460 if (m->wire_count != 0) 1461 return m; 1462 1463 /* 1464 * Map the page table page into its parent, giving it 1 wire count. 1465 */ 1466 vm_page_wire(m); 1467 vm_page_unmanage(m); 1468 atomic_add_long(&pmap->pm_stats.resident_count, 1); 1469 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 1470 1471 data = VM_PAGE_TO_PHYS(m) | 1472 VPTE_RW | VPTE_V | VPTE_U | VPTE_A | VPTE_M | VPTE_WIRED; 1473 atomic_add_long(&pmap->pm_stats.wired_count, 1); 1474 1475 if (ptepindex >= (NUPT_TOTAL + NUPD_TOTAL)) { 1476 /* 1477 * Map PDP into the PML4 1478 */ 1479 pindex = ptepindex - (NUPT_TOTAL + NUPD_TOTAL); 1480 pindex &= (NUPDP_TOTAL - 1); 1481 ptep = (pt_entry_t *)pmap->pm_pml4; 1482 pm = NULL; 1483 } else if (ptepindex >= NUPT_TOTAL) { 1484 /* 1485 * Map PD into its PDP 1486 */ 1487 pindex = (ptepindex - NUPT_TOTAL) >> NPDPEPGSHIFT; 1488 pindex += NUPT_TOTAL + NUPD_TOTAL; 1489 pm = _pmap_allocpte(pmap, pindex); 1490 pindex = (ptepindex - NUPT_TOTAL) & (NPDPEPG - 1); 1491 ptep = (void *)PHYS_TO_DMAP(pm->phys_addr); 1492 } else { 1493 /* 1494 * Map PT into its PD 1495 */ 1496 pindex = ptepindex >> NPDPEPGSHIFT; 1497 pindex += NUPT_TOTAL; 1498 pm = _pmap_allocpte(pmap, pindex); 1499 pindex = ptepindex & (NPTEPG - 1); 1500 ptep = (void *)PHYS_TO_DMAP(pm->phys_addr); 1501 } 1502 1503 /* 1504 * Install the pte in (pm). (m) prevents races. 1505 */ 1506 ptep += pindex; 1507 data = atomic_swap_long(ptep, data); 1508 if (pm) { 1509 vm_page_wire_quick(pm); 1510 vm_page_wakeup(pm); 1511 } 1512 pmap->pm_ptphint = pm; 1513 1514 return m; 1515 } 1516 1517 /* 1518 * Determine the page table page required to access the VA in the pmap 1519 * and allocate it if necessary. Return a held vm_page_t for the page. 1520 * 1521 * Only used with user pmaps. 1522 */ 1523 static vm_page_t 1524 pmap_allocpte(pmap_t pmap, vm_offset_t va) 1525 { 1526 vm_pindex_t ptepindex; 1527 vm_page_t m; 1528 1529 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); 1530 1531 /* 1532 * Calculate pagetable page index, and return the PT page to 1533 * the caller. 1534 */ 1535 ptepindex = pmap_pt_pindex(va); 1536 m = _pmap_allocpte(pmap, ptepindex); 1537 1538 return m; 1539 } 1540 1541 /*************************************************** 1542 * Pmap allocation/deallocation routines. 1543 ***************************************************/ 1544 1545 /* 1546 * Release any resources held by the given physical map. 1547 * Called when a pmap initialized by pmap_pinit is being released. 1548 * Should only be called if the map contains no valid mappings. 1549 */ 1550 static int pmap_release_callback(struct vm_page *p, void *data); 1551 1552 void 1553 pmap_release(struct pmap *pmap) 1554 { 1555 vm_object_t object = pmap->pm_pteobj; 1556 struct rb_vm_page_scan_info info; 1557 1558 KKASSERT(pmap != &kernel_pmap); 1559 1560 #if defined(DIAGNOSTIC) 1561 if (object->ref_count != 1) 1562 panic("pmap_release: pteobj reference count != 1"); 1563 #endif 1564 1565 info.pmap = pmap; 1566 info.object = object; 1567 1568 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 1569 ("pmap %p still active! 
%016jx", 1570 pmap, 1571 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 1572 1573 vm_object_hold(object); 1574 do { 1575 info.error = 0; 1576 info.mpte = NULL; 1577 info.limit = object->generation; 1578 1579 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 1580 pmap_release_callback, &info); 1581 if (info.error == 0 && info.mpte) { 1582 if (pmap_release_free_page(pmap, info.mpte)) 1583 info.error = 1; 1584 } 1585 } while (info.error); 1586 1587 pmap->pm_ptphint = NULL; 1588 1589 KASSERT((pmap->pm_stats.wired_count == (pmap->pm_pdirm != NULL)), 1590 ("pmap_release: dangling count %p %ld", 1591 pmap, pmap->pm_stats.wired_count)); 1592 1593 vm_object_drop(object); 1594 } 1595 1596 static int 1597 pmap_release_callback(struct vm_page *p, void *data) 1598 { 1599 struct rb_vm_page_scan_info *info = data; 1600 1601 if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) { 1602 info->mpte = p; 1603 return(0); 1604 } 1605 if (pmap_release_free_page(info->pmap, p)) { 1606 info->error = 1; 1607 return(-1); 1608 } 1609 if (info->object->generation != info->limit) { 1610 info->error = 1; 1611 return(-1); 1612 } 1613 return(0); 1614 } 1615 1616 /* 1617 * Grow the number of kernel page table entries, if needed. 1618 * 1619 * kernel_map must be locked exclusively by the caller. 1620 */ 1621 void 1622 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 1623 { 1624 vm_offset_t addr; 1625 vm_paddr_t paddr; 1626 vm_offset_t ptppaddr; 1627 vm_page_t nkpg; 1628 pd_entry_t *pde, newpdir; 1629 pdp_entry_t newpdp; 1630 1631 addr = kend; 1632 1633 vm_object_hold(&kptobj); 1634 if (kernel_vm_end == 0) { 1635 kernel_vm_end = KvaStart; 1636 nkpt = 0; 1637 while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & VPTE_V) != 0) { 1638 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1639 nkpt++; 1640 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1641 kernel_vm_end = kernel_map.max_offset; 1642 break; 1643 } 1644 } 1645 } 1646 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1647 if (addr - 1 >= kernel_map.max_offset) 1648 addr = kernel_map.max_offset; 1649 while (kernel_vm_end < addr) { 1650 pde = pmap_pde(&kernel_pmap, kernel_vm_end); 1651 if (pde == NULL) { 1652 /* We need a new PDP entry */ 1653 nkpg = vm_page_alloc(&kptobj, nkpt, 1654 VM_ALLOC_NORMAL | 1655 VM_ALLOC_SYSTEM | 1656 VM_ALLOC_INTERRUPT); 1657 if (nkpg == NULL) { 1658 panic("pmap_growkernel: no memory to " 1659 "grow kernel"); 1660 } 1661 paddr = VM_PAGE_TO_PHYS(nkpg); 1662 pmap_zero_page(paddr); 1663 newpdp = (pdp_entry_t)(paddr | 1664 VPTE_V | VPTE_RW | VPTE_U | 1665 VPTE_A | VPTE_M | VPTE_WIRED); 1666 *pmap_pdpe(&kernel_pmap, kernel_vm_end) = newpdp; 1667 atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1); 1668 nkpt++; 1669 continue; /* try again */ 1670 } 1671 if ((*pde & VPTE_V) != 0) { 1672 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 1673 ~(PAGE_SIZE * NPTEPG - 1); 1674 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1675 kernel_vm_end = kernel_map.max_offset; 1676 break; 1677 } 1678 continue; 1679 } 1680 1681 /* 1682 * This index is bogus, but out of the way 1683 */ 1684 nkpg = vm_page_alloc(&kptobj, nkpt, 1685 VM_ALLOC_NORMAL | 1686 VM_ALLOC_SYSTEM | 1687 VM_ALLOC_INTERRUPT); 1688 if (nkpg == NULL) 1689 panic("pmap_growkernel: no memory to grow kernel"); 1690 1691 vm_page_wire(nkpg); 1692 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1693 pmap_zero_page(ptppaddr); 1694 newpdir = (pd_entry_t)(ptppaddr | 1695 VPTE_V | VPTE_RW | VPTE_U | 1696 VPTE_A | VPTE_M | VPTE_WIRED); 1697 *pmap_pde(&kernel_pmap, kernel_vm_end) = 
newpdir; 1698 atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1); 1699 nkpt++; 1700 1701 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 1702 ~(PAGE_SIZE * NPTEPG - 1); 1703 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1704 kernel_vm_end = kernel_map.max_offset; 1705 break; 1706 } 1707 } 1708 vm_object_drop(&kptobj); 1709 } 1710 1711 /* 1712 * Add a reference to the specified pmap. 1713 * 1714 * No requirements. 1715 */ 1716 void 1717 pmap_reference(pmap_t pmap) 1718 { 1719 if (pmap) 1720 atomic_add_int(&pmap->pm_count, 1); 1721 } 1722 1723 /************************************************************************ 1724 * VMSPACE MANAGEMENT * 1725 ************************************************************************ 1726 * 1727 * The VMSPACE management we do in our virtual kernel must be reflected 1728 * in the real kernel. This is accomplished by making vmspace system 1729 * calls to the real kernel. 1730 */ 1731 void 1732 cpu_vmspace_alloc(struct vmspace *vm) 1733 { 1734 int r; 1735 void *rp; 1736 vpte_t vpte; 1737 1738 /* 1739 * If VMM enable, don't do nothing, we 1740 * are able to use real page tables 1741 */ 1742 if (vmm_enabled) 1743 return; 1744 1745 #define USER_SIZE (VM_MAX_USER_ADDRESS - VM_MIN_USER_ADDRESS) 1746 1747 if (vmspace_create(&vm->vm_pmap, 0, NULL) < 0) 1748 panic("vmspace_create() failed"); 1749 1750 rp = vmspace_mmap(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1751 PROT_READ|PROT_WRITE|PROT_EXEC, 1752 MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED, 1753 MemImageFd, 0); 1754 if (rp == MAP_FAILED) 1755 panic("vmspace_mmap: failed"); 1756 vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1757 MADV_NOSYNC, 0); 1758 vpte = VM_PAGE_TO_PHYS(vmspace_pmap(vm)->pm_pdirm) | 1759 VPTE_RW | VPTE_V | VPTE_U; 1760 r = vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE, 1761 MADV_SETMAP, vpte); 1762 if (r < 0) 1763 panic("vmspace_mcontrol: failed"); 1764 } 1765 1766 void 1767 cpu_vmspace_free(struct vmspace *vm) 1768 { 1769 /* 1770 * If VMM enable, don't do nothing, we 1771 * are able to use real page tables 1772 */ 1773 if (vmm_enabled) 1774 return; 1775 1776 if (vmspace_destroy(&vm->vm_pmap) < 0) 1777 panic("vmspace_destroy() failed"); 1778 } 1779 1780 /*************************************************** 1781 * page management routines. 1782 ***************************************************/ 1783 1784 /* 1785 * free the pv_entry back to the free list. This function may be 1786 * called from an interrupt. 1787 */ 1788 static __inline void 1789 free_pv_entry(pv_entry_t pv) 1790 { 1791 atomic_add_long(&pv_entry_count, -1); 1792 zfree(pvzone, pv); 1793 } 1794 1795 /* 1796 * get a new pv_entry, allocating a block from the system 1797 * when needed. This function may be called from an interrupt. 1798 */ 1799 static pv_entry_t 1800 get_pv_entry(void) 1801 { 1802 atomic_add_long(&pv_entry_count, 1); 1803 if (pv_entry_high_water && 1804 (pv_entry_count > pv_entry_high_water) && 1805 atomic_swap_int(&pmap_pagedaemon_waken, 1) == 0) { 1806 wakeup(&vm_pages_needed); 1807 } 1808 return zalloc(pvzone); 1809 } 1810 1811 /* 1812 * This routine is very drastic, but can save the system 1813 * in a pinch. 1814 * 1815 * No requirements. 
1816 */ 1817 void 1818 pmap_collect(void) 1819 { 1820 int i; 1821 vm_page_t m; 1822 static int warningdone=0; 1823 1824 if (pmap_pagedaemon_waken == 0) 1825 return; 1826 pmap_pagedaemon_waken = 0; 1827 1828 if (warningdone < 5) { 1829 kprintf("pmap_collect: collecting pv entries -- " 1830 "suggest increasing PMAP_SHPGPERPROC\n"); 1831 warningdone++; 1832 } 1833 1834 for (i = 0; i < vm_page_array_size; i++) { 1835 m = &vm_page_array[i]; 1836 if (m->wire_count || m->hold_count) 1837 continue; 1838 if (vm_page_busy_try(m, TRUE) == 0) { 1839 if (m->wire_count == 0 && m->hold_count == 0) { 1840 pmap_remove_all(m); 1841 } 1842 vm_page_wakeup(m); 1843 } 1844 } 1845 } 1846 1847 1848 /* 1849 * If it is the first entry on the list, it is actually 1850 * in the header and we must copy the following entry up 1851 * to the header. Otherwise we must search the list for 1852 * the entry. In either case we free the now unused entry. 1853 * 1854 * pmap->pm_pteobj must be held and (m) must be spin-locked by the caller. 1855 */ 1856 static int 1857 pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va) 1858 { 1859 pv_entry_t pv; 1860 int rtval; 1861 1862 vm_page_spin_lock(m); 1863 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, va); 1864 1865 /* 1866 * Note that pv_ptem is NULL if the page table page itself is not 1867 * managed, even if the page being removed IS managed. 1868 */ 1869 rtval = 0; 1870 if (pv) { 1871 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1872 if (TAILQ_EMPTY(&m->md.pv_list)) 1873 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1874 m->md.pv_list_count--; 1875 KKASSERT(m->md.pv_list_count >= 0); 1876 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 1877 atomic_add_int(&pmap->pm_generation, 1); 1878 vm_page_spin_unlock(m); 1879 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); 1880 free_pv_entry(pv); 1881 } else { 1882 vm_page_spin_unlock(m); 1883 kprintf("pmap_remove_entry: could not find " 1884 "pmap=%p m=%p va=%016jx\n", 1885 pmap, m, va); 1886 } 1887 return rtval; 1888 } 1889 1890 /* 1891 * Create a pv entry for page at pa for (pmap, va). If the page table page 1892 * holding the VA is managed, mpte will be non-NULL. 1893 * 1894 * pmap->pm_pteobj must be held and (m) must be spin-locked by the caller. 1895 */ 1896 static void 1897 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m, 1898 pv_entry_t pv) 1899 { 1900 pv->pv_va = va; 1901 pv->pv_pmap = pmap; 1902 pv->pv_ptem = mpte; 1903 1904 m->md.pv_list_count++; 1905 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1906 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pv); 1907 vm_page_flag_set(m, PG_MAPPED); 1908 KKASSERT(pv == NULL); 1909 } 1910 1911 /* 1912 * pmap_remove_pte: do the things to unmap a page in a process 1913 * 1914 * Caller holds pmap->pm_pteobj and holds the associated page table 1915 * page busy to prevent races. 1916 */ 1917 static int 1918 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, pt_entry_t oldpte, 1919 vm_offset_t va) 1920 { 1921 vm_page_t m; 1922 int error; 1923 1924 if (ptq) 1925 oldpte = pmap_inval_loadandclear(ptq, pmap, va); 1926 1927 if (oldpte & VPTE_WIRED) 1928 atomic_add_long(&pmap->pm_stats.wired_count, -1); 1929 KKASSERT(pmap->pm_stats.wired_count >= 0); 1930 1931 #if 0 1932 /* 1933 * Machines that don't support invlpg, also don't support 1934 * PG_G. XXX PG_G is disabled for SMP so don't worry about 1935 * the SMP case. 
1936 */ 1937 if (oldpte & PG_G) 1938 cpu_invlpg((void *)va); 1939 #endif 1940 KKASSERT(pmap->pm_stats.resident_count > 0); 1941 atomic_add_long(&pmap->pm_stats.resident_count, -1); 1942 if (oldpte & VPTE_MANAGED) { 1943 m = PHYS_TO_VM_PAGE(oldpte); 1944 1945 /* 1946 * NOTE: pmap_remove_entry() will spin-lock the page 1947 */ 1948 if (oldpte & VPTE_M) { 1949 #if defined(PMAP_DIAGNOSTIC) 1950 if (pmap_nw_modified(oldpte)) { 1951 kprintf("pmap_remove: modified page not " 1952 "writable: va: 0x%lx, pte: 0x%lx\n", 1953 va, oldpte); 1954 } 1955 #endif 1956 if (pmap_track_modified(pmap, va)) 1957 vm_page_dirty(m); 1958 } 1959 if (oldpte & VPTE_A) 1960 vm_page_flag_set(m, PG_REFERENCED); 1961 error = pmap_remove_entry(pmap, m, va); 1962 } else { 1963 error = pmap_unuse_pt(pmap, va, NULL); 1964 } 1965 return error; 1966 } 1967 1968 /* 1969 * pmap_remove_page: 1970 * 1971 * Remove a single page from a process address space. 1972 * 1973 * This function may not be called from an interrupt if the pmap is 1974 * not kernel_pmap. 1975 * 1976 * Caller holds pmap->pm_pteobj 1977 */ 1978 static void 1979 pmap_remove_page(struct pmap *pmap, vm_offset_t va) 1980 { 1981 pt_entry_t *pte; 1982 1983 pte = pmap_pte(pmap, va); 1984 if (pte == NULL) 1985 return; 1986 if ((*pte & VPTE_V) == 0) 1987 return; 1988 pmap_remove_pte(pmap, pte, 0, va); 1989 } 1990 1991 /* 1992 * Remove the given range of addresses from the specified map. 1993 * 1994 * It is assumed that the start and end are properly rounded to 1995 * the page size. 1996 * 1997 * This function may not be called from an interrupt if the pmap is 1998 * not kernel_pmap. 1999 * 2000 * No requirements. 2001 */ 2002 void 2003 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 2004 { 2005 vm_offset_t va_next; 2006 pml4_entry_t *pml4e; 2007 pdp_entry_t *pdpe; 2008 pd_entry_t ptpaddr, *pde; 2009 pt_entry_t *pte; 2010 vm_page_t pt_m; 2011 2012 if (pmap == NULL) 2013 return; 2014 2015 vm_object_hold(pmap->pm_pteobj); 2016 KKASSERT(pmap->pm_stats.resident_count >= 0); 2017 if (pmap->pm_stats.resident_count == 0) { 2018 vm_object_drop(pmap->pm_pteobj); 2019 return; 2020 } 2021 2022 /* 2023 * special handling of removing one page. a very 2024 * common operation and easy to short circuit some 2025 * code. 2026 */ 2027 if (sva + PAGE_SIZE == eva) { 2028 pde = pmap_pde(pmap, sva); 2029 if (pde && (*pde & VPTE_PS) == 0) { 2030 pmap_remove_page(pmap, sva); 2031 vm_object_drop(pmap->pm_pteobj); 2032 return; 2033 } 2034 } 2035 2036 for (; sva < eva; sva = va_next) { 2037 pml4e = pmap_pml4e(pmap, sva); 2038 if ((*pml4e & VPTE_V) == 0) { 2039 va_next = (sva + NBPML4) & ~PML4MASK; 2040 if (va_next < sva) 2041 va_next = eva; 2042 continue; 2043 } 2044 2045 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2046 if ((*pdpe & VPTE_V) == 0) { 2047 va_next = (sva + NBPDP) & ~PDPMASK; 2048 if (va_next < sva) 2049 va_next = eva; 2050 continue; 2051 } 2052 2053 /* 2054 * Calculate index for next page table. 2055 */ 2056 va_next = (sva + NBPDR) & ~PDRMASK; 2057 if (va_next < sva) 2058 va_next = eva; 2059 2060 pde = pmap_pdpe_to_pde(pdpe, sva); 2061 ptpaddr = *pde; 2062 2063 /* 2064 * Weed out invalid mappings. 2065 */ 2066 if (ptpaddr == 0) 2067 continue; 2068 2069 /* 2070 * Check for large page. 
2071 */ 2072 if ((ptpaddr & VPTE_PS) != 0) { 2073 /* JG FreeBSD has more complex treatment here */ 2074 KKASSERT(*pde != 0); 2075 pmap_inval_pde(pde, pmap, sva); 2076 atomic_add_long(&pmap->pm_stats.resident_count, 2077 -NBPDR / PAGE_SIZE); 2078 continue; 2079 } 2080 2081 /* 2082 * Limit our scan to either the end of the va represented 2083 * by the current page table page, or to the end of the 2084 * range being removed. 2085 */ 2086 if (va_next > eva) 2087 va_next = eva; 2088 2089 /* 2090 * NOTE: pmap_remove_pte() can block. 2091 */ 2092 pt_m = pmap_hold_pt_page(pde, sva); 2093 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2094 sva += PAGE_SIZE) { 2095 if (*pte) { 2096 if (pmap_remove_pte(pmap, pte, 0, sva)) 2097 break; 2098 } 2099 } 2100 vm_page_unhold(pt_m); 2101 } 2102 vm_object_drop(pmap->pm_pteobj); 2103 } 2104 2105 /* 2106 * Removes this physical page from all physical maps in which it resides. 2107 * Reflects back modify bits to the pager. 2108 * 2109 * This routine may not be called from an interrupt. 2110 * 2111 * No requirements. 2112 */ 2113 static void 2114 pmap_remove_all(vm_page_t m) 2115 { 2116 pt_entry_t *pte, tpte; 2117 pv_entry_t pv; 2118 vm_object_t pmobj; 2119 pmap_t pmap; 2120 2121 #if defined(PMAP_DIAGNOSTIC) 2122 /* 2123 * XXX this makes pmap_page_protect(NONE) illegal for non-managed 2124 * pages! 2125 */ 2126 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 2127 panic("pmap_page_protect: illegal for unmanaged page, va: 0x%08llx", (long long)VM_PAGE_TO_PHYS(m)); 2128 } 2129 #endif 2130 2131 restart: 2132 vm_page_spin_lock(m); 2133 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2134 pmap = pv->pv_pmap; 2135 pmobj = pmap->pm_pteobj; 2136 2137 /* 2138 * Handle reversed lock ordering 2139 */ 2140 if (vm_object_hold_try(pmobj) == 0) { 2141 refcount_acquire(&pmobj->hold_count); 2142 vm_page_spin_unlock(m); 2143 vm_object_lock(pmobj); 2144 vm_page_spin_lock(m); 2145 if (pv != TAILQ_FIRST(&m->md.pv_list) || 2146 pmap != pv->pv_pmap || 2147 pmobj != pmap->pm_pteobj) { 2148 vm_page_spin_unlock(m); 2149 vm_object_drop(pmobj); 2150 goto restart; 2151 } 2152 } 2153 2154 KKASSERT(pmap->pm_stats.resident_count > 0); 2155 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2156 2157 pte = pmap_pte(pmap, pv->pv_va); 2158 KKASSERT(pte != NULL); 2159 2160 tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va); 2161 if (tpte & VPTE_WIRED) 2162 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2163 KKASSERT(pmap->pm_stats.wired_count >= 0); 2164 2165 if (tpte & VPTE_A) 2166 vm_page_flag_set(m, PG_REFERENCED); 2167 2168 /* 2169 * Update the vm_page_t clean and reference bits. 
2170 */ 2171 if (tpte & VPTE_M) { 2172 #if defined(PMAP_DIAGNOSTIC) 2173 if (pmap_nw_modified(tpte)) { 2174 kprintf( 2175 "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2176 pv->pv_va, tpte); 2177 } 2178 #endif 2179 if (pmap_track_modified(pmap, pv->pv_va)) 2180 vm_page_dirty(m); 2181 } 2182 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2183 if (TAILQ_EMPTY(&m->md.pv_list)) 2184 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2185 m->md.pv_list_count--; 2186 KKASSERT(m->md.pv_list_count >= 0); 2187 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 2188 atomic_add_int(&pmap->pm_generation, 1); 2189 vm_page_spin_unlock(m); 2190 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); 2191 free_pv_entry(pv); 2192 2193 vm_object_drop(pmobj); 2194 vm_page_spin_lock(m); 2195 } 2196 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 2197 vm_page_spin_unlock(m); 2198 } 2199 2200 /* 2201 * Removes the page from a particular pmap 2202 */ 2203 void 2204 pmap_remove_specific(pmap_t pmap, vm_page_t m) 2205 { 2206 pt_entry_t *pte, tpte; 2207 pv_entry_t pv; 2208 2209 vm_object_hold(pmap->pm_pteobj); 2210 again: 2211 vm_page_spin_lock(m); 2212 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2213 if (pv->pv_pmap != pmap) 2214 continue; 2215 2216 KKASSERT(pmap->pm_stats.resident_count > 0); 2217 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2218 2219 pte = pmap_pte(pmap, pv->pv_va); 2220 KKASSERT(pte != NULL); 2221 2222 tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va); 2223 if (tpte & VPTE_WIRED) 2224 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2225 KKASSERT(pmap->pm_stats.wired_count >= 0); 2226 2227 if (tpte & VPTE_A) 2228 vm_page_flag_set(m, PG_REFERENCED); 2229 2230 /* 2231 * Update the vm_page_t clean and reference bits. 2232 */ 2233 if (tpte & VPTE_M) { 2234 if (pmap_track_modified(pmap, pv->pv_va)) 2235 vm_page_dirty(m); 2236 } 2237 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2238 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 2239 atomic_add_int(&pmap->pm_generation, 1); 2240 m->md.pv_list_count--; 2241 KKASSERT(m->md.pv_list_count >= 0); 2242 if (TAILQ_EMPTY(&m->md.pv_list)) 2243 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2244 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); 2245 vm_page_spin_unlock(m); 2246 free_pv_entry(pv); 2247 goto again; 2248 } 2249 vm_page_spin_unlock(m); 2250 vm_object_drop(pmap->pm_pteobj); 2251 } 2252 2253 /* 2254 * Set the physical protection on the specified range of this map 2255 * as requested. 2256 * 2257 * This function may not be called from an interrupt if the map is 2258 * not the kernel_pmap. 2259 * 2260 * No requirements. 
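 *
 * Illustrative only (not taken from an actual caller): the VM system
 * typically downgrades a range to read-only with something like
 *
 *	pmap_protect(vmspace_pmap(vm), sva, eva,
 *		     VM_PROT_READ | VM_PROT_EXECUTE);
 *
 * whereas passing neither VM_PROT_READ nor VM_PROT_EXECUTE causes the
 * whole range to be removed instead (see the check at the top of the
 * function).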
2261 */ 2262 void 2263 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2264 { 2265 vm_offset_t va_next; 2266 pml4_entry_t *pml4e; 2267 pdp_entry_t *pdpe; 2268 pd_entry_t ptpaddr, *pde; 2269 pt_entry_t *pte; 2270 vm_page_t pt_m; 2271 2272 if (pmap == NULL) 2273 return; 2274 2275 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) { 2276 pmap_remove(pmap, sva, eva); 2277 return; 2278 } 2279 2280 if (prot & VM_PROT_WRITE) 2281 return; 2282 2283 vm_object_hold(pmap->pm_pteobj); 2284 2285 for (; sva < eva; sva = va_next) { 2286 pml4e = pmap_pml4e(pmap, sva); 2287 if ((*pml4e & VPTE_V) == 0) { 2288 va_next = (sva + NBPML4) & ~PML4MASK; 2289 if (va_next < sva) 2290 va_next = eva; 2291 continue; 2292 } 2293 2294 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2295 if ((*pdpe & VPTE_V) == 0) { 2296 va_next = (sva + NBPDP) & ~PDPMASK; 2297 if (va_next < sva) 2298 va_next = eva; 2299 continue; 2300 } 2301 2302 va_next = (sva + NBPDR) & ~PDRMASK; 2303 if (va_next < sva) 2304 va_next = eva; 2305 2306 pde = pmap_pdpe_to_pde(pdpe, sva); 2307 ptpaddr = *pde; 2308 2309 #if 0 2310 /* 2311 * Check for large page. 2312 */ 2313 if ((ptpaddr & VPTE_PS) != 0) { 2314 /* JG correct? */ 2315 pmap_clean_pde(pde, pmap, sva); 2316 atomic_add_long(&pmap->pm_stats.resident_count, 2317 -NBPDR / PAGE_SIZE); 2318 continue; 2319 } 2320 #endif 2321 2322 /* 2323 * Weed out invalid mappings. Note: we assume that the page 2324 * directory table is always allocated, and in kernel virtual. 2325 */ 2326 if (ptpaddr == 0) 2327 continue; 2328 2329 if (va_next > eva) 2330 va_next = eva; 2331 2332 pt_m = pmap_hold_pt_page(pde, sva); 2333 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2334 sva += PAGE_SIZE) { 2335 /* 2336 * Clean managed pages and also check the accessed 2337 * bit. Just remove write perms for unmanaged 2338 * pages. Be careful of races, turning off write 2339 * access will force a fault rather then setting 2340 * the modified bit at an unexpected time. 2341 */ 2342 pmap_clean_pte(pte, pmap, sva, NULL); 2343 } 2344 vm_page_unhold(pt_m); 2345 } 2346 vm_object_drop(pmap->pm_pteobj); 2347 } 2348 2349 /* 2350 * Enter a managed page into a pmap. If the page is not wired related pmap 2351 * data can be destroyed at any time for later demand-operation. 2352 * 2353 * Insert the vm_page (m) at virtual address (v) in (pmap), with the 2354 * specified protection, and wire the mapping if requested. 2355 * 2356 * NOTE: This routine may not lazy-evaluate or lose information. The 2357 * page must actually be inserted into the given map NOW. 2358 * 2359 * NOTE: When entering a page at a KVA address, the pmap must be the 2360 * kernel_pmap. 2361 * 2362 * No requirements. 2363 */ 2364 void 2365 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2366 boolean_t wired, vm_map_entry_t entry __unused) 2367 { 2368 vm_paddr_t pa; 2369 pv_entry_t pv; 2370 pt_entry_t *pte; 2371 pt_entry_t origpte, newpte; 2372 vm_paddr_t opa; 2373 vm_page_t mpte; 2374 2375 if (pmap == NULL) 2376 return; 2377 2378 va = trunc_page(va); 2379 2380 vm_object_hold(pmap->pm_pteobj); 2381 2382 /* 2383 * Get the page table page. The kernel_pmap's page table pages 2384 * are preallocated and have no associated vm_page_t. 2385 * 2386 * If not NULL, mpte will be busied and we must vm_page_wakeup() 2387 * to cleanup. There will already be at least one wire count from 2388 * it being mapped into its parent. 
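 *
 * For the kernel_pmap the pte can be located directly with vtopte();
 * user pmaps go through pmap_allocpte(), which may allocate the page
 * table page and returns it busied.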
2389  */
2390 	if (pmap == &kernel_pmap) {
2391 		mpte = NULL;
2392 		pte = vtopte(va);
2393 	} else {
2394 		mpte = pmap_allocpte(pmap, va);
2395 		pte = (void *)PHYS_TO_DMAP(mpte->phys_addr);
2396 		pte += pmap_pte_index(va);
2397 	}
2398 
2399 	/*
2400 	 * Deal with races against the kernel's real MMU by cleaning the
2401 	 * page, even if we are re-entering the same page.
2402 	 */
2403 	pa = VM_PAGE_TO_PHYS(m);
2404 	origpte = pmap_inval_loadandclear(pte, pmap, va);
2405 	/*origpte = pmap_clean_pte(pte, pmap, va, NULL);*/
2406 	opa = origpte & VPTE_FRAME;
2407 
2408 	if (origpte & VPTE_PS)
2409 		panic("pmap_enter: attempted pmap_enter on 2MB page");
2410 
2411 	if ((origpte & (VPTE_MANAGED|VPTE_M)) == (VPTE_MANAGED|VPTE_M)) {
2412 		if (pmap_track_modified(pmap, va)) {
2413 			vm_page_t om = PHYS_TO_VM_PAGE(opa);
2414 			vm_page_dirty(om);
2415 		}
2416 	}
2417 
2418 	/*
2419 	 * Mapping has not changed, must be protection or wiring change.
2420 	 */
2421 	if (origpte && (opa == pa)) {
2422 		/*
2423 		 * Wiring change, just update stats.  We don't worry about
2424 		 * wiring PT pages as they remain resident as long as there
2425 		 * are valid mappings in them.  Hence, if a user page is wired,
2426 		 * the PT page will be also.
2427 		 */
2428 		if (wired && ((origpte & VPTE_WIRED) == 0))
2429 			atomic_add_long(&pmap->pm_stats.wired_count, 1);
2430 		else if (!wired && (origpte & VPTE_WIRED))
2431 			atomic_add_long(&pmap->pm_stats.wired_count, -1);
2432 
2433 		if (origpte & VPTE_MANAGED) {
2434 			pa |= VPTE_MANAGED;
2435 			KKASSERT(m->flags & PG_MAPPED);
2436 			KKASSERT(!(m->flags & (PG_FICTITIOUS|PG_UNMANAGED)));
2437 		} else {
2438 			KKASSERT((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)));
2439 		}
2440 		vm_page_spin_lock(m);
2441 		goto validate;
2442 	}
2443 
2444 	/*
2445 	 * Bump the wire_count for the page table page.
2446 	 */
2447 	if (mpte)
2448 		vm_page_wire_quick(mpte);
2449 
2450 	/*
2451 	 * Mapping has changed, invalidate old range and fall through to
2452 	 * handle validating new mapping.  Don't inherit anything from
2453 	 * oldpte.
2454 	 */
2455 	if (opa) {
2456 		int err;
2457 		err = pmap_remove_pte(pmap, NULL, origpte, va);
2458 		origpte = 0;
2459 		if (err)
2460 			panic("pmap_enter: pte vanished, va: 0x%lx", va);
2461 	}
2462 
2463 	/*
2464 	 * Enter on the PV list if part of our managed memory.  Note that we
2465 	 * raise IPL while manipulating pv_table since pmap_enter can be
2466 	 * called at interrupt time.
2467 	 */
2468 	if (pmap_initialized) {
2469 		if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2470 			/*
2471 			 * WARNING!  We are using m's spin-lock as a poor
2472 			 *	     man's pte lock to interlock against
2473 			 *	     pmap_page_protect() operations.
2474 			 *
2475 			 *	     This is a bad hack (obviously).
2476 			 */
2477 			pv = get_pv_entry();
2478 			vm_page_spin_lock(m);
2479 			pmap_insert_entry(pmap, va, mpte, m, pv);
2480 			pa |= VPTE_MANAGED;
2481 			/* vm_page_spin_unlock(m); */
2482 		} else {
2483 			vm_page_spin_lock(m);
2484 		}
2485 	} else {
2486 		vm_page_spin_lock(m);
2487 	}
2488 
2489 	/*
2490 	 * Increment counters
2491 	 */
2492 	atomic_add_long(&pmap->pm_stats.resident_count, 1);
2493 	if (wired)
2494 		atomic_add_long(&pmap->pm_stats.wired_count, 1);
2495 
2496 validate:
2497 	/*
2498 	 * Now validate mapping with desired protection/wiring.
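	 *
	 * The new pte is built from the physical address, the pte_prot()
	 * protection bits and VPTE_V/VPTE_U/VPTE_A, plus VPTE_WIRED for
	 * wired mappings.  If the atomic swap below finds VPTE_M already
	 * set in the old pte, VPTE_M is re-set rather than losing the
	 * modification (the "[M] race" message).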
2499 */ 2500 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | VPTE_V | VPTE_U); 2501 newpte |= VPTE_A; 2502 2503 if (wired) 2504 newpte |= VPTE_WIRED; 2505 // if (pmap != &kernel_pmap) 2506 newpte |= VPTE_U; 2507 if (newpte & VPTE_RW) 2508 vm_page_flag_set(m, PG_WRITEABLE); 2509 KKASSERT((newpte & VPTE_MANAGED) == 0 || (m->flags & PG_MAPPED)); 2510 2511 origpte = atomic_swap_long(pte, newpte); 2512 if (origpte & VPTE_M) { 2513 kprintf("pmap [M] race @ %016jx\n", va); 2514 atomic_set_long(pte, VPTE_M); 2515 } 2516 vm_page_spin_unlock(m); 2517 2518 if (mpte) 2519 vm_page_wakeup(mpte); 2520 vm_object_drop(pmap->pm_pteobj); 2521 } 2522 2523 /* 2524 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 2525 * 2526 * Currently this routine may only be used on user pmaps, not kernel_pmap. 2527 * 2528 * No requirements. 2529 */ 2530 void 2531 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 2532 { 2533 pmap_enter(pmap, va, m, VM_PROT_READ, 0, NULL); 2534 } 2535 2536 /* 2537 * Make a temporary mapping for a physical address. This is only intended 2538 * to be used for panic dumps. 2539 * 2540 * The caller is responsible for calling smp_invltlb(). 2541 */ 2542 void * 2543 pmap_kenter_temporary(vm_paddr_t pa, long i) 2544 { 2545 pmap_kenter_quick(crashdumpmap + (i * PAGE_SIZE), pa); 2546 return ((void *)crashdumpmap); 2547 } 2548 2549 #define MAX_INIT_PT (96) 2550 2551 /* 2552 * This routine preloads the ptes for a given object into the specified pmap. 2553 * This eliminates the blast of soft faults on process startup and 2554 * immediately after an mmap. 2555 * 2556 * No requirements. 2557 */ 2558 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 2559 2560 void 2561 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 2562 vm_object_t object, vm_pindex_t pindex, 2563 vm_size_t size, int limit) 2564 { 2565 struct rb_vm_page_scan_info info; 2566 struct lwp *lp; 2567 vm_size_t psize; 2568 2569 /* 2570 * We can't preinit if read access isn't set or there is no pmap 2571 * or object. 2572 */ 2573 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 2574 return; 2575 2576 /* 2577 * We can't preinit if the pmap is not the current pmap 2578 */ 2579 lp = curthread->td_lwp; 2580 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 2581 return; 2582 2583 /* 2584 * Misc additional checks 2585 */ 2586 psize = x86_64_btop(size); 2587 2588 if ((object->type != OBJT_VNODE) || 2589 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 2590 (object->resident_page_count > MAX_INIT_PT))) { 2591 return; 2592 } 2593 2594 if (psize + pindex > object->size) { 2595 if (object->size < pindex) 2596 return; 2597 psize = object->size - pindex; 2598 } 2599 2600 if (psize == 0) 2601 return; 2602 2603 /* 2604 * Use a red-black scan to traverse the requested range and load 2605 * any valid pages found into the pmap. 2606 * 2607 * We cannot safely scan the object's memq unless we are in a 2608 * critical section since interrupts can remove pages from objects. 
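 *
 * The scan callback below busies each fully-valid, non-fictitious page
 * it finds and enters it read-only via pmap_enter_quick().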
2609 */ 2610 info.start_pindex = pindex; 2611 info.end_pindex = pindex + psize - 1; 2612 info.limit = limit; 2613 info.mpte = NULL; 2614 info.addr = addr; 2615 info.pmap = pmap; 2616 2617 vm_object_hold_shared(object); 2618 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 2619 pmap_object_init_pt_callback, &info); 2620 vm_object_drop(object); 2621 } 2622 2623 static 2624 int 2625 pmap_object_init_pt_callback(vm_page_t p, void *data) 2626 { 2627 struct rb_vm_page_scan_info *info = data; 2628 vm_pindex_t rel_index; 2629 /* 2630 * don't allow an madvise to blow away our really 2631 * free pages allocating pv entries. 2632 */ 2633 if ((info->limit & MAP_PREFAULT_MADVISE) && 2634 vmstats.v_free_count < vmstats.v_free_reserved) { 2635 return(-1); 2636 } 2637 2638 /* 2639 * Ignore list markers and ignore pages we cannot instantly 2640 * busy (while holding the object token). 2641 */ 2642 if (p->flags & PG_MARKER) 2643 return 0; 2644 if (vm_page_busy_try(p, TRUE)) 2645 return 0; 2646 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 2647 (p->flags & PG_FICTITIOUS) == 0) { 2648 if ((p->queue - p->pc) == PQ_CACHE) 2649 vm_page_deactivate(p); 2650 rel_index = p->pindex - info->start_pindex; 2651 pmap_enter_quick(info->pmap, 2652 info->addr + x86_64_ptob(rel_index), p); 2653 } 2654 vm_page_wakeup(p); 2655 return(0); 2656 } 2657 2658 /* 2659 * Return TRUE if the pmap is in shape to trivially 2660 * pre-fault the specified address. 2661 * 2662 * Returns FALSE if it would be non-trivial or if a 2663 * pte is already loaded into the slot. 2664 * 2665 * No requirements. 2666 */ 2667 int 2668 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 2669 { 2670 pt_entry_t *pte; 2671 pd_entry_t *pde; 2672 int ret; 2673 2674 vm_object_hold(pmap->pm_pteobj); 2675 pde = pmap_pde(pmap, addr); 2676 if (pde == NULL || *pde == 0) { 2677 ret = 0; 2678 } else { 2679 pte = pmap_pde_to_pte(pde, addr); 2680 ret = (*pte) ? 0 : 1; 2681 } 2682 vm_object_drop(pmap->pm_pteobj); 2683 2684 return (ret); 2685 } 2686 2687 /* 2688 * Change the wiring attribute for a map/virtual-address pair. 2689 * 2690 * The mapping must already exist in the pmap. 2691 * No other requirements. 2692 */ 2693 vm_page_t 2694 pmap_unwire(pmap_t pmap, vm_offset_t va) 2695 { 2696 pt_entry_t *pte; 2697 vm_paddr_t pa; 2698 vm_page_t m; 2699 2700 if (pmap == NULL) 2701 return NULL; 2702 2703 vm_object_hold(pmap->pm_pteobj); 2704 pte = pmap_pte(pmap, va); 2705 2706 if (pte == NULL || (*pte & VPTE_V) == 0) { 2707 vm_object_drop(pmap->pm_pteobj); 2708 return NULL; 2709 } 2710 2711 /* 2712 * Wiring is not a hardware characteristic so there is no need to 2713 * invalidate TLB. However, in an SMP environment we must use 2714 * a locked bus cycle to update the pte (if we are not using 2715 * the pmap_inval_*() API that is)... it's ok to do this for simple 2716 * wiring changes. 2717 */ 2718 if (pmap_pte_w(pte)) 2719 atomic_add_long(&pmap->pm_stats.wired_count, -1); 2720 /* XXX else return NULL so caller doesn't unwire m ? */ 2721 atomic_clear_long(pte, VPTE_WIRED); 2722 2723 pa = *pte & VPTE_FRAME; 2724 m = PHYS_TO_VM_PAGE(pa); /* held by wired count */ 2725 2726 vm_object_drop(pmap->pm_pteobj); 2727 2728 return m; 2729 } 2730 2731 /* 2732 * Copy the range specified by src_addr/len 2733 * from the source map to the range dst_addr/len 2734 * in the destination map. 2735 * 2736 * This routine is only advisory and need not do anything. 
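 *
 * Because the copy is advisory, leaving this empty simply means the
 * destination pmap is populated by demand faults (e.g. after a fork)
 * rather than by bulk pte copying.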
2737  */
2738 void
2739 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
2740 	  vm_size_t len, vm_offset_t src_addr)
2741 {
2742 	/*
2743 	 * XXX BUGGY.  Among other things srcmpte is assumed to remain
2744 	 * valid through blocking calls, and that's just not going to
2745 	 * be the case.
2746 	 *
2747 	 * FIXME!
2748 	 */
2749 	return;
2750 }
2751 
2752 /*
2753  * pmap_zero_page:
2754  *
2755  *	Zero the specified physical page.
2756  *
2757  *	This function may be called from an interrupt and no locking is
2758  *	required.
2759  */
2760 void
2761 pmap_zero_page(vm_paddr_t phys)
2762 {
2763 	vm_offset_t va = PHYS_TO_DMAP(phys);
2764 
2765 	bzero((void *)va, PAGE_SIZE);
2766 }
2767 
2768 /*
2769  * pmap_zero_page_area:
2770  *
2771  *	Zero part of a physical page by mapping it into memory and clearing
2772  *	its contents with bzero.
2773  *
2774  *	off and size may not cover an area beyond a single hardware page.
2775  */
2776 void
2777 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
2778 {
2779 	vm_offset_t virt = PHYS_TO_DMAP(phys);
2780 
2781 	bzero((char *)virt + off, size);
2782 }
2783 
2784 /*
2785  * pmap_copy_page:
2786  *
2787  *	Copy the physical page from the source PA to the target PA.
2788  *	This function may be called from an interrupt.  No locking
2789  *	is required.
2790  */
2791 void
2792 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
2793 {
2794 	vm_offset_t src_virt, dst_virt;
2795 
2796 	src_virt = PHYS_TO_DMAP(src);
2797 	dst_virt = PHYS_TO_DMAP(dst);
2798 	bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
2799 }
2800 
2801 /*
2802  * pmap_copy_page_frag:
2803  *
2804  *	Copy a fragment of the physical page from the source PA to the
2805  *	target PA.  This function may be called from an interrupt.  No
2806  *	locking is required.
2807  */
2808 void
2809 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
2810 {
2811 	vm_offset_t src_virt, dst_virt;
2812 
2813 	src_virt = PHYS_TO_DMAP(src);
2814 	dst_virt = PHYS_TO_DMAP(dst);
2815 	bcopy((char *)src_virt + (src & PAGE_MASK),
2816 	      (char *)dst_virt + (dst & PAGE_MASK),
2817 	      bytes);
2818 }
2819 
2820 /*
2821  * Returns true if the pmap's pv is one of the first 16 pvs linked to
2822  * from this page.  This count may be changed upwards or downwards
2823  * in the future; it is only necessary that true be returned for a small
2824  * subset of pmaps for proper page aging.
2825  *
2826  * No other requirements.
2827  */
2828 boolean_t
2829 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2830 {
2831 	pv_entry_t pv;
2832 	int loops = 0;
2833 
2834 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2835 		return FALSE;
2836 
2837 	vm_page_spin_lock(m);
2838 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2839 		if (pv->pv_pmap == pmap) {
2840 			vm_page_spin_unlock(m);
2841 			return TRUE;
2842 		}
2843 		loops++;
2844 		if (loops >= 16)
2845 			break;
2846 	}
2847 	vm_page_spin_unlock(m);
2848 
2849 	return (FALSE);
2850 }
2851 
2852 /*
2853  * Remove all pages from the specified address space; this aids process
2854  * exit speeds.  Also, this code is special cased for the current
2855  * process only, but can have the more generic (and slightly slower)
2856  * mode enabled.  This is much faster than pmap_remove in the case
2857  * of running down an entire address space.
2858  *
2859  * No other requirements.
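 *
 * Illustrative only: the process exit path typically tears down every
 * user mapping at once with something like
 *
 *	pmap_remove_pages(vmspace_pmap(vm),
 *			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);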
2860 */ 2861 void 2862 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2863 { 2864 pmap_remove(pmap, sva, eva); 2865 #if 0 2866 pt_entry_t *pte, tpte; 2867 pv_entry_t pv, npv; 2868 vm_page_t m; 2869 int save_generation; 2870 2871 if (pmap->pm_pteobj) 2872 vm_object_hold(pmap->pm_pteobj); 2873 2874 pmap_invalidate_range(pmap, sva, eva); 2875 2876 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 2877 if (pv->pv_va >= eva || pv->pv_va < sva) { 2878 npv = TAILQ_NEXT(pv, pv_plist); 2879 continue; 2880 } 2881 2882 KKASSERT(pmap == pv->pv_pmap); 2883 2884 pte = pmap_pte(pmap, pv->pv_va); 2885 2886 /* 2887 * We cannot remove wired pages from a process' mapping 2888 * at this time 2889 */ 2890 if (*pte & VPTE_WIRED) { 2891 npv = TAILQ_NEXT(pv, pv_plist); 2892 continue; 2893 } 2894 tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va); 2895 2896 m = PHYS_TO_VM_PAGE(tpte & VPTE_FRAME); 2897 vm_page_spin_lock(m); 2898 2899 KASSERT(m < &vm_page_array[vm_page_array_size], 2900 ("pmap_remove_pages: bad tpte %lx", tpte)); 2901 2902 KKASSERT(pmap->pm_stats.resident_count > 0); 2903 atomic_add_long(&pmap->pm_stats.resident_count, -1); 2904 2905 /* 2906 * Update the vm_page_t clean and reference bits. 2907 */ 2908 if (tpte & VPTE_M) { 2909 vm_page_dirty(m); 2910 } 2911 2912 npv = TAILQ_NEXT(pv, pv_plist); 2913 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 2914 atomic_add_int(&pmap->pm_generation, 1); 2915 save_generation = pmap->pm_generation; 2916 m->md.pv_list_count--; 2917 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2918 if (TAILQ_EMPTY(&m->md.pv_list)) 2919 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2920 vm_page_spin_unlock(m); 2921 2922 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); 2923 free_pv_entry(pv); 2924 2925 /* 2926 * Restart the scan if we blocked during the unuse or free 2927 * calls and other removals were made. 2928 */ 2929 if (save_generation != pmap->pm_generation) { 2930 kprintf("Warning: pmap_remove_pages race-A avoided\n"); 2931 npv = TAILQ_FIRST(&pmap->pm_pvlist); 2932 } 2933 } 2934 if (pmap->pm_pteobj) 2935 vm_object_drop(pmap->pm_pteobj); 2936 pmap_remove(pmap, sva, eva); 2937 #endif 2938 } 2939 2940 /* 2941 * pmap_testbit tests bits in active mappings of a VM page. 2942 */ 2943 static boolean_t 2944 pmap_testbit(vm_page_t m, int bit) 2945 { 2946 pv_entry_t pv; 2947 pt_entry_t *pte; 2948 2949 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2950 return FALSE; 2951 2952 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 2953 return FALSE; 2954 2955 vm_page_spin_lock(m); 2956 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2957 /* 2958 * if the bit being tested is the modified bit, then 2959 * mark clean_map and ptes as never 2960 * modified. 2961 */ 2962 if (bit & (VPTE_A|VPTE_M)) { 2963 if (!pmap_track_modified(pv->pv_pmap, pv->pv_va)) 2964 continue; 2965 } 2966 2967 #if defined(PMAP_DIAGNOSTIC) 2968 if (pv->pv_pmap == NULL) { 2969 kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); 2970 continue; 2971 } 2972 #endif 2973 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 2974 if (*pte & bit) { 2975 vm_page_spin_unlock(m); 2976 return TRUE; 2977 } 2978 } 2979 vm_page_spin_unlock(m); 2980 return (FALSE); 2981 } 2982 2983 /* 2984 * This routine is used to clear bits in ptes. Certain bits require special 2985 * handling, in particular (on virtual kernels) the VPTE_M (modify) bit. 2986 * 2987 * This routine is only called with certain VPTE_* bit combinations. 
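 *
 * In practice the combinations used are VPTE_RW (write-protect and
 * synchronize the modified state), VPTE_M (clean only, forcing a
 * re-fault), VPTE_A (clear the accessed bit), and VPTE_RW|VPTE_M,
 * which is not expected and panics below.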
2988 */ 2989 static __inline void 2990 pmap_clearbit(vm_page_t m, int bit) 2991 { 2992 pv_entry_t pv; 2993 pt_entry_t *pte; 2994 pt_entry_t pbits; 2995 vm_object_t pmobj; 2996 pmap_t pmap; 2997 2998 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 2999 if (bit == VPTE_RW) 3000 vm_page_flag_clear(m, PG_WRITEABLE); 3001 return; 3002 } 3003 3004 /* 3005 * Loop over all current mappings setting/clearing as appropos If 3006 * setting RO do we need to clear the VAC? 3007 */ 3008 restart: 3009 vm_page_spin_lock(m); 3010 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3011 /* 3012 * Need the pmap object lock(?) 3013 */ 3014 pmap = pv->pv_pmap; 3015 pmobj = pmap->pm_pteobj; 3016 3017 if (vm_object_hold_try(pmobj) == 0) { 3018 refcount_acquire(&pmobj->hold_count); 3019 vm_page_spin_unlock(m); 3020 vm_object_lock(pmobj); 3021 vm_object_drop(pmobj); 3022 goto restart; 3023 } 3024 3025 /* 3026 * don't write protect pager mappings 3027 */ 3028 if (bit == VPTE_RW) { 3029 if (!pmap_track_modified(pv->pv_pmap, pv->pv_va)) { 3030 vm_object_drop(pmobj); 3031 continue; 3032 } 3033 } 3034 3035 #if defined(PMAP_DIAGNOSTIC) 3036 if (pv->pv_pmap == NULL) { 3037 kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); 3038 vm_object_drop(pmobj); 3039 continue; 3040 } 3041 #endif 3042 3043 /* 3044 * Careful here. We can use a locked bus instruction to 3045 * clear VPTE_A or VPTE_M safely but we need to synchronize 3046 * with the target cpus when we mess with VPTE_RW. 3047 * 3048 * On virtual kernels we must force a new fault-on-write 3049 * in the real kernel if we clear the Modify bit ourselves, 3050 * otherwise the real kernel will not get a new fault and 3051 * will never set our Modify bit again. 3052 */ 3053 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 3054 if (*pte & bit) { 3055 if (bit == VPTE_RW) { 3056 /* 3057 * We must also clear VPTE_M when clearing 3058 * VPTE_RW and synchronize its state to 3059 * the page. 3060 */ 3061 pbits = pmap_clean_pte(pte, pv->pv_pmap, 3062 pv->pv_va, m); 3063 } else if (bit == VPTE_M) { 3064 /* 3065 * We must invalidate the real-kernel pte 3066 * when clearing VPTE_M bit to force the 3067 * real-kernel to take a new fault to re-set 3068 * VPTE_M. 3069 */ 3070 atomic_clear_long(pte, VPTE_M); 3071 if (*pte & VPTE_RW) { 3072 pmap_invalidate_range(pv->pv_pmap, 3073 pv->pv_va, 3074 pv->pv_va + PAGE_SIZE); 3075 } 3076 } else if ((bit & (VPTE_RW|VPTE_M)) == 3077 (VPTE_RW|VPTE_M)) { 3078 /* 3079 * We've been asked to clear W & M, I guess 3080 * the caller doesn't want us to update 3081 * the dirty status of the VM page. 3082 */ 3083 pmap_clean_pte(pte, pv->pv_pmap, pv->pv_va, m); 3084 panic("shouldn't be called"); 3085 } else { 3086 /* 3087 * We've been asked to clear bits that do 3088 * not interact with hardware. 3089 */ 3090 atomic_clear_long(pte, bit); 3091 } 3092 } 3093 vm_object_drop(pmobj); 3094 } 3095 if (bit == VPTE_RW) 3096 vm_page_flag_clear(m, PG_WRITEABLE); 3097 vm_page_spin_unlock(m); 3098 } 3099 3100 /* 3101 * Lower the permission for all mappings to a given page. 3102 * 3103 * No other requirements. 3104 */ 3105 void 3106 pmap_page_protect(vm_page_t m, vm_prot_t prot) 3107 { 3108 if ((prot & VM_PROT_WRITE) == 0) { 3109 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 3110 pmap_clearbit(m, VPTE_RW); 3111 } else { 3112 pmap_remove_all(m); 3113 } 3114 } 3115 } 3116 3117 vm_paddr_t 3118 pmap_phys_address(vm_pindex_t ppn) 3119 { 3120 return (x86_64_ptob(ppn)); 3121 } 3122 3123 /* 3124 * Return a count of reference bits for a page, clearing those bits. 
3125 * It is not necessary for every reference bit to be cleared, but it 3126 * is necessary that 0 only be returned when there are truly no 3127 * reference bits set. 3128 * 3129 * XXX: The exact number of bits to check and clear is a matter that 3130 * should be tested and standardized at some point in the future for 3131 * optimal aging of shared pages. 3132 * 3133 * No other requirements. 3134 */ 3135 int 3136 pmap_ts_referenced(vm_page_t m) 3137 { 3138 pv_entry_t pv, pvf, pvn; 3139 pt_entry_t *pte; 3140 int rtval = 0; 3141 3142 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3143 return (rtval); 3144 3145 vm_page_spin_lock(m); 3146 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3147 pvf = pv; 3148 do { 3149 pvn = TAILQ_NEXT(pv, pv_list); 3150 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3151 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3152 3153 if (!pmap_track_modified(pv->pv_pmap, pv->pv_va)) 3154 continue; 3155 3156 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 3157 3158 if (pte && (*pte & VPTE_A)) { 3159 atomic_clear_long(pte, VPTE_A); 3160 rtval++; 3161 if (rtval > 4) { 3162 break; 3163 } 3164 } 3165 } while ((pv = pvn) != NULL && pv != pvf); 3166 } 3167 vm_page_spin_unlock(m); 3168 3169 return (rtval); 3170 } 3171 3172 /* 3173 * Return whether or not the specified physical page was modified 3174 * in any physical maps. 3175 * 3176 * No other requirements. 3177 */ 3178 boolean_t 3179 pmap_is_modified(vm_page_t m) 3180 { 3181 boolean_t res; 3182 3183 res = pmap_testbit(m, VPTE_M); 3184 3185 return (res); 3186 } 3187 3188 /* 3189 * Clear the modify bits on the specified physical page. For the vkernel 3190 * we really need to clean the page, which clears VPTE_RW and VPTE_M, in 3191 * order to ensure that we take a fault on the next write to the page. 3192 * Otherwise the page may become dirty without us knowing it. 3193 * 3194 * No other requirements. 3195 */ 3196 void 3197 pmap_clear_modify(vm_page_t m) 3198 { 3199 pmap_clearbit(m, VPTE_RW); 3200 } 3201 3202 /* 3203 * Clear the reference bit on the specified physical page. 3204 * 3205 * No other requirements. 3206 */ 3207 void 3208 pmap_clear_reference(vm_page_t m) 3209 { 3210 pmap_clearbit(m, VPTE_A); 3211 } 3212 3213 /* 3214 * Miscellaneous support routines follow 3215 */ 3216 static void 3217 x86_64_protection_init(void) 3218 { 3219 uint64_t *kp; 3220 int prot; 3221 3222 kp = protection_codes; 3223 for (prot = 0; prot < 8; prot++) { 3224 if (prot & VM_PROT_READ) 3225 *kp |= 0; /* R */ 3226 if (prot & VM_PROT_WRITE) 3227 *kp |= VPTE_RW; /* R+W */ 3228 if (prot && (prot & VM_PROT_EXECUTE) == 0) 3229 *kp |= VPTE_NX; /* NX - !executable */ 3230 ++kp; 3231 } 3232 } 3233 3234 /* 3235 * Sets the memory attribute for the specified page. 3236 */ 3237 void 3238 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 3239 { 3240 /* This is a vkernel, do nothing */ 3241 } 3242 3243 /* 3244 * Change the PAT attribute on an existing kernel memory map. Caller 3245 * must ensure that the virtual memory in question is not accessed 3246 * during the adjustment. 3247 */ 3248 void 3249 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 3250 { 3251 /* This is a vkernel, do nothing */ 3252 } 3253 3254 /* 3255 * Perform the pmap work for mincore 3256 * 3257 * No other requirements. 
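 *
 * The return value combines MINCORE_INCORE with the
 * MINCORE_MODIFIED/MINCORE_MODIFIED_OTHER and
 * MINCORE_REFERENCED/MINCORE_REFERENCED_OTHER flags, derived from the
 * pte's VPTE_M/VPTE_A bits and the vm_page_t state.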
3258 */ 3259 int 3260 pmap_mincore(pmap_t pmap, vm_offset_t addr) 3261 { 3262 pt_entry_t *ptep, pte; 3263 vm_page_t m; 3264 int val = 0; 3265 3266 vm_object_hold(pmap->pm_pteobj); 3267 ptep = pmap_pte(pmap, addr); 3268 3269 if (ptep && (pte = *ptep) != 0) { 3270 vm_paddr_t pa; 3271 3272 val = MINCORE_INCORE; 3273 if ((pte & VPTE_MANAGED) == 0) 3274 goto done; 3275 3276 pa = pte & VPTE_FRAME; 3277 3278 m = PHYS_TO_VM_PAGE(pa); 3279 3280 /* 3281 * Modified by us 3282 */ 3283 if (pte & VPTE_M) 3284 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 3285 /* 3286 * Modified by someone 3287 */ 3288 else if (m->dirty || pmap_is_modified(m)) 3289 val |= MINCORE_MODIFIED_OTHER; 3290 /* 3291 * Referenced by us 3292 */ 3293 if (pte & VPTE_A) 3294 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 3295 3296 /* 3297 * Referenced by someone 3298 */ 3299 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { 3300 val |= MINCORE_REFERENCED_OTHER; 3301 vm_page_flag_set(m, PG_REFERENCED); 3302 } 3303 } 3304 done: 3305 vm_object_drop(pmap->pm_pteobj); 3306 3307 return val; 3308 } 3309 3310 /* 3311 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 3312 * vmspace will be ref'd and the old one will be deref'd. 3313 * 3314 * Caller must hold vmspace->vm_map.token for oldvm and newvm 3315 */ 3316 void 3317 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 3318 { 3319 struct vmspace *oldvm; 3320 struct lwp *lp; 3321 3322 oldvm = p->p_vmspace; 3323 if (oldvm != newvm) { 3324 if (adjrefs) 3325 vmspace_ref(newvm); 3326 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 3327 p->p_vmspace = newvm; 3328 KKASSERT(p->p_nthreads == 1); 3329 lp = RB_ROOT(&p->p_lwp_tree); 3330 pmap_setlwpvm(lp, newvm); 3331 if (adjrefs) 3332 vmspace_rel(oldvm); 3333 } 3334 } 3335 3336 /* 3337 * Set the vmspace for a LWP. The vmspace is almost universally set the 3338 * same as the process vmspace, but virtual kernels need to swap out contexts 3339 * on a per-lwp basis. 3340 */ 3341 void 3342 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 3343 { 3344 struct vmspace *oldvm; 3345 struct pmap *pmap; 3346 3347 oldvm = lp->lwp_vmspace; 3348 if (oldvm != newvm) { 3349 crit_enter(); 3350 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 3351 lp->lwp_vmspace = newvm; 3352 if (curthread->td_lwp == lp) { 3353 pmap = vmspace_pmap(newvm); 3354 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 3355 if (pmap->pm_active_lock & CPULOCK_EXCL) 3356 pmap_interlock_wait(newvm); 3357 #if defined(SWTCH_OPTIM_STATS) 3358 tlb_flush_count++; 3359 #endif 3360 pmap = vmspace_pmap(oldvm); 3361 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 3362 mycpu->gd_cpuid); 3363 } 3364 crit_exit(); 3365 } 3366 } 3367 3368 /* 3369 * The swtch code tried to switch in a heavy weight process whos pmap 3370 * is locked by another cpu. We have to wait for the lock to clear before 3371 * the pmap can be used. 
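 *
 * Being a vkernel, the wait is a simple pthread_yield() loop rather
 * than a hardware pause/monitor sequence.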
3372 */ 3373 void 3374 pmap_interlock_wait (struct vmspace *vm) 3375 { 3376 pmap_t pmap = vmspace_pmap(vm); 3377 3378 if (pmap->pm_active_lock & CPULOCK_EXCL) { 3379 crit_enter(); 3380 while (pmap->pm_active_lock & CPULOCK_EXCL) { 3381 cpu_ccfence(); 3382 pthread_yield(); 3383 } 3384 crit_exit(); 3385 } 3386 } 3387 3388 vm_offset_t 3389 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 3390 { 3391 3392 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 3393 return addr; 3394 } 3395 3396 addr = roundup2(addr, NBPDR); 3397 return addr; 3398 } 3399 3400 /* 3401 * Used by kmalloc/kfree, page already exists at va 3402 */ 3403 vm_page_t 3404 pmap_kvtom(vm_offset_t va) 3405 { 3406 vpte_t *ptep; 3407 3408 KKASSERT(va >= KvaStart && va < KvaEnd); 3409 ptep = vtopte(va); 3410 return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); 3411 } 3412 3413 void 3414 pmap_object_init(vm_object_t object) 3415 { 3416 /* empty */ 3417 } 3418 3419 void 3420 pmap_object_free(vm_object_t object) 3421 { 3422 /* empty */ 3423 } 3424 3425 void 3426 pmap_pgscan(struct pmap_pgscan_info *pginfo) 3427 { 3428 pmap_t pmap = pginfo->pmap; 3429 vm_offset_t sva = pginfo->beg_addr; 3430 vm_offset_t eva = pginfo->end_addr; 3431 vm_offset_t va_next; 3432 pml4_entry_t *pml4e; 3433 pdp_entry_t *pdpe; 3434 pd_entry_t ptpaddr, *pde; 3435 pt_entry_t *pte; 3436 vm_page_t pt_m; 3437 int stop = 0; 3438 3439 vm_object_hold(pmap->pm_pteobj); 3440 3441 for (; sva < eva; sva = va_next) { 3442 if (stop) 3443 break; 3444 3445 pml4e = pmap_pml4e(pmap, sva); 3446 if ((*pml4e & VPTE_V) == 0) { 3447 va_next = (sva + NBPML4) & ~PML4MASK; 3448 if (va_next < sva) 3449 va_next = eva; 3450 continue; 3451 } 3452 3453 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3454 if ((*pdpe & VPTE_V) == 0) { 3455 va_next = (sva + NBPDP) & ~PDPMASK; 3456 if (va_next < sva) 3457 va_next = eva; 3458 continue; 3459 } 3460 3461 va_next = (sva + NBPDR) & ~PDRMASK; 3462 if (va_next < sva) 3463 va_next = eva; 3464 3465 pde = pmap_pdpe_to_pde(pdpe, sva); 3466 ptpaddr = *pde; 3467 3468 #if 0 3469 /* 3470 * Check for large page (ignore). 3471 */ 3472 if ((ptpaddr & VPTE_PS) != 0) { 3473 #if 0 3474 pmap_clean_pde(pde, pmap, sva); 3475 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 3476 #endif 3477 continue; 3478 } 3479 #endif 3480 3481 /* 3482 * Weed out invalid mappings. Note: we assume that the page 3483 * directory table is always allocated, and in kernel virtual. 3484 */ 3485 if (ptpaddr == 0) 3486 continue; 3487 3488 if (va_next > eva) 3489 va_next = eva; 3490 3491 pt_m = pmap_hold_pt_page(pde, sva); 3492 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3493 sva += PAGE_SIZE) { 3494 vm_page_t m; 3495 3496 if (stop) 3497 break; 3498 if ((*pte & VPTE_MANAGED) == 0) 3499 continue; 3500 3501 m = PHYS_TO_VM_PAGE(*pte & VPTE_FRAME); 3502 if (vm_page_busy_try(m, TRUE) == 0) { 3503 if (pginfo->callback(pginfo, sva, m) < 0) 3504 stop = 1; 3505 } 3506 } 3507 vm_page_unhold(pt_m); 3508 } 3509 vm_object_drop(pmap->pm_pteobj); 3510 } 3511