/*
 * (MPSAFE)
 *
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2003 Peter Wemm
 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 2008, 2009 The DragonFly Project.
 * Copyright (c) 2008, 2009 Jordan Gordeev.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from:   @(#)pmap.c	7.7 (Berkeley)	5/12/91
 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
 */

/*
 * Manages physical address maps.
 *
 * In addition to hardware address maps, this module is called upon to
 * provide software-use-only maps which may or may not be stored in the
 * same form as hardware maps.  These pseudo-maps are used to store
 * intermediate results from copy operations to and from address spaces.
 *
 * Since the information managed by this module is also stored by the
 * logical address mapping module, this module may throw away valid
 * virtual-to-physical mappings at almost any time.  However,
 * invalidations of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which make
 * virtual-to-physical map invalidates expensive, this module may delay
 * invalidation or protection-reduction operations until such time as
 * they are actually necessary.  This module is given full information
 * as to which processors are currently using which maps, and when
 * physical maps must be made correct.
 */

#if JG
#include "opt_disable_pse.h"
#include "opt_pmap.h"
#endif
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>

#include <sys/user.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine_base/apic/apicreg.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <ddb/ddb.h>

#define PMAP_KEEP_PDIRS
#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

/*
 * Get PDEs and PTEs for user/kernel address space
 */
static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va);
#define pdir_pde(m, v)		(m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pte)		((*(pd_entry_t *)pte & PG_V) != 0)
#define pmap_pte_w(pte)		((*(pt_entry_t *)pte & PG_W) != 0)
#define pmap_pte_m(pte)		((*(pt_entry_t *)pte & PG_M) != 0)
#define pmap_pte_u(pte)		((*(pt_entry_t *)pte & PG_A) != 0)
#define pmap_pte_v(pte)		((*(pt_entry_t *)pte & PG_V) != 0)

/*
 * Given a map and a machine independent protection code, convert to
 * the corresponding x86 protection code.
 */
#define pte_prot(m, p)		\
	(protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
static int protection_codes[8];
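/*
 * For illustration (a sketch, not compiled in): pte_prot() simply
 * indexes protection_codes[] by the low three VM_PROT_* bits, so a
 * lookup such as
 *
 *	pte_prot(&kernel_pmap, VM_PROT_READ | VM_PROT_WRITE)
 *
 * returns whatever i386_protection_init() stored for that combination
 * (typically PG_RW for writable requests; PG_V is or'd in separately).
 */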
struct pmap kernel_pmap;
static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);

vm_paddr_t avail_start;		/* PA of first available physical page */
vm_paddr_t avail_end;		/* PA of last available physical page */
vm_offset_t virtual2_start;	/* cutout free area prior to kernel start */
vm_offset_t virtual2_end;
vm_offset_t virtual_start;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t KvaStart;		/* VA start of KVA space */
vm_offset_t KvaEnd;		/* VA end of KVA space (non-inclusive) */
vm_offset_t KvaSize;		/* max size of kernel virtual address space */
static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
static int pgeflag;		/* PG_G or-in */
static int pseflag;		/* PG_PS or-in */

static vm_object_t kptobj;

static int ndmpdp;
static vm_paddr_t dmaplimit;
static int nkpt;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;

static uint64_t KPTbase;
static uint64_t KPTphys;
static uint64_t KPDphys;	/* phys addr of kernel level 2 */
static uint64_t KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
uint64_t KPDPphys;		/* phys addr of kernel level 3 */
uint64_t KPML4phys;		/* phys addr of kernel level 4 */

static uint64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static struct vm_object pvzone_obj;
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0, *ptmmap;
caddr_t CADDR1 = 0, ptvmmap = 0;
static pt_entry_t *msgbufmap;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

#define DISABLE_PSE

static pv_entry_t get_pv_entry (void);
static void i386_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
				vm_offset_t sva, pmap_inval_info_t info);
static void pmap_remove_page (struct pmap *pmap,
				vm_offset_t va, pmap_inval_info_t info);
static int pmap_remove_entry (struct pmap *pmap, vm_page_t m,
				vm_offset_t va, pmap_inval_info_t info);
static boolean_t pmap_testbit (vm_page_t m, int bit);
static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
				vm_page_t mpte, vm_page_t m);

static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);

static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va);
static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
				pmap_inval_info_t info);
static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

static unsigned pdir4mb;

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static
vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return newaddr;
}
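/*
 * Worked example for pmap_kmem_choose() above: with NBPDR = 2MB
 * (0x200000), an addr of 0xffffffff80301234 yields
 * (addr + 0x1fffff) & ~0x1fffff = 0xffffffff80400000, the next
 * 2MB boundary.
 */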
/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many
 *	of the pv list scans are across different pmaps and it is very
 *	wasteful to do an entire invltlb when checking a single mapping.
 *
 *	Should only be called while in a critical section.
 */
static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);

static
pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	return pmap_pte(pmap, va);
}

/* Return a non-clipped PD index for a given VA */
static __inline
vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return va >> PDRSHIFT;
}

/* Return various clipped indexes for a given VA */
static __inline
vm_pindex_t
pmap_pte_index(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pde_index(vm_offset_t va)
{
	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{
	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{
	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline
pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline
pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline
pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;

	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return NULL;
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline
pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline
pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return NULL;
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline
pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return NULL;
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}
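/*
 * Worked example for the index and walk helpers above (4-level x86_64
 * paging, 512 entries per level).  For va = 0x00007f7f12345678:
 *
 *	pmap_pml4e_index(va) = (va >> 39) & 511 = 254
 *	pmap_pdpe_index(va)  = (va >> 30) & 511 = 508
 *	pmap_pde_index(va)   = (va >> 21) & 511 = 145
 *	pmap_pte_index(va)   = (va >> 12) & 511 = 325
 *
 * pmap_pte() chains pmap_pml4e() -> pmap_pml4e_to_pdpe() ->
 * pmap_pdpe_to_pde() -> pmap_pde_to_pte(), returning NULL if any
 * intermediate level is not PG_V and stopping at the PD level when it
 * encounters a 2MB (PG_PS) mapping.
 */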
static __inline
pt_entry_t *
vtopte(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static __inline
pd_entry_t *
vtopde(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
				  NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

static uint64_t
allocpages(vm_paddr_t *firstaddr, long n)
{
	uint64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

static
void
create_pagetables(vm_paddr_t *firstaddr)
{
	long i;		/* must be 64 bits */
	long nkpt_base;
	long nkpt_phys;

	/*
	 * We are running (mostly) V=P at this point
	 *
	 * Calculate NKPT - number of kernel page tables.  We have to
	 * accommodate preallocation of the vm_page_array, dump bitmap,
	 * MSGBUF_SIZE, and other stuff.  Be generous.
	 *
	 * Maxmem is in pages.
	 */
	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;

	/*
	 * Starting at the beginning of kvm (not KERNBASE).
	 */
	nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
	nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
	nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E +
		       ndmpdp) + 511) / 512;
	nkpt_phys += 128;

	/*
	 * Starting at KERNBASE - map 2G worth of page table pages.
	 * KERNBASE is offset -2G from the end of kvm.
	 */
	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */

	/*
	 * Allocate pages
	 */
	KPTbase = allocpages(firstaddr, nkpt_base);
	KPTphys = allocpages(firstaddr, nkpt_phys);
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
	KPDphys = allocpages(firstaddr, NKPDPE);

	/*
	 * Calculate the page directory base for KERNBASE,
	 * that is where we start populating the page table pages.
	 * Basically this is the end - 2.
	 */
	KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);

	DMPDPphys = allocpages(firstaddr, NDMPML4E);
	if ((amd_feature & AMDID_PAGE1GB) == 0)
		DMPDphys = allocpages(firstaddr, ndmpdp);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/*
	 * Fill in the underlying page table pages for the area around
	 * KERNBASE.  This remaps low physical memory to KERNBASE.
	 *
	 * Read-only from zero to physfree
	 * XXX not fully used, underneath 2M pages
	 */
	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
		((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTbase)[i] |= PG_RW | PG_V | PG_G;
	}

	/*
	 * Now map the initial kernel page tables.  One block of page
	 * tables is placed at the beginning of kernel virtual memory,
	 * and another block is placed at KERNBASE to map the kernel binary,
	 * data, bss, and initial pre-allocations.
	 */
	for (i = 0; i < nkpt_base; i++) {
		((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V;
	}
	for (i = 0; i < nkpt_phys; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
	}

	/*
	 * Map from zero to end of allocations using 2M pages as an
	 * optimization.  This will bypass some of the KPTBase pages
	 * above in the KERNBASE area.
	 */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
		((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V | PG_PS | PG_G;
	}
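	/*
	 * Example of the 2MB PDEs installed by the loop above: iteration
	 * i = 1 produces a PDE of (1 << PDRSHIFT) | PG_RW | PG_V | PG_PS |
	 * PG_G, i.e. one entry mapping physical 0x200000-0x3fffff with no
	 * underlying page table page.
	 */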
	/*
	 * And connect up the PD to the PDP.  The kernel pmap is expected
	 * to pre-populate all of its PDs.  See NKPDPE in vmparam.h.
	 */
	for (i = 0; i < NKPDPE; i++) {
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
				KPDphys + (i << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |=
				PG_RW | PG_V | PG_U;
	}

	/* Now set up the direct map space using either 2MB or 1GB pages */
	/* Preset PG_M and PG_A because demotion expects it */
	if ((amd_feature & AMDID_PAGE1GB) == 0) {
		for (i = 0; i < NPDEPG * ndmpdp; i++) {
			((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
			((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
						       PG_G | PG_M | PG_A;
		}
		/* And the direct map space's PDP */
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
							(i << PAGE_SHIFT);
			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
		}
	} else {
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] =
						(vm_paddr_t)i << PDPSHIFT;
			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
							 PG_G | PG_M | PG_A;
		}
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;

	/* Connect the Direct Map slot up to the PML4 */
	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;

	/* Connect the KVA slot up to the PML4 */
	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
}

/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On the i386 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	struct mdglobaldata *gd;
	int pg;

	KvaStart = VM_MIN_KERNEL_ADDRESS;
	KvaEnd = VM_MAX_KERNEL_ADDRESS;
	KvaSize = KvaEnd - KvaStart;

	avail_start = *firstaddr;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	virtual2_start = KvaStart;
	virtual2_end = PTOV_OFFSET;

	virtual_start = (vm_offset_t)PTOV_OFFSET + *firstaddr;
	virtual_start = pmap_kmem_choose(virtual_start);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	load_cr3(KPML4phys);

	/*
	 * Initialize protection array.
	 */
	i386_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence (XXX and which no longer exists).
	 */
	kernel_pmap.pm_pml4 = (pdp_entry_t *)(PTOV_OFFSET + KPML4phys);
	kernel_pmap.pm_count = 1;
	kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
	TAILQ_INIT(&kernel_pmap.pm_pvlist);
	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_start;
	pte = vtopte(va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);

	/*
	 * ptvmmap is used for reading arbitrary physical pages via
	 * /dev/mem.
	 */
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 * XXX msgbufmap is not used.
	 */
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	       atop(round_page(MSGBUF_SIZE)))
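	/*
	 * For reference, SYSMAP(caddr_t, CMAP1, CADDR1, 1) above expands
	 * to roughly:
	 *
	 *	CADDR1 = (caddr_t)va; va += 1 * PAGE_SIZE;
	 *	CMAP1 = pte; pte += 1;
	 *
	 * carving one page of VA out of the bootstrap cursor and
	 * remembering the matching kernel PTE slot for later use.
	 */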
	virtual_start = va;

	*CMAP1 = 0;

	/*
	 * PG_G is terribly broken on SMP because we IPI invltlb's in some
	 * cases rather than invlpg.  Actually, I don't even know why it
	 * works under UP because self-referential page table mappings
	 */
#ifdef SMP
	pgeflag = 0;
#else
	if (cpu_feature & CPUID_PGE)
		pgeflag = PG_G;
#endif

	/*
	 * Initialize the 4MB page size flag
	 */
	pseflag = 0;
	/*
	 * The 4MB page version of the initial
	 * kernel page mapping.
	 */
	pdir4mb = 0;

#if !defined(DISABLE_PSE)
	if (cpu_feature & CPUID_PSE) {
		pt_entry_t ptditmp;
		/*
		 * Note that we have enabled PSE mode
		 */
		pseflag = PG_PS;
		ptditmp = *(PTmap + x86_64_btop(KERNBASE));
		ptditmp &= ~(NBPDR - 1);
		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
		pdir4mb = ptditmp;

#ifndef SMP
		/*
		 * Enable the PSE mode.  If we are SMP we can't do this
		 * now because the APs will not be able to use it when
		 * they boot up.
		 */
		load_cr4(rcr4() | CR4_PSE);

		/*
		 * We can do the mapping here for the single processor
		 * case.  We simply ignore the old page table page from
		 * now on.
		 */
		/*
		 * For SMP, we still need 4K pages to bootstrap APs,
		 * PSE will be enabled as soon as all APs are up.
		 */
		PTD[KPTDI] = (pd_entry_t)ptditmp;
		cpu_invltlb();
#endif
	}
#endif

	/*
	 * We need to finish setting up the globaldata page for the BSP.
	 * locore has already populated the page table for the mdglobaldata
	 * portion.
	 */
	pg = MDGLOBALDATA_BASEALLOC_PAGES;
	gd = &CPU_prvspace[0].mdglobaldata;

	cpu_invltlb();
}

#ifdef SMP
/*
 * Set 4mb pdir for mp startup
 */
void
pmap_set_opt(void)
{
	if (pseflag && (cpu_feature & CPUID_PSE)) {
		load_cr4(rcr4() | CR4_PSE);
		if (pdir4mb && mycpu->gd_cpuid == 0) {	/* only on BSP */
			cpu_invltlb();
		}
	}
}
#endif

/*
 * XXX: Hack.  Required by pmap_init()
 */
extern vm_offset_t cpu_apic_addr;

/*
 * Initialize the pmap module, called by vm_init() to initialize any
 * structures that the pmap system needs to map virtual memory.
 * pmap_init supports discontiguous physical memory in a fairly
 * consistent way.
 */
void
pmap_init(void)
{
	int i;
	int initial_pvs;

	/*
	 * object for kernel page table pages
	 */
	/* JG I think the number can be arbitrary */
	kptobj = vm_object_allocate(OBJT_DEFAULT, 5);

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */
	for (i = 0; i < vm_page_array_size; i++) {
		vm_page_t m;

		m = &vm_page_array[i];
		TAILQ_INIT(&m->md.pv_list);
		m->md.pv_list_count = 0;
	}

	/*
	 * init the pv free list
	 */
	initial_pvs = vm_page_array_size;
	if (initial_pvs < MINPV)
		initial_pvs = MINPV;
	pvzone = &pvzone_store;
	pvinit = (void *)kmem_alloc(&kernel_map,
				    initial_pvs * sizeof (struct pv_entry));
	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
		  pvinit, initial_pvs);

	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
#ifdef SMP
	/*
	 * XXX: Hack
	 */
	lapic = pmap_mapdev_uncacheable(cpu_apic_addr, sizeof(struct LAPIC));
#endif
}

/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 */
void
pmap_init2(void)
{
	int shpgperproc = PMAP_SHPGPERPROC;
	int entry_max;

	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * Subtract out pages already installed in the zone (hack)
	 */
	entry_max = pv_entry_max - vm_page_array_size;
	if (entry_max <= 0)
		entry_max = 1;

	zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT, 1);
}


/***************************************************
 * Low level helper routines.....
 ***************************************************/

#if defined(PMAP_DIAGNOSTIC)

/*
 * This code checks for non-writeable/modified pages.
 * This should be an invalid condition.
 */
static
int
pmap_nw_modified(pt_entry_t pte)
{
	if ((pte & (PG_M|PG_RW)) == PG_M)
		return 1;
	else
		return 0;
}
#endif


/*
 * This routine defines the region(s) of memory that should
 * not be tested for the modified bit.
 */
static __inline
int
pmap_track_modified(vm_offset_t va)
{
	if ((va < clean_sva) || (va >= clean_eva))
		return 1;
	else
		return 0;
}

/*
 * Extract the physical page address associated with the map/VA pair.
 *
 * The caller must hold vm_token if non-blocking operation is desired.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde, *pdep;

	lwkt_gettoken(&vm_token);
	rtval = 0;
	pdep = pmap_pde(pmap, va);
	if (pdep != NULL) {
		pde = *pdep;
		if (pde) {
			if ((pde & PG_PS) != 0) {
				rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
			} else {
				pte = pmap_pde_to_pte(pdep, va);
				rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
			}
		}
	}
	lwkt_reltoken(&vm_token);
	return rtval;
}
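/*
 * Illustrative call (a sketch, not compiled in; 'va' stands for any
 * address of interest):
 */
#if 0
	vm_paddr_t pa;

	pa = pmap_extract(&kernel_pmap, va);	/* 0 if no valid mapping */
#endif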
/*
 * Extract the physical page address associated with the given kernel
 * virtual address.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t pde;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		pde = *vtopde(va);
		if (pde & PG_PS) {
			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  For example, vtopte() must not
			 * be used to access the PTE because it would use the
			 * new PDE.  It is, however, safe to use the old PDE
			 * because the page table page is preserved by the
			 * promotion.
			 */
			pa = *pmap_pde_to_pte(&pde, va);
			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
		}
	}
	return pa;
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Routine:	pmap_kenter
 * Function:
 *	Add a wired page to the KVA.
 *	NOTE: for the mapping to take effect the caller should issue
 *	an invltlb after calling pmap_kenter().
 */
void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;
	pt_entry_t npte;
	pmap_inval_info info;

	pmap_inval_init(&info);
	npte = pa | PG_RW | PG_V | pgeflag;
	pte = vtopte(va);
	pmap_inval_interlock(&info, &kernel_pmap, va);
	*pte = npte;
	pmap_inval_deinterlock(&info, &kernel_pmap);
	pmap_inval_done(&info);
}

/*
 * Routine:	pmap_kenter_quick
 * Function:
 *	Similar to pmap_kenter(), except we only invalidate the
 *	mapping on the current CPU.
 */
void
pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;
	pt_entry_t npte;

	npte = pa | PG_RW | PG_V | pgeflag;
	pte = vtopte(va);
	*pte = npte;
	cpu_invlpg((void *)va);
}

void
pmap_kenter_sync(vm_offset_t va)
{
	pmap_inval_info info;

	pmap_inval_init(&info);
	pmap_inval_interlock(&info, &kernel_pmap, va);
	pmap_inval_deinterlock(&info, &kernel_pmap);
	pmap_inval_done(&info);
}

void
pmap_kenter_sync_quick(vm_offset_t va)
{
	cpu_invlpg((void *)va);
}

/*
 * Remove a page from the kernel pagetables.
 */
void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;
	pmap_inval_info info;

	pmap_inval_init(&info);
	pte = vtopte(va);
	pmap_inval_interlock(&info, &kernel_pmap, va);
	*pte = 0;
	pmap_inval_deinterlock(&info, &kernel_pmap);
	pmap_inval_done(&info);
}

void
pmap_kremove_quick(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	*pte = 0;
	cpu_invlpg((void *)va);
}

/*
 * XXX these need to be recoded.  They are not used in any critical path.
 */
void
pmap_kmodify_rw(vm_offset_t va)
{
	*vtopte(va) |= PG_RW;
	cpu_invlpg((void *)va);
}

void
pmap_kmodify_nc(vm_offset_t va)
{
	*vtopte(va) |= PG_N;
	cpu_invlpg((void *)va);
}
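/*
 * Typical pairing of the quick routines above for a short-lived kernel
 * mapping (a sketch, not compiled in; 'scratch_va', 'pa' and 'buf' are
 * hypothetical):
 */
#if 0
	pmap_kenter_quick(scratch_va, pa);	/* map, invalidate this cpu */
	bcopy((void *)scratch_va, buf, PAGE_SIZE);
	pmap_kremove_quick(scratch_va);		/* unmap, invalidate this cpu */
#endif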
/*
 * Used to map a range of physical addresses into kernel virtual
 * address space during the low level boot, typically to map the
 * dump bitmap, message buffer, and vm_page_array.
 *
 * These mappings are typically made at some point after the end of the
 * kernel text+data.
 *
 * We could return PHYS_TO_DMAP(start) here and not allocate any
 * via (*virtp), but then kmem from userland and kernel dumps won't
 * have access to the related pointers.
 */
vm_offset_t
pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va;
	vm_offset_t va_start;

	/*return PHYS_TO_DMAP(start);*/

	va_start = *virtp;
	va = va_start;

	while (start < end) {
		pmap_kenter_quick(va, start);
		va += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	*virtp = va;
	return va_start;
}


/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The page *must* be wired.
 */
void
pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
{
	vm_offset_t end_va;

	end_va = va + count * PAGE_SIZE;

	while (va < end_va) {
		pt_entry_t *pte;

		pte = vtopte(va);
		*pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
		cpu_invlpg((void *)va);
		va += PAGE_SIZE;
		m++;
	}
	smp_invltlb();
}

/*
 * This routine jerks page mappings from the
 * kernel -- it is meant only for temporary mappings.
 *
 * MPSAFE, INTERRUPT SAFE (cluster callback)
 */
void
pmap_qremove(vm_offset_t va, int count)
{
	vm_offset_t end_va;

	end_va = va + count * PAGE_SIZE;

	while (va < end_va) {
		pt_entry_t *pte;

		pte = vtopte(va);
		*pte = 0;
		cpu_invlpg((void *)va);
		va += PAGE_SIZE;
	}
	smp_invltlb();
}
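/*
 * Illustrative use of pmap_qenter()/pmap_qremove() (a sketch, not
 * compiled in; 'kva', 'pages' and 'npages' are hypothetical):
 */
#if 0
	pmap_qenter(kva, pages, npages);	/* enter wired pages */
	/* ... access the pages through kva ... */
	pmap_qremove(kva, npages);		/* tear the mappings down */
#endif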
/*
 * This routine works like vm_page_lookup() but also blocks as long as the
 * page is busy.  This routine does not busy the page it returns.
 *
 * Unless the caller is managing objects whose pages are in a known state,
 * the call should be made with a critical section held so the page's object
 * association remains valid on return.
 */
static
vm_page_t
pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	do {
		m = vm_page_lookup(object, pindex);
	} while (m && vm_page_sleep_busy(m, FALSE, "pplookp"));

	return(m);
}

/*
 * Create a new thread and optionally associate it with a (new) process.
 * NOTE! the new thread's cpu may not equal the current cpu.
 */
void
pmap_init_thread(thread_t td)
{
	/* enforce pcb placement & alignment */
	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF);
	td->td_savefpu = &td->td_pcb->pcb_save;
	td->td_sp = (char *)td->td_pcb;	/* no -16 */
}

/*
 * This routine directly affects the fork perf for a process.
 */
void
pmap_init_proc(struct proc *p)
{
}

/*
 * Dispose the UPAGES for a process that has exited.
 * This routine directly impacts the exit perf of a process.
 */
void
pmap_dispose_proc(struct proc *p)
{
	KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p));
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/

/*
 * This routine unholds page table pages, and if the hold count
 * drops to zero, then it decrements the wire count.
 */
static __inline
int
pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
		     pmap_inval_info_t info)
{
	KKASSERT(m->hold_count > 0);
	if (m->hold_count > 1) {
		vm_page_unhold(m);
		return 0;
	} else {
		return _pmap_unwire_pte_hold(pmap, va, m, info);
	}
}

static
int
_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
		      pmap_inval_info_t info)
{
	/*
	 * Wait until we can busy the page ourselves.  We cannot have
	 * any active flushes if we block.  We own one hold count on the
	 * page so it cannot be freed out from under us.
	 */
	if (m->flags & PG_BUSY) {
		pmap_inval_flush(info);
		while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
			;
	}
	KASSERT(m->queue == PQ_NONE,
		("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m));

	/*
	 * This case can occur if new references were acquired while
	 * we were blocked.
	 */
	if (m->hold_count > 1) {
		KKASSERT(m->hold_count > 1);
		vm_page_unhold(m);
		return 0;
	}

	/*
	 * Unmap the page table page
	 */
	KKASSERT(m->hold_count == 1);
	vm_page_busy(m);
	pmap_inval_interlock(info, pmap, -1);

	if (m->pindex >= (NUPDE + NUPDPE)) {
		/* PDP page */
		pml4_entry_t *pml4;
		pml4 = pmap_pml4e(pmap, va);
		*pml4 = 0;
	} else if (m->pindex >= NUPDE) {
		/* PD page */
		pdp_entry_t *pdp;
		pdp = pmap_pdpe(pmap, va);
		*pdp = 0;
	} else {
		/* PT page */
		pd_entry_t *pd;
		pd = pmap_pde(pmap, va);
		*pd = 0;
	}

	KKASSERT(pmap->pm_stats.resident_count > 0);
	--pmap->pm_stats.resident_count;

	if (pmap->pm_ptphint == m)
		pmap->pm_ptphint = NULL;
	pmap_inval_deinterlock(info, pmap);

	if (m->pindex < NUPDE) {
		/* We just released a PT, unhold the matching PD */
		vm_page_t pdpg;

		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
		pmap_unwire_pte_hold(pmap, va, pdpg, info);
	}
	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
		/* We just released a PD, unhold the matching PDP */
		vm_page_t pdppg;

		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
		pmap_unwire_pte_hold(pmap, va, pdppg, info);
	}

	/*
	 * This was our last hold, the page had better be unwired
	 * after we decrement wire_count.
	 *
	 * FUTURE NOTE: shared page directory page could result in
	 * multiple wire counts.
	 */
	vm_page_unhold(m);
	--m->wire_count;
	KKASSERT(m->wire_count == 0);
	--vmstats.v_wire_count;
	vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
	vm_page_flash(m);
	vm_page_free_zero(m);

	return 1;
}
/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static
int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
	      pmap_inval_info_t info)
{
	vm_pindex_t ptepindex;

	if (va >= VM_MAX_USER_ADDRESS)
		return 0;

	if (mpte == NULL) {
		ptepindex = pmap_pde_pindex(va);
#if JGHINT
		if (pmap->pm_ptphint &&
		    (pmap->pm_ptphint->pindex == ptepindex)) {
			mpte = pmap->pm_ptphint;
		} else {
#endif
			pmap_inval_flush(info);
			mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
			pmap->pm_ptphint = mpte;
#if JGHINT
		}
#endif
	}
	return pmap_unwire_pte_hold(pmap, va, mpte, info);
}

/*
 * Initialize pmap0/vmspace0.  This pmap is not added to pmap_list because
 * it, along with IdlePTD, represents the template used to update all
 * other pmaps.
 *
 * On architectures where the kernel pmap is not integrated into the user
 * process pmap, this pmap represents the process pmap, not the kernel pmap.
 * Use kernel_pmap directly to access the kernel pmap.
 */
void
pmap_pinit0(struct pmap *pmap)
{
	pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
	pmap->pm_count = 1;
	pmap->pm_active = 0;
	pmap->pm_ptphint = NULL;
	TAILQ_INIT(&pmap->pm_pvlist);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
void
pmap_pinit(struct pmap *pmap)
{
	vm_page_t ptdpg;

	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pml4 == NULL) {
		pmap->pm_pml4 = (pml4_entry_t *)
			kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
	}

	/*
	 * Allocate an object for the ptes
	 */
	if (pmap->pm_pteobj == NULL) {
		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT,
						NUPDE + NUPDPE + PML4PML4I + 1);
	}

	/*
	 * Allocate the page directory page, unless we already have
	 * one cached.  If we used the cached page the wire_count will
	 * already be set appropriately.
	 */
	if ((ptdpg = pmap->pm_pdirm) == NULL) {
		ptdpg = vm_page_grab(pmap->pm_pteobj,
				     NUPDE + NUPDPE + PML4PML4I,
				     VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
		pmap->pm_pdirm = ptdpg;
		vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY);
		ptdpg->valid = VM_PAGE_BITS_ALL;
		if (ptdpg->wire_count == 0)
			++vmstats.v_wire_count;
		ptdpg->wire_count = 1;
		pmap_kenter((vm_offset_t)pmap->pm_pml4,
			    VM_PAGE_TO_PHYS(ptdpg));
	}
	if ((ptdpg->flags & PG_ZERO) == 0)
		bzero(pmap->pm_pml4, PAGE_SIZE);
#ifdef PMAP_DEBUG
	else
		pmap_page_assertzero(VM_PAGE_TO_PHYS(ptdpg));
#endif

	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;

	/* install self-referential address mapping entry */
	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(ptdpg) |
				   PG_V | PG_RW | PG_A | PG_M;

	pmap->pm_count = 1;
	pmap->pm_active = 0;
	pmap->pm_ptphint = NULL;
	TAILQ_INIT(&pmap->pm_pvlist);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	pmap->pm_stats.resident_count = 1;
}
A great deal of pmap data is 1397 * left passively mapped to improve vmspace management so we have a bit 1398 * of cleanup work to do here. 1399 */ 1400 void 1401 pmap_puninit(pmap_t pmap) 1402 { 1403 vm_page_t p; 1404 1405 KKASSERT(pmap->pm_active == 0); 1406 lwkt_gettoken(&vm_token); 1407 if ((p = pmap->pm_pdirm) != NULL) { 1408 KKASSERT(pmap->pm_pml4 != NULL); 1409 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1410 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1411 p->wire_count--; 1412 vmstats.v_wire_count--; 1413 KKASSERT((p->flags & PG_BUSY) == 0); 1414 vm_page_busy(p); 1415 vm_page_free_zero(p); 1416 pmap->pm_pdirm = NULL; 1417 } 1418 if (pmap->pm_pml4) { 1419 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1420 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1421 pmap->pm_pml4 = NULL; 1422 } 1423 if (pmap->pm_pteobj) { 1424 vm_object_deallocate(pmap->pm_pteobj); 1425 pmap->pm_pteobj = NULL; 1426 } 1427 lwkt_reltoken(&vm_token); 1428 } 1429 1430 /* 1431 * Wire in kernel global address entries. To avoid a race condition 1432 * between pmap initialization and pmap_growkernel, this procedure 1433 * adds the pmap to the master list (which growkernel scans to update), 1434 * then copies the template. 1435 */ 1436 void 1437 pmap_pinit2(struct pmap *pmap) 1438 { 1439 crit_enter(); 1440 lwkt_gettoken(&vm_token); 1441 TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode); 1442 /* XXX copies current process, does not fill in MPPTDI */ 1443 lwkt_reltoken(&vm_token); 1444 crit_exit(); 1445 } 1446 1447 /* 1448 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1449 * 0 on failure (if the procedure had to sleep). 1450 * 1451 * When asked to remove the page directory page itself, we actually just 1452 * leave it cached so we do not have to incur the SMP inval overhead of 1453 * removing the kernel mapping. pmap_puninit() will take care of it. 1454 */ 1455 static 1456 int 1457 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1458 { 1459 /* 1460 * This code optimizes the case of freeing non-busy 1461 * page-table pages. Those pages are zero now, and 1462 * might as well be placed directly into the zero queue. 1463 */ 1464 if (vm_page_sleep_busy(p, FALSE, "pmaprl")) 1465 return 0; 1466 1467 vm_page_busy(p); 1468 1469 /* 1470 * Remove the page table page from the processes address space. 1471 */ 1472 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1473 /* 1474 * We are the pml4 table itself. 1475 */ 1476 /* XXX anything to do here? */ 1477 } else if (p->pindex >= (NUPDE + NUPDPE)) { 1478 /* 1479 * Remove a PDP page from the PML4. We do not maintain 1480 * hold counts on the PML4 page. 1481 */ 1482 pml4_entry_t *pml4; 1483 vm_page_t m4; 1484 int idx; 1485 1486 m4 = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I); 1487 KKASSERT(m4 != NULL); 1488 pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1489 idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG; 1490 KKASSERT(pml4[idx] != 0); 1491 pml4[idx] = 0; 1492 } else if (p->pindex >= NUPDE) { 1493 /* 1494 * Remove a PD page from the PDP and drop the hold count 1495 * on the PDP. The PDP is left cached in the pmap if 1496 * the hold count drops to 0 so the wire count remains 1497 * intact. 
/*
 * Attempt to release and free a vm_page in a pmap.  Returns 1 on success,
 * 0 on failure (if the procedure had to sleep).
 *
 * When asked to remove the page directory page itself, we actually just
 * leave it cached so we do not have to incur the SMP inval overhead of
 * removing the kernel mapping.  pmap_puninit() will take care of it.
 */
static
int
pmap_release_free_page(struct pmap *pmap, vm_page_t p)
{
	/*
	 * This code optimizes the case of freeing non-busy
	 * page-table pages.  Those pages are zero now, and
	 * might as well be placed directly into the zero queue.
	 */
	if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
		return 0;

	vm_page_busy(p);

	/*
	 * Remove the page table page from the process's address space.
	 */
	if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
		/*
		 * We are the pml4 table itself.
		 */
		/* XXX anything to do here? */
	} else if (p->pindex >= (NUPDE + NUPDPE)) {
		/*
		 * Remove a PDP page from the PML4.  We do not maintain
		 * hold counts on the PML4 page.
		 */
		pml4_entry_t *pml4;
		vm_page_t m4;
		int idx;

		m4 = vm_page_lookup(pmap->pm_pteobj,
				    NUPDE + NUPDPE + PML4PML4I);
		KKASSERT(m4 != NULL);
		pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4));
		idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG;
		KKASSERT(pml4[idx] != 0);
		pml4[idx] = 0;
	} else if (p->pindex >= NUPDE) {
		/*
		 * Remove a PD page from the PDP and drop the hold count
		 * on the PDP.  The PDP is left cached in the pmap if
		 * the hold count drops to 0 so the wire count remains
		 * intact.
		 */
		vm_page_t m3;
		pdp_entry_t *pdp;
		int idx;

		m3 = vm_page_lookup(pmap->pm_pteobj,
				NUPDE + NUPDPE + (p->pindex - NUPDE) / NPDPEPG);
		KKASSERT(m3 != NULL);
		pdp = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3));
		idx = (p->pindex - NUPDE) % NPDPEPG;
		KKASSERT(pdp[idx] != 0);
		pdp[idx] = 0;
		m3->hold_count--;
	} else {
		/*
		 * Remove a PT page from the PD and drop the hold count
		 * on the PD.  The PD is left cached in the pmap if
		 * the hold count drops to 0 so the wire count remains
		 * intact.
		 */
		vm_page_t m2;
		pd_entry_t *pd;
		int idx;

		m2 = vm_page_lookup(pmap->pm_pteobj,
				    NUPDE + p->pindex / NPDEPG);
		KKASSERT(m2 != NULL);
		pd = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2));
		idx = p->pindex % NPDEPG;
		pd[idx] = 0;
		m2->hold_count--;
	}

	/*
	 * One fewer mapping in the pmap.  p's hold count had better
	 * be zero.
	 */
	KKASSERT(pmap->pm_stats.resident_count > 0);
	--pmap->pm_stats.resident_count;
	if (p->hold_count)
		panic("pmap_release: freeing held page table page");
	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
		pmap->pm_ptphint = NULL;

	/*
	 * We leave the top-level page table page cached, wired, and mapped in
	 * the pmap until the dtor function (pmap_puninit()) gets called.
	 * However, still clean it up so we can set PG_ZERO.
	 */
	if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
		bzero(pmap->pm_pml4, PAGE_SIZE);
		vm_page_flag_set(p, PG_ZERO);
		vm_page_wakeup(p);
	} else {
		p->wire_count--;
		KKASSERT(p->wire_count == 0);
		vmstats.v_wire_count--;
		/* JG eventually revert to using vm_page_free_zero() */
		vm_page_free(p);
	}
	return 1;
}

/*
 * This routine is called when various levels in the page table need to
 * be populated.  This routine cannot fail.
 */
static
vm_page_t
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
{
	vm_page_t m;

	/*
	 * Find or fabricate a new pagetable page.  This will busy the page.
	 */
	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
			 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
	if ((m->flags & PG_ZERO) == 0) {
		pmap_zero_page(VM_PAGE_TO_PHYS(m));
	}
#ifdef PMAP_DEBUG
	else {
		pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
	}
#endif

	KASSERT(m->queue == PQ_NONE,
		("_pmap_allocpte: %p->queue != PQ_NONE", m));

	/*
	 * Increment the hold count for the page we will be returning to
	 * the caller.
	 */
	m->hold_count++;
	if (m->wire_count++ == 0)
		vmstats.v_wire_count++;
	m->valid = VM_PAGE_BITS_ALL;
	vm_page_flag_clear(m, PG_ZERO);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 *
	 * It is possible that someone else got in and mapped the page
	 * directory page while we were blocked, if so just unbusy and
	 * return the held page.
	 */
	if (ptepindex >= (NUPDE + NUPDPE)) {
		/*
		 * Wire up a new PDP page in the PML4
		 */
		vm_pindex_t pml4index;
		pml4_entry_t *pml4;

		pml4index = ptepindex - (NUPDE + NUPDPE);
		pml4 = &pmap->pm_pml4[pml4index];
		if (*pml4 & PG_V) {
			if (--m->wire_count == 0)
				--vmstats.v_wire_count;
			vm_page_wakeup(m);
			return(m);
		}
		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
	} else if (ptepindex >= NUPDE) {
		/*
		 * Wire up a new PD page in the PDP
		 */
		vm_pindex_t pml4index;
		vm_pindex_t pdpindex;
		vm_page_t pdppg;
		pml4_entry_t *pml4;
		pdp_entry_t *pdp;

		pdpindex = ptepindex - NUPDE;
		pml4index = pdpindex >> NPML4EPGSHIFT;

		pml4 = &pmap->pm_pml4[pml4index];
		if ((*pml4 & PG_V) == 0) {
			/*
			 * Have to allocate a new PDP page, recurse.
			 * This always succeeds.  Returned page will
			 * be held.
			 */
			pdppg = _pmap_allocpte(pmap,
					       NUPDE + NUPDPE + pml4index);
		} else {
			/*
			 * Add a held reference to the PDP page.
			 */
			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
			pdppg->hold_count++;
		}

		/*
		 * Now find the pdp_entry and map the PDP.  If the PDP
		 * has already been mapped unwind and return the
		 * already-mapped PDP held.
		 *
		 * pdppg is left held (hold_count is incremented for
		 * each PD in the PDP).
		 */
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
		if (*pdp & PG_V) {
			vm_page_unhold(pdppg);
			if (--m->wire_count == 0)
				--vmstats.v_wire_count;
			vm_page_wakeup(m);
			return(m);
		}
		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
	} else {
		/*
		 * Wire up the new PT page in the PD
		 */
		vm_pindex_t pml4index;
		vm_pindex_t pdpindex;
		pml4_entry_t *pml4;
		pdp_entry_t *pdp;
		pd_entry_t *pd;
		vm_page_t pdpg;

		pdpindex = ptepindex >> NPDPEPGSHIFT;
		pml4index = pdpindex >> NPML4EPGSHIFT;

		/*
		 * Locate the PDP page in the PML4, then the PD page in
		 * the PDP.  If either does not exist we simply recurse
		 * to allocate them.
		 *
		 * We can just recurse on the PD page as it will recurse
		 * on the PDP if necessary.
		 */
		pml4 = &pmap->pm_pml4[pml4index];
		if ((*pml4 & PG_V) == 0) {
			pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex);
			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
		} else {
			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
			if ((*pdp & PG_V) == 0) {
				pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex);
			} else {
				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
				pdpg->hold_count++;
			}
		}

		/*
		 * Now fill in the pte in the PD.  If the pte already exists
		 * (again, if we raced the grab), unhold pdpg and unwire
		 * m, returning a held m.
		 *
		 * pdpg is left held (hold_count is incremented for
		 * each PT in the PD).
		 */
		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
		if (*pd != 0) {
			vm_page_unhold(pdpg);
			if (--m->wire_count == 0)
				--vmstats.v_wire_count;
			vm_page_wakeup(m);
			return(m);
		}
		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
	}

	/*
	 * We successfully loaded a PDP, PD, or PT page.  Set the page table
	 * hint, valid bits, mapped flag, unbusy, and we're done.
	 */
	pmap->pm_ptphint = m;
	++pmap->pm_stats.resident_count;

#if 0
	m->valid = VM_PAGE_BITS_ALL;
	vm_page_flag_clear(m, PG_ZERO);
#endif
	vm_page_flag_set(m, PG_MAPPED);
	vm_page_wakeup(m);

	return (m);
}

static
vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va)
{
	vm_pindex_t ptepindex;
	pd_entry_t *pd;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_pde_pindex(va);

	/*
	 * Get the page directory entry
	 */
	pd = pmap_pde(pmap, va);

	/*
	 * This supports switching from a 2MB page to a
	 * normal 4K page.
	 */
	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
		panic("no promotion/demotion yet");
		*pd = 0;
		pd = NULL;
		cpu_invltlb();
		smp_invltlb();
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (pd != NULL && (*pd & PG_V) != 0) {
		/* YYY hint is used here on i386 */
		m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
		pmap->pm_ptphint = m;
		m->hold_count++;
		return m;
	}
	/*
	 * Here if the pte page isn't mapped, or if it has been deallocated.
	 */
	return _pmap_allocpte(pmap, ptepindex);
}
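/*
 * Layout of page table pages within pm_pteobj, as used by the
 * allocation and release code above and below (one vm_object indexes
 * every page-table page in the pmap):
 *
 *	[0, NUPDE)			PT pages, one per 2MB of VA
 *	[NUPDE, NUPDE + NUPDPE)		PD pages
 *	[NUPDE + NUPDPE, ...)		PDP pages
 *	NUPDE + NUPDPE + PML4PML4I	the PML4 page itself
 *
 * _pmap_allocpte() recurses up this hierarchy, adding a hold_count
 * reference to a parent page for every child wired beneath it.
 */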
/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
static int pmap_release_callback(struct vm_page *p, void *data);

void
pmap_release(struct pmap *pmap)
{
	vm_object_t object = pmap->pm_pteobj;
	struct rb_vm_page_scan_info info;

	KASSERT(pmap->pm_active == 0,
		("pmap still active! %016jx", (uintmax_t)pmap->pm_active));
#if defined(DIAGNOSTIC)
	if (object->ref_count != 1)
		panic("pmap_release: pteobj reference count != 1");
#endif

	info.pmap = pmap;
	info.object = object;
	crit_enter();
	lwkt_gettoken(&vm_token);
	TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
	crit_exit();

	do {
		crit_enter();
		info.error = 0;
		info.mpte = NULL;
		info.limit = object->generation;

		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					pmap_release_callback, &info);
		if (info.error == 0 && info.mpte) {
			if (!pmap_release_free_page(pmap, info.mpte))
				info.error = 1;
		}
		crit_exit();
	} while (info.error);
	lwkt_reltoken(&vm_token);
}

static
int
pmap_release_callback(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
		info->mpte = p;
		return(0);
	}
	if (!pmap_release_free_page(info->pmap, p)) {
		info->error = 1;
		return(-1);
	}
	if (info->object->generation != info->limit) {
		info->error = 1;
		return(-1);
	}
	return(0);
}

/*
 * Grow the number of kernel page table entries, if needed.
 *
 * This routine is always called to validate any address space
 * beyond KERNBASE (for kldloads).  kernel_vm_end only governs the address
 * space below KERNBASE.
 */
void
pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
{
	vm_paddr_t paddr;
	vm_offset_t ptppaddr;
	vm_page_t nkpg;
	pd_entry_t *pde, newpdir;
	pdp_entry_t newpdp;
	int update_kernel_vm_end;

	crit_enter();
	lwkt_gettoken(&vm_token);

	/*
	 * bootstrap kernel_vm_end on first real VM use
	 */
	if (kernel_vm_end == 0) {
		kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
		nkpt = 0;
		while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & PG_V) != 0) {
			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
					~(PAGE_SIZE * NPTEPG - 1);
			nkpt++;
			if (kernel_vm_end - 1 >= kernel_map.max_offset) {
				kernel_vm_end = kernel_map.max_offset;
				break;
			}
		}
	}

	/*
	 * Fill in the gaps.  kernel_vm_end is only adjusted for ranges
	 * below KERNBASE.  Ranges above KERNBASE are kldloaded and we
	 * do not want to force-fill 128G worth of page tables.
	 */
	if (kstart < KERNBASE) {
		if (kstart > kernel_vm_end)
			kstart = kernel_vm_end;
		KKASSERT(kend <= KERNBASE);
		update_kernel_vm_end = 1;
	} else {
		update_kernel_vm_end = 0;
	}

	kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG);
	kend = roundup2(kend, PAGE_SIZE * NPTEPG);

	if (kend - 1 >= kernel_map.max_offset)
		kend = kernel_map.max_offset;

	while (kstart < kend) {
		pde = pmap_pde(&kernel_pmap, kstart);
		if (pde == NULL) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc(kptobj, nkpt,
					     VM_ALLOC_NORMAL |
					     VM_ALLOC_SYSTEM |
					     VM_ALLOC_INTERRUPT);
			if (nkpg == NULL) {
				panic("pmap_growkernel: no memory to grow "
				      "kernel");
			}
			paddr = VM_PAGE_TO_PHYS(nkpg);
			if ((nkpg->flags & PG_ZERO) == 0)
				pmap_zero_page(paddr);
			vm_page_flag_clear(nkpg, PG_ZERO);
			newpdp = (pdp_entry_t)
				 (paddr | PG_V | PG_RW | PG_A | PG_M);
			*pmap_pdpe(&kernel_pmap, kstart) = newpdp;
			nkpt++;
			continue; /* try again */
		}
		if ((*pde & PG_V) != 0) {
			kstart = (kstart + PAGE_SIZE * NPTEPG) &
				 ~(PAGE_SIZE * NPTEPG - 1);
			if (kstart - 1 >= kernel_map.max_offset) {
				kstart = kernel_map.max_offset;
				break;
			}
			continue;
		}

		/*
		 * This index is bogus, but out of the way
		 */
		nkpg = vm_page_alloc(kptobj, nkpt,
				     VM_ALLOC_NORMAL |
				     VM_ALLOC_SYSTEM |
				     VM_ALLOC_INTERRUPT);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");

		vm_page_wire(nkpg);
		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
		pmap_zero_page(ptppaddr);
		vm_page_flag_clear(nkpg, PG_ZERO);
		newpdir = (pd_entry_t)(ptppaddr | PG_V | PG_RW | PG_A | PG_M);
		*pmap_pde(&kernel_pmap, kstart) = newpdir;
		nkpt++;

		kstart = (kstart + PAGE_SIZE * NPTEPG) &
			 ~(PAGE_SIZE * NPTEPG - 1);

		if (kstart - 1 >= kernel_map.max_offset) {
			kstart = kernel_map.max_offset;
			break;
		}
	}

	/*
	 * Only update kernel_vm_end for areas below KERNBASE.
	 */
	if (update_kernel_vm_end && kernel_vm_end < kstart)
		kernel_vm_end = kstart;

	lwkt_reltoken(&vm_token);
	crit_exit();
}
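/*
 * Sizing note for pmap_growkernel() above: one page table page maps
 * PAGE_SIZE * NPTEPG = 4096 * 512 bytes = 2MB of KVA, which is why
 * kstart/kend are rounded outward to 2MB boundaries before the fill
 * loop runs.
 */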
/***************************************************
 * page management routines.
 ***************************************************/

/*
 * Free the pv_entry back to the free list.  This function may be
 * called from an interrupt.
 */
static __inline
void
free_pv_entry(pv_entry_t pv)
{
	pv_entry_count--;
	KKASSERT(pv_entry_count >= 0);
	zfree(pvzone, pv);
}

/*
 * Get a new pv_entry, allocating a block from the system when needed.
 * This function may be called from an interrupt.
 */
static
pv_entry_t
get_pv_entry(void)
{
	pv_entry_count++;
	if (pv_entry_high_water &&
	    (pv_entry_count > pv_entry_high_water) &&
	    (pmap_pagedaemon_waken == 0)) {
		pmap_pagedaemon_waken = 1;
		wakeup(&vm_pages_needed);
	}
	return zalloc(pvzone);
}

/*
 * This routine is very drastic, but can save the system
 * in a pinch.
 */
void
pmap_collect(void)
{
	int i;
	vm_page_t m;
	static int warningdone = 0;

	if (pmap_pagedaemon_waken == 0)
		return;
	lwkt_gettoken(&vm_token);
	if (warningdone < 5) {
		kprintf("pmap_collect: collecting pv entries -- "
			"suggest increasing PMAP_SHPGPERPROC\n");
		warningdone++;
	}

	for (i = 0; i < vm_page_array_size; i++) {
		m = &vm_page_array[i];
		if (m->wire_count || m->hold_count || m->busy ||
		    (m->flags & PG_BUSY))
			continue;
		pmap_remove_all(m);
	}
	pmap_pagedaemon_waken = 0;
	lwkt_reltoken(&vm_token);
}

/*
 * If it is the first entry on the list, it is actually in the header
 * and we must copy the following entry up to the header.  Otherwise we
 * must search the list for the entry.  In either case we free the now
 * unused entry.
 */
static
int
pmap_remove_entry(struct pmap *pmap, vm_page_t m,
		  vm_offset_t va, pmap_inval_info_t info)
{
	pv_entry_t pv;
	int rtval;

	crit_enter();
	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
			if (pmap == pv->pv_pmap && va == pv->pv_va)
				break;
		}
	} else {
		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
			if (va == pv->pv_va)
				break;
		}
	}

	rtval = 0;
	KKASSERT(pv);

	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	m->md.pv_list_count--;
	m->object->agg_pv_list_count--;
	KKASSERT(m->md.pv_list_count >= 0);
	if (TAILQ_EMPTY(&m->md.pv_list))
		vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
	++pmap->pm_generation;
	rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info);
	free_pv_entry(pv);

	crit_exit();
	return rtval;
}

/*
 * Create a pv entry for page at pa for (pmap, va).
 */
static
void
pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
{
	pv_entry_t pv;

	crit_enter();
	pv = get_pv_entry();
	pv->pv_va = va;
	pv->pv_pmap = pmap;
	pv->pv_ptem = mpte;

	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	++pmap->pm_generation;
	m->md.pv_list_count++;
	m->object->agg_pv_list_count++;

	crit_exit();
}

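/*
 * Editorial note (illustrative sketch, not part of the original source):
 * pmap_insert_entry() and pmap_remove_entry() are the two ends of the
 * pv_entry life cycle.  For every managed mapping the pair of list
 * insertions/removals must stay balanced; a hypothetical managed
 * enter/remove sequence looks like:
 *
 *	pmap_insert_entry(pmap, va, mpte, m);	// links pv on pm_pvlist
 *						// and m->md.pv_list
 *	...
 *	pmap_remove_entry(pmap, m, va, &info);	// unlinks both lists and
 *						// calls free_pv_entry()
 *
 * get_pv_entry() wakes the pagedaemon once pv_entry_high_water is
 * exceeded, and pmap_collect() above is the drastic fallback that
 * reclaims pv entries by unmapping pages.
 */
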
/*
 * pmap_remove_pte: do the things to unmap a page in a process
 */
static
int
pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
		pmap_inval_info_t info)
{
	pt_entry_t oldpte;
	vm_page_t m;

	pmap_inval_interlock(info, pmap, va);
	oldpte = pte_load_clear(ptq);
	pmap_inval_deinterlock(info, pmap);
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;

	/*
	 * Machines that don't support invlpg also don't support PG_G.
	 * XXX PG_G is disabled for SMP so don't worry about the SMP case.
	 */
	if (oldpte & PG_G)
		cpu_invlpg((void *)va);
	KKASSERT(pmap->pm_stats.resident_count > 0);
	--pmap->pm_stats.resident_count;
	if (oldpte & PG_MANAGED) {
		m = PHYS_TO_VM_PAGE(oldpte);
		if (oldpte & PG_M) {
#if defined(PMAP_DIAGNOSTIC)
			if (pmap_nw_modified((pt_entry_t)oldpte)) {
				kprintf("pmap_remove: modified page not "
					"writable: va: 0x%lx, pte: 0x%lx\n",
					va, oldpte);
			}
#endif
			if (pmap_track_modified(va))
				vm_page_dirty(m);
		}
		if (oldpte & PG_A)
			vm_page_flag_set(m, PG_REFERENCED);
		return pmap_remove_entry(pmap, m, va, info);
	} else {
		return pmap_unuse_pt(pmap, va, NULL, info);
	}
}

/*
 * pmap_remove_page:
 *
 *	Remove a single page from a process address space.
 *
 *	This function may not be called from an interrupt if the pmap is
 *	not kernel_pmap.
 */
static
void
pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
{
	pt_entry_t *pte;

	pte = pmap_pte(pmap, va);
	if (pte == NULL)
		return;
	if ((*pte & PG_V) == 0)
		return;
	pmap_remove_pte(pmap, pte, va, info);
}

/*
 * pmap_remove:
 *
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly rounded to
 *	the page size.
 *
 *	This function may not be called from an interrupt if the pmap is
 *	not kernel_pmap.
 */
void
pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t va_next;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t ptpaddr, *pde;
	pt_entry_t *pte;
	struct pmap_inval_info info;

	if (pmap == NULL)
		return;

	lwkt_gettoken(&vm_token);
	if (pmap->pm_stats.resident_count == 0) {
		lwkt_reltoken(&vm_token);
		return;
	}

	pmap_inval_init(&info);

	/*
	 * Special handling of removing one page.  A very common
	 * operation and easy to short circuit some code.
	 */
	if (sva + PAGE_SIZE == eva) {
		pde = pmap_pde(pmap, sva);
		if (pde && (*pde & PG_PS) == 0) {
			pmap_remove_page(pmap, sva, &info);
			pmap_inval_done(&info);
			lwkt_reltoken(&vm_token);
			return;
		}
	}

	for (; sva < eva; sva = va_next) {
		pml4e = pmap_pml4e(pmap, sva);
		if ((*pml4e & PG_V) == 0) {
			va_next = (sva + NBPML4) & ~PML4MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
		if ((*pdpe & PG_V) == 0) {
			va_next = (sva + NBPDP) & ~PDPMASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		/*
		 * Calculate index for next page table.
		 */
		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, sva);
		ptpaddr = *pde;

		/*
		 * Weed out invalid mappings.
		 */
		if (ptpaddr == 0)
			continue;
2311 */ 2312 if ((ptpaddr & PG_PS) != 0) { 2313 /* JG FreeBSD has more complex treatment here */ 2314 pmap_inval_interlock(&info, pmap, -1); 2315 *pde = 0; 2316 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2317 pmap_inval_deinterlock(&info, pmap); 2318 continue; 2319 } 2320 2321 /* 2322 * Limit our scan to either the end of the va represented 2323 * by the current page table page, or to the end of the 2324 * range being removed. 2325 */ 2326 if (va_next > eva) 2327 va_next = eva; 2328 2329 /* 2330 * NOTE: pmap_remove_pte() can block. 2331 */ 2332 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2333 sva += PAGE_SIZE) { 2334 if (*pte == 0) 2335 continue; 2336 if (pmap_remove_pte(pmap, pte, sva, &info)) 2337 break; 2338 } 2339 } 2340 pmap_inval_done(&info); 2341 lwkt_reltoken(&vm_token); 2342 } 2343 2344 /* 2345 * pmap_remove_all: 2346 * 2347 * Removes this physical page from all physical maps in which it resides. 2348 * Reflects back modify bits to the pager. 2349 * 2350 * This routine may not be called from an interrupt. 2351 */ 2352 2353 static 2354 void 2355 pmap_remove_all(vm_page_t m) 2356 { 2357 struct pmap_inval_info info; 2358 pt_entry_t *pte, tpte; 2359 pv_entry_t pv; 2360 2361 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2362 return; 2363 2364 lwkt_gettoken(&vm_token); 2365 pmap_inval_init(&info); 2366 crit_enter(); 2367 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2368 KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0); 2369 --pv->pv_pmap->pm_stats.resident_count; 2370 2371 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 2372 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); 2373 tpte = pte_load_clear(pte); 2374 if (tpte & PG_W) 2375 pv->pv_pmap->pm_stats.wired_count--; 2376 pmap_inval_deinterlock(&info, pv->pv_pmap); 2377 if (tpte & PG_A) 2378 vm_page_flag_set(m, PG_REFERENCED); 2379 2380 /* 2381 * Update the vm_page_t clean and reference bits. 2382 */ 2383 if (tpte & PG_M) { 2384 #if defined(PMAP_DIAGNOSTIC) 2385 if (pmap_nw_modified(tpte)) { 2386 kprintf( 2387 "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2388 pv->pv_va, tpte); 2389 } 2390 #endif 2391 if (pmap_track_modified(pv->pv_va)) 2392 vm_page_dirty(m); 2393 } 2394 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2395 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); 2396 ++pv->pv_pmap->pm_generation; 2397 m->md.pv_list_count--; 2398 m->object->agg_pv_list_count--; 2399 KKASSERT(m->md.pv_list_count >= 0); 2400 if (TAILQ_EMPTY(&m->md.pv_list)) 2401 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2402 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info); 2403 free_pv_entry(pv); 2404 } 2405 crit_exit(); 2406 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 2407 pmap_inval_done(&info); 2408 lwkt_reltoken(&vm_token); 2409 } 2410 2411 /* 2412 * pmap_protect: 2413 * 2414 * Set the physical protection on the specified range of this map 2415 * as requested. 2416 * 2417 * This function may not be called from an interrupt if the map is 2418 * not the kernel_pmap. 
2419 */ 2420 void 2421 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2422 { 2423 vm_offset_t va_next; 2424 pml4_entry_t *pml4e; 2425 pdp_entry_t *pdpe; 2426 pd_entry_t ptpaddr, *pde; 2427 pt_entry_t *pte; 2428 pmap_inval_info info; 2429 2430 /* JG review for NX */ 2431 2432 if (pmap == NULL) 2433 return; 2434 2435 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2436 pmap_remove(pmap, sva, eva); 2437 return; 2438 } 2439 2440 if (prot & VM_PROT_WRITE) 2441 return; 2442 2443 lwkt_gettoken(&vm_token); 2444 pmap_inval_init(&info); 2445 2446 for (; sva < eva; sva = va_next) { 2447 2448 pml4e = pmap_pml4e(pmap, sva); 2449 if ((*pml4e & PG_V) == 0) { 2450 va_next = (sva + NBPML4) & ~PML4MASK; 2451 if (va_next < sva) 2452 va_next = eva; 2453 continue; 2454 } 2455 2456 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2457 if ((*pdpe & PG_V) == 0) { 2458 va_next = (sva + NBPDP) & ~PDPMASK; 2459 if (va_next < sva) 2460 va_next = eva; 2461 continue; 2462 } 2463 2464 va_next = (sva + NBPDR) & ~PDRMASK; 2465 if (va_next < sva) 2466 va_next = eva; 2467 2468 pde = pmap_pdpe_to_pde(pdpe, sva); 2469 ptpaddr = *pde; 2470 2471 /* 2472 * Check for large page. 2473 */ 2474 if ((ptpaddr & PG_PS) != 0) { 2475 pmap_inval_interlock(&info, pmap, -1); 2476 *pde &= ~(PG_M|PG_RW); 2477 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2478 pmap_inval_deinterlock(&info, pmap); 2479 continue; 2480 } 2481 2482 /* 2483 * Weed out invalid mappings. Note: we assume that the page 2484 * directory table is always allocated, and in kernel virtual. 2485 */ 2486 if (ptpaddr == 0) 2487 continue; 2488 2489 if (va_next > eva) 2490 va_next = eva; 2491 2492 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2493 sva += PAGE_SIZE) { 2494 pt_entry_t pbits; 2495 pt_entry_t cbits; 2496 vm_page_t m; 2497 2498 /* 2499 * XXX non-optimal. Note also that there can be 2500 * no pmap_inval_flush() calls until after we modify 2501 * ptbase[sindex] (or otherwise we have to do another 2502 * pmap_inval_add() call). 2503 */ 2504 pmap_inval_interlock(&info, pmap, sva); 2505 again: 2506 pbits = *pte; 2507 cbits = pbits; 2508 if ((pbits & PG_V) == 0) { 2509 pmap_inval_deinterlock(&info, pmap); 2510 continue; 2511 } 2512 if (pbits & PG_MANAGED) { 2513 m = NULL; 2514 if (pbits & PG_A) { 2515 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2516 vm_page_flag_set(m, PG_REFERENCED); 2517 cbits &= ~PG_A; 2518 } 2519 if (pbits & PG_M) { 2520 if (pmap_track_modified(sva)) { 2521 if (m == NULL) 2522 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2523 vm_page_dirty(m); 2524 cbits &= ~PG_M; 2525 } 2526 } 2527 } 2528 cbits &= ~PG_RW; 2529 if (pbits != cbits && 2530 !atomic_cmpset_long(pte, pbits, cbits)) { 2531 goto again; 2532 } 2533 pmap_inval_deinterlock(&info, pmap); 2534 } 2535 } 2536 pmap_inval_done(&info); 2537 lwkt_reltoken(&vm_token); 2538 } 2539 2540 /* 2541 * Insert the given physical page (p) at 2542 * the specified virtual address (v) in the 2543 * target physical map with the protection requested. 2544 * 2545 * If specified, the page will be wired down, meaning 2546 * that the related pte can not be reclaimed. 2547 * 2548 * NB: This is the only routine which MAY NOT lazy-evaluate 2549 * or lose information. That is, this routine must actually 2550 * insert this page into the given map NOW. 
2551 */ 2552 void 2553 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2554 boolean_t wired) 2555 { 2556 vm_paddr_t pa; 2557 pd_entry_t *pde; 2558 pt_entry_t *pte; 2559 vm_paddr_t opa; 2560 pt_entry_t origpte, newpte; 2561 vm_page_t mpte; 2562 pmap_inval_info info; 2563 2564 if (pmap == NULL) 2565 return; 2566 2567 va = trunc_page(va); 2568 #ifdef PMAP_DIAGNOSTIC 2569 if (va >= KvaEnd) 2570 panic("pmap_enter: toobig"); 2571 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 2572 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); 2573 #endif 2574 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 2575 kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n"); 2576 #ifdef DDB 2577 db_print_backtrace(); 2578 #endif 2579 } 2580 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 2581 kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n"); 2582 #ifdef DDB 2583 db_print_backtrace(); 2584 #endif 2585 } 2586 2587 lwkt_gettoken(&vm_token); 2588 2589 /* 2590 * In the case that a page table page is not 2591 * resident, we are creating it here. 2592 */ 2593 if (va < VM_MAX_USER_ADDRESS) 2594 mpte = pmap_allocpte(pmap, va); 2595 else 2596 mpte = NULL; 2597 2598 pmap_inval_init(&info); 2599 pde = pmap_pde(pmap, va); 2600 if (pde != NULL && (*pde & PG_V) != 0) { 2601 if ((*pde & PG_PS) != 0) 2602 panic("pmap_enter: attempted pmap_enter on 2MB page"); 2603 pte = pmap_pde_to_pte(pde, va); 2604 } else 2605 panic("pmap_enter: invalid page directory va=%#lx", va); 2606 2607 KKASSERT(pte != NULL); 2608 pa = VM_PAGE_TO_PHYS(m); 2609 origpte = *pte; 2610 opa = origpte & PG_FRAME; 2611 2612 /* 2613 * Mapping has not changed, must be protection or wiring change. 2614 */ 2615 if (origpte && (opa == pa)) { 2616 /* 2617 * Wiring change, just update stats. We don't worry about 2618 * wiring PT pages as they remain resident as long as there 2619 * are valid mappings in them. Hence, if a user page is wired, 2620 * the PT page will be also. 2621 */ 2622 if (wired && ((origpte & PG_W) == 0)) 2623 pmap->pm_stats.wired_count++; 2624 else if (!wired && (origpte & PG_W)) 2625 pmap->pm_stats.wired_count--; 2626 2627 #if defined(PMAP_DIAGNOSTIC) 2628 if (pmap_nw_modified(origpte)) { 2629 kprintf( 2630 "pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2631 va, origpte); 2632 } 2633 #endif 2634 2635 /* 2636 * Remove the extra pte reference. Note that we cannot 2637 * optimize the RO->RW case because we have adjusted the 2638 * wiring count above and may need to adjust the wiring 2639 * bits below. 2640 */ 2641 if (mpte) 2642 mpte->hold_count--; 2643 2644 /* 2645 * We might be turning off write access to the page, 2646 * so we go ahead and sense modify status. 2647 */ 2648 if (origpte & PG_MANAGED) { 2649 if ((origpte & PG_M) && pmap_track_modified(va)) { 2650 vm_page_t om; 2651 om = PHYS_TO_VM_PAGE(opa); 2652 vm_page_dirty(om); 2653 } 2654 pa |= PG_MANAGED; 2655 KKASSERT(m->flags & PG_MAPPED); 2656 } 2657 goto validate; 2658 } 2659 /* 2660 * Mapping has changed, invalidate old range and fall through to 2661 * handle validating new mapping. 2662 */ 2663 while (opa) { 2664 int err; 2665 err = pmap_remove_pte(pmap, pte, va, &info); 2666 if (err) 2667 panic("pmap_enter: pte vanished, va: 0x%lx", va); 2668 origpte = *pte; 2669 opa = origpte & PG_FRAME; 2670 if (opa) { 2671 kprintf("pmap_enter: Warning, raced pmap %p va %p\n", 2672 pmap, (void *)va); 2673 } 2674 } 2675 2676 /* 2677 * Enter on the PV list if part of our managed memory. 

	/*
	 * Enter on the PV list if part of our managed memory.  Note that
	 * we raise IPL while manipulating pv_table since pmap_enter can
	 * be called at interrupt time.
	 */
	if (pmap_initialized &&
	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
		pmap_insert_entry(pmap, va, mpte, m);
		pa |= PG_MANAGED;
		vm_page_flag_set(m, PG_MAPPED);
	}

	/*
	 * Increment counters
	 */
	++pmap->pm_stats.resident_count;
	if (wired)
		pmap->pm_stats.wired_count++;

validate:
	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V);

	if (wired)
		newpte |= PG_W;
	if (va < VM_MAX_USER_ADDRESS)
		newpte |= PG_U;
	if (pmap == &kernel_pmap)
		newpte |= pgeflag;

	/*
	 * If the mapping or permission bits are different, we need
	 * to update the pte.
	 */
	if ((origpte & ~(PG_M|PG_A)) != newpte) {
		pmap_inval_interlock(&info, pmap, va);
		*pte = newpte | PG_A;
		pmap_inval_deinterlock(&info, pmap);
		if (newpte & PG_RW)
			vm_page_flag_set(m, PG_WRITEABLE);
	}
	KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
	pmap_inval_done(&info);
	lwkt_reltoken(&vm_token);
}

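/*
 * Editorial note (illustrative sketch, not part of the original source):
 * pmap_enter() is the mandatory-insert primitive used by the fault
 * path.  A hypothetical caller resolving a write fault would do:
 *
 *	pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE, FALSE);
 *
 * after which the translation is guaranteed to be present (no lazy
 * evaluation), and wired only if the last argument is TRUE.
 */
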
/*
 * This code works like pmap_enter() but assumes VM_PROT_READ and
 * not-wired.  This code also assumes that the pmap has no pre-existing
 * entry for this VA.
 *
 * This code currently may only be used on user pmaps, not kernel_pmap.
 */
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pt_entry_t *pte;
	vm_paddr_t pa;
	vm_page_t mpte;
	vm_pindex_t ptepindex;
	pd_entry_t *ptepa;
	pmap_inval_info info;

	lwkt_gettoken(&vm_token);
	pmap_inval_init(&info);

	if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
		kprintf("Warning: pmap_enter_quick called on UVA with "
			"kernel_pmap\n");
#ifdef DDB
		db_print_backtrace();
#endif
	}
	if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
		kprintf("Warning: pmap_enter_quick called on KVA without "
			"kernel_pmap\n");
#ifdef DDB
		db_print_backtrace();
#endif
	}

	KKASSERT(va < UPT_MIN_ADDRESS);	/* assert used on user pmaps only */

	/*
	 * Calculate the page table page (mpte), allocating it if
	 * necessary.
	 *
	 * A held page table page (mpte), or NULL, is passed onto the
	 * section following.
	 */
	if (va < VM_MAX_USER_ADDRESS) {
		/*
		 * Calculate pagetable page index
		 */
		ptepindex = pmap_pde_pindex(va);

		do {
			/*
			 * Get the page directory entry
			 */
			ptepa = pmap_pde(pmap, va);

			/*
			 * If the page table page is mapped, we just
			 * increment the hold count, and activate it.
			 */
			if (ptepa && (*ptepa & PG_V) != 0) {
				if (*ptepa & PG_PS)
					panic("pmap_enter_quick: unexpected "
					      "mapping into 2MB page");
//				if (pmap->pm_ptphint &&
//				    (pmap->pm_ptphint->pindex == ptepindex)) {
//					mpte = pmap->pm_ptphint;
//				} else {
					mpte = pmap_page_lookup(pmap->pm_pteobj,
								ptepindex);
					pmap->pm_ptphint = mpte;
//				}
				if (mpte)
					mpte->hold_count++;
			} else {
				mpte = _pmap_allocpte(pmap, ptepindex);
			}
		} while (mpte == NULL);
	} else {
		mpte = NULL;
		/* this code path is not yet used */
	}

	/*
	 * With a valid (and held) page directory page, we can just use
	 * vtopte() to get to the pte.  If the pte is already present we
	 * do not disturb it.
	 */
	pte = vtopte(va);
	if (*pte & PG_V) {
		if (mpte)
			pmap_unwire_pte_hold(pmap, va, mpte, &info);
		pa = VM_PAGE_TO_PHYS(m);
		KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
		pmap_inval_done(&info);
		lwkt_reltoken(&vm_token);
		return;
	}

	/*
	 * Enter on the PV list if part of our managed memory
	 */
	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
		pmap_insert_entry(pmap, va, mpte, m);
		vm_page_flag_set(m, PG_MAPPED);
	}

	/*
	 * Increment counters
	 */
	++pmap->pm_stats.resident_count;

	pa = VM_PAGE_TO_PHYS(m);

	/*
	 * Now validate mapping with RO protection
	 */
	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
		*pte = pa | PG_V | PG_U;
	else
		*pte = pa | PG_V | PG_U | PG_MANAGED;
	/* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
	pmap_inval_done(&info);
	lwkt_reltoken(&vm_token);
}

/*
 * Make a temporary mapping for a physical address.  This is only
 * intended to be used for panic dumps.
 */
/* JG Needed on x86_64? */
void *
pmap_kenter_temporary(vm_paddr_t pa, long i)
{
	pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
	return ((void *)crashdumpmap);
}

#define MAX_INIT_PT	(96)

/*
 * This routine preloads the ptes for a given object into the specified
 * pmap.  This eliminates the blast of soft faults on process startup
 * and immediately after an mmap.
 */
static int pmap_object_init_pt_callback(vm_page_t p, void *data);

void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
		    vm_object_t object, vm_pindex_t pindex,
		    vm_size_t size, int limit)
{
	struct rb_vm_page_scan_info info;
	struct lwp *lp;
	vm_size_t psize;

	/*
	 * We can't preinit if read access isn't set or there is no pmap
	 * or object.
	 */
	if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
		return;

	/*
	 * We can't preinit if the pmap is not the current pmap
	 */
	lp = curthread->td_lwp;
	if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
		return;

	psize = x86_64_btop(size);

	if ((object->type != OBJT_VNODE) ||
	    ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
	     (object->resident_page_count > MAX_INIT_PT))) {
		return;
	}

	if (psize + pindex > object->size) {
		if (object->size < pindex)
			return;
		psize = object->size - pindex;
	}

	if (psize == 0)
		return;

	/*
	 * Use a red-black scan to traverse the requested range and load
	 * any valid pages found into the pmap.
	 *
	 * We cannot safely scan the object's memq unless we are in a
	 * critical section since interrupts can remove pages from
	 * objects.
	 */
	info.start_pindex = pindex;
	info.end_pindex = pindex + psize - 1;
	info.limit = limit;
	info.mpte = NULL;
	info.addr = addr;
	info.pmap = pmap;

	crit_enter();
	lwkt_gettoken(&vm_token);
	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
				pmap_object_init_pt_callback, &info);
	lwkt_reltoken(&vm_token);
	crit_exit();
}

static
int
pmap_object_init_pt_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	vm_pindex_t rel_index;

	/*
	 * Don't allow an madvise to blow away our really free pages
	 * by allocating pv entries.
	 */
	if ((info->limit & MAP_PREFAULT_MADVISE) &&
	    vmstats.v_free_count < vmstats.v_free_reserved) {
		return(-1);
	}
	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
	    (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
		if ((p->queue - p->pc) == PQ_CACHE)
			vm_page_deactivate(p);
		vm_page_busy(p);
		rel_index = p->pindex - info->start_pindex;
		pmap_enter_quick(info->pmap,
				 info->addr + x86_64_ptob(rel_index), p);
		vm_page_wakeup(p);
	}
	return(0);
}

/*
 * Return TRUE if the pmap is in shape to trivially pre-fault the
 * specified address.
 *
 * Returns FALSE if it would be non-trivial or if a pte is already
 * loaded into the slot.
 */
int
pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
{
	pt_entry_t *pte;
	pd_entry_t *pde;
	int ret;

	lwkt_gettoken(&vm_token);
	pde = pmap_pde(pmap, addr);
	if (pde == NULL || *pde == 0) {
		ret = 0;
	} else {
		pte = vtopte(addr);
		ret = (*pte) ? 0 : 1;
	}
	lwkt_reltoken(&vm_token);
	return(ret);
}

/*
 * Routine:	pmap_change_wiring
 * Function:	Change the wiring attribute for a map/virtual-address
 *		pair.
 * In/out conditions:
 *		The mapping must already exist in the pmap.
 */
void
pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
{
	pt_entry_t *pte;

	if (pmap == NULL)
		return;

	lwkt_gettoken(&vm_token);
	pte = pmap_pte(pmap, va);

	if (wired && !pmap_pte_w(pte))
		pmap->pm_stats.wired_count++;
	else if (!wired && pmap_pte_w(pte))
		pmap->pm_stats.wired_count--;

	/*
	 * Wiring is not a hardware characteristic so there is no need
	 * to invalidate the TLB.  However, in an SMP environment we must
	 * use a locked bus cycle to update the pte (if we are not using
	 * the pmap_inval_*() API that is)... it's ok to do this for
	 * simple wiring changes.
	 */
#ifdef SMP
	if (wired)
		atomic_set_long(pte, PG_W);
	else
		atomic_clear_long(pte, PG_W);
#else
	if (wired)
		atomic_set_long_nonlocked(pte, PG_W);
	else
		atomic_clear_long_nonlocked(pte, PG_W);
#endif
	lwkt_reltoken(&vm_token);
}

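/*
 * Editorial note (illustrative sketch, not part of the original source):
 * Because PG_W is a software-only bit, pmap_change_wiring() touches the
 * PTE without any TLB invalidation.  A hypothetical mlock()-style path
 * would simply do:
 *
 *	pmap_change_wiring(pmap, va, TRUE);	// sets PG_W, bumps
 *						// pm_stats.wired_count
 *	...
 *	pmap_change_wiring(pmap, va, FALSE);	// clears PG_W again
 *
 * The atomic_*_long() forms are only needed so that concurrent hardware
 * updates of PG_A/PG_M on other cpus are not lost.
 */
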
/*
 * Copy the range specified by src_addr/len from the source map to
 * the range dst_addr/len in the destination map.
 *
 * This routine is only advisory and need not do anything.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
	  vm_size_t len, vm_offset_t src_addr)
{
	return;
#if 0
	pmap_inval_info info;
	vm_offset_t addr;
	vm_offset_t end_addr = src_addr + len;
	vm_offset_t pdnxt;
	pd_entry_t src_frame, dst_frame;
	vm_page_t m;

	if (dst_addr != src_addr)
		return;
#if JGPMAP32
	src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
	if (src_frame != (PTDpde & PG_FRAME)) {
		return;
	}

	dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
	if (dst_frame != (APTDpde & PG_FRAME)) {
		APTDpde = (pd_entry_t)(dst_frame | PG_RW | PG_V);
		/* The page directory is not shared between CPUs */
		cpu_invltlb();
	}
#endif
	pmap_inval_init(&info);
	pmap_inval_add(&info, dst_pmap, -1);
	pmap_inval_add(&info, src_pmap, -1);

	/*
	 * Critical section protection is required to maintain the
	 * page/object association; interrupts can free pages and remove
	 * them from their objects.
	 */
	crit_enter();
	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
		pt_entry_t *src_pte, *dst_pte;
		vm_page_t dstmpte, srcmpte;
		vm_offset_t srcptepaddr;
		vm_pindex_t ptepindex;

		if (addr >= UPT_MIN_ADDRESS)
			panic("pmap_copy: invalid to pmap_copy page tables\n");

		/*
		 * Don't let optional prefaulting of pages make us go way
		 * below the low water mark of free pages or way above the
		 * high water mark of used pv entries.
		 */
		if (vmstats.v_free_count < vmstats.v_free_reserved ||
		    pv_entry_count > pv_entry_high_water)
			break;

		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
		ptepindex = addr >> PDRSHIFT;

#if JGPMAP32
		srcptepaddr = (vm_offset_t)src_pmap->pm_pdir[ptepindex];
#endif
		if (srcptepaddr == 0)
			continue;

		if (srcptepaddr & PG_PS) {
#if JGPMAP32
			if (dst_pmap->pm_pdir[ptepindex] == 0) {
				dst_pmap->pm_pdir[ptepindex] = (pd_entry_t)srcptepaddr;
				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
			}
#endif
			continue;
		}

		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
		if ((srcmpte == NULL) || (srcmpte->hold_count == 0) ||
		    (srcmpte->flags & PG_BUSY)) {
			continue;
		}

		if (pdnxt > end_addr)
			pdnxt = end_addr;

		src_pte = vtopte(addr);
#if JGPMAP32
		dst_pte = avtopte(addr);
#endif
		while (addr < pdnxt) {
			pt_entry_t ptetemp;

			ptetemp = *src_pte;
			/*
			 * We only virtual copy managed pages.
			 */
			if ((ptetemp & PG_MANAGED) != 0) {
				/*
				 * We have to check after allocpte for the
				 * pte still being around...  allocpte can
				 * block.
				 *
				 * pmap_allocpte() can block.  If we lose
				 * our page directory mappings we stop.
				 */
				dstmpte = pmap_allocpte(dst_pmap, addr);

#if JGPMAP32
				if (src_frame != (PTDpde & PG_FRAME) ||
				    dst_frame != (APTDpde & PG_FRAME)) {
					kprintf("WARNING: pmap_copy: detected and corrected race\n");
					pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
					goto failed;
				} else if ((*dst_pte == 0) &&
					   (ptetemp = *src_pte) != 0 &&
					   (ptetemp & PG_MANAGED)) {
					/*
					 * Clear the modified and accessed
					 * (referenced) bits during the copy.
					 */
					m = PHYS_TO_VM_PAGE(ptetemp);
					*dst_pte = ptetemp & ~(PG_M | PG_A);
					++dst_pmap->pm_stats.resident_count;
					pmap_insert_entry(dst_pmap, addr,
							  dstmpte, m);
					KKASSERT(m->flags & PG_MAPPED);
				} else {
					kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n");
					pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
					goto failed;
				}
#endif
				if (dstmpte->hold_count >= srcmpte->hold_count)
					break;
			}
			addr += PAGE_SIZE;
			src_pte++;
			dst_pte++;
		}
	}
failed:
	crit_exit();
	pmap_inval_done(&info);
#endif
}

/*
 * pmap_zero_page:
 *
 *	Zero the specified physical page.
 *
 *	This function may be called from an interrupt and no locking is
 *	required.
 */
void
pmap_zero_page(vm_paddr_t phys)
{
	vm_offset_t va = PHYS_TO_DMAP(phys);

	pagezero((void *)va);
}

/*
 * pmap_page_assertzero:
 *
 *	Assert that a page is empty, panic if it isn't.
 */
void
pmap_page_assertzero(vm_paddr_t phys)
{
	vm_offset_t va = PHYS_TO_DMAP(phys);
	size_t i;

	for (i = 0; i < PAGE_SIZE; i += sizeof(long)) {
		if (*(long *)((char *)va + i) != 0) {
			panic("pmap_page_assertzero() @ %p not zero!\n",
			      (void *)(intptr_t)va);
		}
	}
}

/*
 * pmap_zero_page_area:
 *
 *	Zero part of a physical page by mapping it into memory and
 *	clearing its contents with bzero.
 *
 *	off and size may not cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_paddr_t phys, int off, int size)
{
	vm_offset_t virt = PHYS_TO_DMAP(phys);

	bzero((char *)virt + off, size);
}

/*
 * pmap_copy_page:
 *
 *	Copy the physical page from the source PA to the target PA.
 *	This function may be called from an interrupt.  No locking is
 *	required.
 */
void
pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
{
	vm_offset_t src_virt, dst_virt;

	src_virt = PHYS_TO_DMAP(src);
	dst_virt = PHYS_TO_DMAP(dst);
	bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
}

/*
 * pmap_copy_page_frag:
 *
 *	Copy a fragment of the physical page from the source PA to the
 *	target PA.  This function may be called from an interrupt.  No
 *	locking is required.
 */
void
pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
{
	vm_offset_t src_virt, dst_virt;

	src_virt = PHYS_TO_DMAP(src);
	dst_virt = PHYS_TO_DMAP(dst);

	bcopy((char *)src_virt + (src & PAGE_MASK),
	      (char *)dst_virt + (dst & PAGE_MASK),
	      bytes);
}

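/*
 * Editorial note (illustrative sketch, not part of the original source):
 * All of the page zero/copy helpers above lean on the x86_64 direct
 * map: PHYS_TO_DMAP(pa) converts a physical address straight to a
 * kernel virtual address, so no temporary mappings or TLB shootdowns
 * are needed.  E.g. zeroing a freshly allocated page is just:
 *
 *	pmap_zero_page(VM_PAGE_TO_PHYS(m));
 *
 * which resolves to pagezero() on the DMAP alias of the page.
 */
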
3275 */ 3276 boolean_t 3277 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3278 { 3279 pv_entry_t pv; 3280 int loops = 0; 3281 3282 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3283 return FALSE; 3284 3285 crit_enter(); 3286 lwkt_gettoken(&vm_token); 3287 3288 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3289 if (pv->pv_pmap == pmap) { 3290 lwkt_reltoken(&vm_token); 3291 crit_exit(); 3292 return TRUE; 3293 } 3294 loops++; 3295 if (loops >= 16) 3296 break; 3297 } 3298 lwkt_reltoken(&vm_token); 3299 crit_exit(); 3300 return (FALSE); 3301 } 3302 3303 /* 3304 * Remove all pages from specified address space 3305 * this aids process exit speeds. Also, this code 3306 * is special cased for current process only, but 3307 * can have the more generic (and slightly slower) 3308 * mode enabled. This is much faster than pmap_remove 3309 * in the case of running down an entire address space. 3310 */ 3311 void 3312 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3313 { 3314 struct lwp *lp; 3315 pt_entry_t *pte, tpte; 3316 pv_entry_t pv, npv; 3317 vm_page_t m; 3318 pmap_inval_info info; 3319 int iscurrentpmap; 3320 int save_generation; 3321 3322 lp = curthread->td_lwp; 3323 if (lp && pmap == vmspace_pmap(lp->lwp_vmspace)) 3324 iscurrentpmap = 1; 3325 else 3326 iscurrentpmap = 0; 3327 3328 lwkt_gettoken(&vm_token); 3329 pmap_inval_init(&info); 3330 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 3331 if (pv->pv_va >= eva || pv->pv_va < sva) { 3332 npv = TAILQ_NEXT(pv, pv_plist); 3333 continue; 3334 } 3335 3336 KKASSERT(pmap == pv->pv_pmap); 3337 3338 if (iscurrentpmap) 3339 pte = vtopte(pv->pv_va); 3340 else 3341 pte = pmap_pte_quick(pmap, pv->pv_va); 3342 pmap_inval_interlock(&info, pmap, pv->pv_va); 3343 3344 /* 3345 * We cannot remove wired pages from a process' mapping 3346 * at this time 3347 */ 3348 if (*pte & PG_W) { 3349 pmap_inval_deinterlock(&info, pmap); 3350 npv = TAILQ_NEXT(pv, pv_plist); 3351 continue; 3352 } 3353 tpte = pte_load_clear(pte); 3354 3355 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3356 3357 KASSERT(m < &vm_page_array[vm_page_array_size], 3358 ("pmap_remove_pages: bad tpte %lx", tpte)); 3359 3360 KKASSERT(pmap->pm_stats.resident_count > 0); 3361 --pmap->pm_stats.resident_count; 3362 pmap_inval_deinterlock(&info, pmap); 3363 3364 /* 3365 * Update the vm_page_t clean and reference bits. 3366 */ 3367 if (tpte & PG_M) { 3368 vm_page_dirty(m); 3369 } 3370 3371 npv = TAILQ_NEXT(pv, pv_plist); 3372 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 3373 save_generation = ++pmap->pm_generation; 3374 3375 m->md.pv_list_count--; 3376 m->object->agg_pv_list_count--; 3377 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3378 if (TAILQ_EMPTY(&m->md.pv_list)) 3379 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3380 3381 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info); 3382 free_pv_entry(pv); 3383 3384 /* 3385 * Restart the scan if we blocked during the unuse or free 3386 * calls and other removals were made. 3387 */ 3388 if (save_generation != pmap->pm_generation) { 3389 kprintf("Warning: pmap_remove_pages race-A avoided\n"); 3390 npv = TAILQ_FIRST(&pmap->pm_pvlist); 3391 } 3392 } 3393 pmap_inval_done(&info); 3394 lwkt_reltoken(&vm_token); 3395 } 3396 3397 /* 3398 * pmap_testbit tests bits in pte's 3399 * note that the testbit/clearbit routines are inline, 3400 * and a lot of things compile-time evaluate. 
3401 */ 3402 static 3403 boolean_t 3404 pmap_testbit(vm_page_t m, int bit) 3405 { 3406 pv_entry_t pv; 3407 pt_entry_t *pte; 3408 3409 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3410 return FALSE; 3411 3412 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 3413 return FALSE; 3414 3415 crit_enter(); 3416 3417 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3418 /* 3419 * if the bit being tested is the modified bit, then 3420 * mark clean_map and ptes as never 3421 * modified. 3422 */ 3423 if (bit & (PG_A|PG_M)) { 3424 if (!pmap_track_modified(pv->pv_va)) 3425 continue; 3426 } 3427 3428 #if defined(PMAP_DIAGNOSTIC) 3429 if (pv->pv_pmap == NULL) { 3430 kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); 3431 continue; 3432 } 3433 #endif 3434 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3435 if (*pte & bit) { 3436 crit_exit(); 3437 return TRUE; 3438 } 3439 } 3440 crit_exit(); 3441 return (FALSE); 3442 } 3443 3444 /* 3445 * this routine is used to modify bits in ptes 3446 */ 3447 static __inline 3448 void 3449 pmap_clearbit(vm_page_t m, int bit) 3450 { 3451 struct pmap_inval_info info; 3452 pv_entry_t pv; 3453 pt_entry_t *pte; 3454 pt_entry_t pbits; 3455 3456 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3457 return; 3458 3459 pmap_inval_init(&info); 3460 3461 /* 3462 * Loop over all current mappings setting/clearing as appropos If 3463 * setting RO do we need to clear the VAC? 3464 */ 3465 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3466 /* 3467 * don't write protect pager mappings 3468 */ 3469 if (bit == PG_RW) { 3470 if (!pmap_track_modified(pv->pv_va)) 3471 continue; 3472 } 3473 3474 #if defined(PMAP_DIAGNOSTIC) 3475 if (pv->pv_pmap == NULL) { 3476 kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); 3477 continue; 3478 } 3479 #endif 3480 3481 /* 3482 * Careful here. We can use a locked bus instruction to 3483 * clear PG_A or PG_M safely but we need to synchronize 3484 * with the target cpus when we mess with PG_RW. 3485 * 3486 * We do not have to force synchronization when clearing 3487 * PG_M even for PTEs generated via virtual memory maps, 3488 * because the virtual kernel will invalidate the pmap 3489 * entry when/if it needs to resynchronize the Modify bit. 3490 */ 3491 if (bit & PG_RW) 3492 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); 3493 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3494 again: 3495 pbits = *pte; 3496 if (pbits & bit) { 3497 if (bit == PG_RW) { 3498 if (pbits & PG_M) { 3499 vm_page_dirty(m); 3500 atomic_clear_long(pte, PG_M|PG_RW); 3501 } else { 3502 /* 3503 * The cpu may be trying to set PG_M 3504 * simultaniously with our clearing 3505 * of PG_RW. 3506 */ 3507 if (!atomic_cmpset_long(pte, pbits, 3508 pbits & ~PG_RW)) 3509 goto again; 3510 } 3511 } else if (bit == PG_M) { 3512 /* 3513 * We could also clear PG_RW here to force 3514 * a fault on write to redetect PG_M for 3515 * virtual kernels, but it isn't necessary 3516 * since virtual kernels invalidate the pte 3517 * when they clear the VPTE_M bit in their 3518 * virtual page tables. 3519 */ 3520 atomic_clear_long(pte, PG_M); 3521 } else { 3522 atomic_clear_long(pte, bit); 3523 } 3524 } 3525 if (bit & PG_RW) 3526 pmap_inval_deinterlock(&info, pv->pv_pmap); 3527 } 3528 pmap_inval_done(&info); 3529 } 3530 3531 /* 3532 * pmap_page_protect: 3533 * 3534 * Lower the permission for all mappings to a given page. 3535 */ 3536 void 3537 pmap_page_protect(vm_page_t m, vm_prot_t prot) 3538 { 3539 /* JG NX support? 
/*
 * pmap_page_protect:
 *
 *	Lower the permission for all mappings to a given page.
 */
void
pmap_page_protect(vm_page_t m, vm_prot_t prot)
{
	/* JG NX support? */
	if ((prot & VM_PROT_WRITE) == 0) {
		lwkt_gettoken(&vm_token);
		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
			pmap_clearbit(m, PG_RW);
			vm_page_flag_clear(m, PG_WRITEABLE);
		} else {
			pmap_remove_all(m);
		}
		lwkt_reltoken(&vm_token);
	}
}

vm_paddr_t
pmap_phys_address(vm_pindex_t ppn)
{
	return (x86_64_ptob(ppn));
}

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those
 *	bits.  It is not necessary for every reference bit to be
 *	cleared, but it is necessary that 0 only be returned when there
 *	are truly no reference bits set.
 *
 *	XXX: The exact number of bits to check and clear is a matter
 *	that should be tested and standardized at some point in the
 *	future for optimal aging of shared pages.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	pv_entry_t pv, pvf, pvn;
	pt_entry_t *pte;
	int rtval = 0;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return (rtval);

	crit_enter();
	lwkt_gettoken(&vm_token);

	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pvf = pv;

		do {
			pvn = TAILQ_NEXT(pv, pv_list);

			crit_enter();
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
			crit_exit();

			if (!pmap_track_modified(pv->pv_va))
				continue;

			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);

			if (pte && (*pte & PG_A)) {
#ifdef SMP
				atomic_clear_long(pte, PG_A);
#else
				atomic_clear_long_nonlocked(pte, PG_A);
#endif
				rtval++;
				if (rtval > 4)
					break;
			}
		} while ((pv = pvn) != NULL && pv != pvf);
	}
	lwkt_reltoken(&vm_token);
	crit_exit();

	return (rtval);
}

/*
 * pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
	boolean_t res;

	lwkt_gettoken(&vm_token);
	res = pmap_testbit(m, PG_M);
	lwkt_reltoken(&vm_token);
	return (res);
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	lwkt_gettoken(&vm_token);
	pmap_clearbit(m, PG_M);
	lwkt_reltoken(&vm_token);
}

/*
 * pmap_clear_reference:
 *
 *	Clear the reference bit on the specified physical page.
 */
void
pmap_clear_reference(vm_page_t m)
{
	lwkt_gettoken(&vm_token);
	pmap_clearbit(m, PG_A);
	lwkt_reltoken(&vm_token);
}

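/*
 * Editorial note (illustrative sketch, not part of the original source):
 * A hypothetical pageout-scan consumer of the routines above would age
 * a page roughly like this:
 *
 *	refs = pmap_ts_referenced(m);	// counts and clears PG_A,
 *					// capped after ~5 pv entries
 *	if (refs == 0 && !pmap_is_modified(m))
 *		... page is a reclaim candidate ...
 *	else
 *		pmap_clear_modify(m);	// reset PG_M, rescan later
 */
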
3678 */ 3679 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 3680 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 3681 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 3682 *kp++ = 0; 3683 break; 3684 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 3685 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 3686 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 3687 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 3688 *kp++ = PG_RW; 3689 break; 3690 } 3691 } 3692 } 3693 3694 /* 3695 * Map a set of physical memory pages into the kernel virtual 3696 * address space. Return a pointer to where it is mapped. This 3697 * routine is intended to be used for mapping device memory, 3698 * NOT real memory. 3699 * 3700 * NOTE: we can't use pgeflag unless we invalidate the pages one at 3701 * a time. 3702 */ 3703 void * 3704 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 3705 { 3706 vm_offset_t va, tmpva, offset; 3707 pt_entry_t *pte; 3708 3709 offset = pa & PAGE_MASK; 3710 size = roundup(offset + size, PAGE_SIZE); 3711 3712 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3713 if (va == 0) 3714 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3715 3716 pa = pa & ~PAGE_MASK; 3717 for (tmpva = va; size > 0;) { 3718 pte = vtopte(tmpva); 3719 *pte = pa | PG_RW | PG_V; /* | pgeflag; */ 3720 size -= PAGE_SIZE; 3721 tmpva += PAGE_SIZE; 3722 pa += PAGE_SIZE; 3723 } 3724 cpu_invltlb(); 3725 smp_invltlb(); 3726 3727 return ((void *)(va + offset)); 3728 } 3729 3730 void * 3731 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 3732 { 3733 vm_offset_t va, tmpva, offset; 3734 pt_entry_t *pte; 3735 3736 offset = pa & PAGE_MASK; 3737 size = roundup(offset + size, PAGE_SIZE); 3738 3739 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3740 if (va == 0) 3741 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3742 3743 pa = pa & ~PAGE_MASK; 3744 for (tmpva = va; size > 0;) { 3745 pte = vtopte(tmpva); 3746 *pte = pa | PG_RW | PG_V | PG_N; /* | pgeflag; */ 3747 size -= PAGE_SIZE; 3748 tmpva += PAGE_SIZE; 3749 pa += PAGE_SIZE; 3750 } 3751 cpu_invltlb(); 3752 smp_invltlb(); 3753 3754 return ((void *)(va + offset)); 3755 } 3756 3757 void 3758 pmap_unmapdev(vm_offset_t va, vm_size_t size) 3759 { 3760 vm_offset_t base, offset; 3761 3762 base = va & ~PAGE_MASK; 3763 offset = va & PAGE_MASK; 3764 size = roundup(offset + size, PAGE_SIZE); 3765 pmap_qremove(va, size >> PAGE_SHIFT); 3766 kmem_free(&kernel_map, base, size); 3767 } 3768 3769 /* 3770 * perform the pmap work for mincore 3771 */ 3772 int 3773 pmap_mincore(pmap_t pmap, vm_offset_t addr) 3774 { 3775 pt_entry_t *ptep, pte; 3776 vm_page_t m; 3777 int val = 0; 3778 3779 lwkt_gettoken(&vm_token); 3780 ptep = pmap_pte(pmap, addr); 3781 3782 if (ptep && (pte = *ptep) != 0) { 3783 vm_offset_t pa; 3784 3785 val = MINCORE_INCORE; 3786 if ((pte & PG_MANAGED) == 0) 3787 goto done; 3788 3789 pa = pte & PG_FRAME; 3790 3791 m = PHYS_TO_VM_PAGE(pa); 3792 3793 /* 3794 * Modified by us 3795 */ 3796 if (pte & PG_M) 3797 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 3798 /* 3799 * Modified by someone 3800 */ 3801 else if (m->dirty || pmap_is_modified(m)) 3802 val |= MINCORE_MODIFIED_OTHER; 3803 /* 3804 * Referenced by us 3805 */ 3806 if (pte & PG_A) 3807 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 3808 3809 /* 3810 * Referenced by someone 3811 */ 3812 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { 3813 val |= MINCORE_REFERENCED_OTHER; 3814 vm_page_flag_set(m, PG_REFERENCED); 3815 } 3816 } 3817 done: 3818 
/*
 * Perform the pmap work for mincore.
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr)
{
	pt_entry_t *ptep, pte;
	vm_page_t m;
	int val = 0;

	lwkt_gettoken(&vm_token);
	ptep = pmap_pte(pmap, addr);

	if (ptep && (pte = *ptep) != 0) {
		vm_offset_t pa;

		val = MINCORE_INCORE;
		if ((pte & PG_MANAGED) == 0)
			goto done;

		pa = pte & PG_FRAME;

		m = PHYS_TO_VM_PAGE(pa);

		/*
		 * Modified by us
		 */
		if (pte & PG_M)
			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
		/*
		 * Modified by someone
		 */
		else if (m->dirty || pmap_is_modified(m))
			val |= MINCORE_MODIFIED_OTHER;
		/*
		 * Referenced by us
		 */
		if (pte & PG_A)
			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
		/*
		 * Referenced by someone
		 */
		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
			val |= MINCORE_REFERENCED_OTHER;
			vm_page_flag_set(m, PG_REFERENCED);
		}
	}
done:
	lwkt_reltoken(&vm_token);
	return val;
}

/*
 * Replace p->p_vmspace with a new one.  If adjrefs is non-zero the new
 * vmspace will be ref'd and the old one will be deref'd.
 *
 * The vmspace for all lwps associated with the process will be adjusted
 * and cr3 will be reloaded if any lwp is the current lwp.
 */
void
pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
{
	struct vmspace *oldvm;
	struct lwp *lp;

	crit_enter();
	oldvm = p->p_vmspace;
	if (oldvm != newvm) {
		p->p_vmspace = newvm;
		KKASSERT(p->p_nthreads == 1);
		lp = RB_ROOT(&p->p_lwp_tree);
		pmap_setlwpvm(lp, newvm);
		if (adjrefs) {
			sysref_get(&newvm->vm_sysref);
			sysref_put(&oldvm->vm_sysref);
		}
	}
	crit_exit();
}

/*
 * Set the vmspace for a LWP.  The vmspace is almost universally set
 * the same as the process vmspace, but virtual kernels need to swap
 * out contexts on a per-lwp basis.
 */
void
pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
{
	struct vmspace *oldvm;
	struct pmap *pmap;

	crit_enter();
	oldvm = lp->lwp_vmspace;

	if (oldvm != newvm) {
		lp->lwp_vmspace = newvm;
		if (curthread->td_lwp == lp) {
			pmap = vmspace_pmap(newvm);
#if defined(SMP)
			atomic_set_cpumask(&pmap->pm_active,
					   mycpu->gd_cpumask);
			if (pmap->pm_active & CPUMASK_LOCK)
				pmap_interlock_wait(newvm);
#else
			pmap->pm_active |= 1;
#endif
#if defined(SWTCH_OPTIM_STATS)
			tlb_flush_count++;
#endif
			curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
			curthread->td_pcb->pcb_cr3 |= PG_RW | PG_U | PG_V;
			load_cr3(curthread->td_pcb->pcb_cr3);
			pmap = vmspace_pmap(oldvm);
#if defined(SMP)
			atomic_clear_cpumask(&pmap->pm_active,
					     mycpu->gd_cpumask);
#else
			pmap->pm_active &= ~(cpumask_t)1;
#endif
		}
	}
	crit_exit();
}

#ifdef SMP

/*
 * Called when switching to a locked pmap.
 */
void
pmap_interlock_wait(struct vmspace *vm)
{
	struct pmap *pmap = &vm->vm_pmap;

	if (pmap->pm_active & CPUMASK_LOCK) {
		while (pmap->pm_active & CPUMASK_LOCK) {
			cpu_pause();
			cpu_ccfence();
			lwkt_process_ipiq();
		}
	}
}

#endif

vm_offset_t
pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
{
	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE))
		return addr;

	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return addr;
}

/*
 * Used by kmalloc/kfree, page already exists at va.
 */
vm_page_t
pmap_kvtom(vm_offset_t va)
{
	return (PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME));
}