1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * Copyright (c) 1994 John S. Dyson 6 * Copyright (c) 1994 David Greenman 7 * Copyright (c) 2003 Peter Wemm 8 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 9 * Copyright (c) 2008, 2009 The DragonFly Project. 10 * Copyright (c) 2008, 2009 Jordan Gordeev. 11 * All rights reserved. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * the Systems Programming Group of the University of Utah Computer 15 * Science Department and William Jolitz of UUNET Technologies Inc. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. All advertising materials mentioning features or use of this software 26 * must display the following acknowledgement: 27 * This product includes software developed by the University of 28 * California, Berkeley and its contributors. 29 * 4. Neither the name of the University nor the names of its contributors 30 * may be used to endorse or promote products derived from this software 31 * without specific prior written permission. 32 * 33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 43 * SUCH DAMAGE. 44 * 45 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 46 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $ 47 */ 48 49 /* 50 * Manages physical address maps. 51 * 52 * In addition to hardware address maps, this 53 * module is called upon to provide software-use-only 54 * maps which may or may not be stored in the same 55 * form as hardware maps. These pseudo-maps are 56 * used to store intermediate results from copy 57 * operations to and from address spaces. 58 * 59 * Since the information managed by this module is 60 * also stored by the logical address mapping module, 61 * this module may throw away valid virtual-to-physical 62 * mappings at almost any time. However, invalidations 63 * of virtual-to-physical mappings must be done as 64 * requested. 65 * 66 * In order to cope with hardware architectures which 67 * make virtual-to-physical map invalidates expensive, 68 * this module may delay invalidate or reduced protection 69 * operations until such time as they are actually 70 * necessary. This module is given full information as 71 * to which processors are currently using which maps, 72 * and to when physical maps must be made correct. 
73 */ 74 75 #if JG 76 #include "opt_disable_pse.h" 77 #include "opt_pmap.h" 78 #endif 79 #include "opt_msgbuf.h" 80 81 #include <sys/param.h> 82 #include <sys/systm.h> 83 #include <sys/kernel.h> 84 #include <sys/proc.h> 85 #include <sys/msgbuf.h> 86 #include <sys/vmmeter.h> 87 #include <sys/mman.h> 88 89 #include <vm/vm.h> 90 #include <vm/vm_param.h> 91 #include <sys/sysctl.h> 92 #include <sys/lock.h> 93 #include <vm/vm_kern.h> 94 #include <vm/vm_page.h> 95 #include <vm/vm_map.h> 96 #include <vm/vm_object.h> 97 #include <vm/vm_extern.h> 98 #include <vm/vm_pageout.h> 99 #include <vm/vm_pager.h> 100 #include <vm/vm_zone.h> 101 102 #include <sys/user.h> 103 #include <sys/thread2.h> 104 #include <sys/sysref2.h> 105 106 #include <machine/cputypes.h> 107 #include <machine/md_var.h> 108 #include <machine/specialreg.h> 109 #include <machine/smp.h> 110 #include <machine_base/apic/apicreg.h> 111 #include <machine/globaldata.h> 112 #include <machine/pmap.h> 113 #include <machine/pmap_inval.h> 114 115 #include <ddb/ddb.h> 116 117 #define PMAP_KEEP_PDIRS 118 #ifndef PMAP_SHPGPERPROC 119 #define PMAP_SHPGPERPROC 200 120 #endif 121 122 #if defined(DIAGNOSTIC) 123 #define PMAP_DIAGNOSTIC 124 #endif 125 126 #define MINPV 2048 127 128 /* 129 * Get PDEs and PTEs for user/kernel address space 130 */ 131 static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va); 132 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 133 134 #define pmap_pde_v(pte) ((*(pd_entry_t *)pte & PG_V) != 0) 135 #define pmap_pte_w(pte) ((*(pt_entry_t *)pte & PG_W) != 0) 136 #define pmap_pte_m(pte) ((*(pt_entry_t *)pte & PG_M) != 0) 137 #define pmap_pte_u(pte) ((*(pt_entry_t *)pte & PG_A) != 0) 138 #define pmap_pte_v(pte) ((*(pt_entry_t *)pte & PG_V) != 0) 139 140 141 /* 142 * Given a map and a machine independent protection code, 143 * convert to a vax protection code. 144 */ 145 #define pte_prot(m, p) \ 146 (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 147 static int protection_codes[8]; 148 149 struct pmap kernel_pmap; 150 static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list); 151 152 vm_paddr_t avail_start; /* PA of first available physical page */ 153 vm_paddr_t avail_end; /* PA of last available physical page */ 154 vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ 155 vm_offset_t virtual2_end; 156 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */ 157 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 158 vm_offset_t KvaStart; /* VA start of KVA space */ 159 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */ 160 vm_offset_t KvaSize; /* max size of kernel virtual address space */ 161 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
 */
static int pgeflag;		/* PG_G or-in */
static int pseflag;		/* PG_PS or-in */

static vm_object_t kptobj;

static int ndmpdp;
static vm_paddr_t dmaplimit;
static int nkpt;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;

static uint64_t KPTbase;
static uint64_t KPTphys;
static uint64_t KPDphys;	/* phys addr of kernel level 2 */
static uint64_t KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
uint64_t KPDPphys;		/* phys addr of kernel level 3 */
uint64_t KPML4phys;		/* phys addr of kernel level 4 */

static uint64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static struct vm_object pvzone_obj;
static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0, *ptmmap;
caddr_t CADDR1 = 0, ptvmmap = 0;
static pt_entry_t *msgbufmap;
struct msgbuf *msgbufp=0;

/*
 * Crashdump maps.
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

#define DISABLE_PSE

static pv_entry_t get_pv_entry (void);
static void i386_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
static int  pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
				vm_offset_t sva, pmap_inval_info_t info);
static void pmap_remove_page (struct pmap *pmap,
				vm_offset_t va, pmap_inval_info_t info);
static int  pmap_remove_entry (struct pmap *pmap, vm_page_t m,
				vm_offset_t va, pmap_inval_info_t info);
static boolean_t pmap_testbit (vm_page_t m, int bit);
static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
				vm_page_t mpte, vm_page_t m);

static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);

static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va);
static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
				pmap_inval_info_t info);
static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

static unsigned pdir4mb;

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static
vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return newaddr;
}

/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many
 *	of the pv list scans are across different pmaps and it is very
 *	wasteful to do an entire invltlb when checking a single mapping.
 *
 *	Should only be called while in a critical section.
 */
static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);

static
pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	return pmap_pte(pmap, va);
}

/* Return a non-clipped PD index for a given VA */
static __inline
vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return va >> PDRSHIFT;
}

/* Return various clipped indexes for a given VA */
static __inline
vm_pindex_t
pmap_pte_index(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pde_index(vm_offset_t va)
{
	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{
	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline
vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{
	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline
pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline
pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline
pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;

	pml4e = pmap_pml4e(pmap, va);
	if ((*pml4e & PG_V) == 0)
		return NULL;
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline
pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline
pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		 return NULL;
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline
pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return NULL;
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}

static __inline
pt_entry_t *
vtopte(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static __inline
pd_entry_t *
vtopde(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
				  NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}
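
/*
 * Illustrative sketch only (not compiled): how the index helpers above
 * carve a canonical virtual address into the four page table indexes.
 * Each level consumes 9 bits above the 12-bit page offset, which is
 * exactly what pmap_pte()/vtopte() walk.  The function name
 * va_breakdown is hypothetical and exists only for this sketch.
 */
#if 0
static void
va_breakdown(vm_offset_t va)
{
	kprintf("va %016jx: pml4 %ju pdp %ju pd %ju pt %ju offset %ju\n",
		(uintmax_t)va,
		(uintmax_t)pmap_pml4e_index(va),	/* bits 47..39 */
		(uintmax_t)pmap_pdpe_index(va),		/* bits 38..30 */
		(uintmax_t)pmap_pde_index(va),		/* bits 29..21 */
		(uintmax_t)pmap_pte_index(va),		/* bits 20..12 */
		(uintmax_t)(va & PAGE_MASK));		/* bits 11..0  */
}
#endif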
static uint64_t
allocpages(vm_paddr_t *firstaddr, long n)
{
	uint64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

static
void
create_pagetables(vm_paddr_t *firstaddr)
{
	long i;		/* must be 64 bits */
	long nkpt_base;
	long nkpt_phys;

	/*
	 * We are running (mostly) V=P at this point
	 *
	 * Calculate NKPT - number of kernel page tables.  We have to
	 * accommodate preallocation of the vm_page_array, dump bitmap,
	 * MSGBUF_SIZE, and other stuff.  Be generous.
	 *
	 * Maxmem is in pages.
	 */
	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;

	/*
	 * Starting at the beginning of kvm (not KERNBASE).
	 */
	nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
	nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
	nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E + ndmpdp) +
		      511) / 512;
	nkpt_phys += 128;

	/*
	 * Starting at KERNBASE - map 2G worth of page table pages.
	 * KERNBASE is offset -2G from the end of kvm.
	 */
	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */

	/*
	 * Allocate pages
	 */
	KPTbase = allocpages(firstaddr, nkpt_base);
	KPTphys = allocpages(firstaddr, nkpt_phys);
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
	KPDphys = allocpages(firstaddr, NKPDPE);

	/*
	 * Calculate the page directory base for KERNBASE,
	 * that is where we start populating the page table pages.
	 * Basically this is the end - 2.
	 */
	KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);

	DMPDPphys = allocpages(firstaddr, NDMPML4E);
	if ((amd_feature & AMDID_PAGE1GB) == 0)
		DMPDphys = allocpages(firstaddr, ndmpdp);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/*
	 * Fill in the underlying page table pages for the area around
	 * KERNBASE.  This remaps low physical memory to KERNBASE.
	 *
	 * Read-only from zero to physfree
	 * XXX not fully used, underneath 2M pages
	 */
	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
		((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTbase)[i] |= PG_RW | PG_V | PG_G;
	}

	/*
	 * Now map the initial kernel page tables.  One block of page
	 * tables is placed at the beginning of kernel virtual memory,
	 * and another block is placed at KERNBASE to map the kernel binary,
	 * data, bss, and initial pre-allocations.
	 */
	for (i = 0; i < nkpt_base; i++) {
		((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V;
	}
	for (i = 0; i < nkpt_phys; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
	}

	/*
	 * Map from zero to end of allocations using 2M pages as an
	 * optimization.  This will bypass some of the KPTBase pages
	 * above in the KERNBASE area.
	 */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
		((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V | PG_PS | PG_G;
	}

	/*
	 * And connect up the PD to the PDP.  The kernel pmap is expected
	 * to pre-populate all of its PDs.  See NKPDPE in vmparam.h.
	 */
	for (i = 0; i < NKPDPE; i++) {
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
				KPDphys + (i << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |=
				PG_RW | PG_V | PG_U;
	}

	/* Now set up the direct map space using either 2MB or 1GB pages */
	/* Preset PG_M and PG_A because demotion expects it */
	if ((amd_feature & AMDID_PAGE1GB) == 0) {
		for (i = 0; i < NPDEPG * ndmpdp; i++) {
			((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
			((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
						       PG_G | PG_M | PG_A;
		}
		/* And the direct map space's PDP */
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
							(i << PAGE_SHIFT);
			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
		}
	} else {
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] =
						(vm_paddr_t)i << PDPSHIFT;
			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
							 PG_G | PG_M | PG_A;
		}
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;

	/* Connect the Direct Map slot up to the PML4 */
	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;

	/* Connect the KVA slot up to the PML4 */
	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
}
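
/*
 * Illustrative sketch only (not compiled): how the direct map sizing in
 * create_pagetables() above falls out.  Each PDP entry spans NBPDP (1GB),
 * so ndmpdp is physical memory rounded up to 1GB units with a 4GB floor,
 * and dmaplimit is the first physical address the direct map does not
 * cover.  The function name dmap_pdp_count is hypothetical and exists
 * only for this sketch.
 */
#if 0
static long
dmap_pdp_count(vm_paddr_t physmem_bytes)
{
	long n;

	n = (physmem_bytes + NBPDP - 1) >> PDPSHIFT; /* round up to 1GB units */
	if (n < 4)				     /* minimum 4GB of dirmap */
		n = 4;
	return (n);	/* e.g. 6GB of RAM -> 6 entries, dmaplimit = 6GB */
}
#endif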
/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On the i386 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	struct mdglobaldata *gd;
	int pg;

	KvaStart = VM_MIN_KERNEL_ADDRESS;
	KvaEnd = VM_MAX_KERNEL_ADDRESS;
	KvaSize = KvaEnd - KvaStart;

	avail_start = *firstaddr;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	virtual2_start = KvaStart;
	virtual2_end = PTOV_OFFSET;

	virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
	virtual_start = pmap_kmem_choose(virtual_start);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	load_cr3(KPML4phys);

	/*
	 * Initialize protection array.
	 */
	i386_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence (XXX and which no longer exists).
	 */
	kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
	kernel_pmap.pm_count = 1;
	kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
	TAILQ_INIT(&kernel_pmap.pm_pvlist);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
623 */ 624 #define SYSMAP(c, p, v, n) \ 625 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 626 627 va = virtual_start; 628 pte = vtopte(va); 629 630 /* 631 * CMAP1/CMAP2 are used for zeroing and copying pages. 632 */ 633 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 634 635 /* 636 * Crashdump maps. 637 */ 638 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 639 640 /* 641 * ptvmmap is used for reading arbitrary physical pages via 642 * /dev/mem. 643 */ 644 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 645 646 /* 647 * msgbufp is used to map the system message buffer. 648 * XXX msgbufmap is not used. 649 */ 650 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 651 atop(round_page(MSGBUF_SIZE))) 652 653 virtual_start = va; 654 655 *CMAP1 = 0; 656 657 /* 658 * PG_G is terribly broken on SMP because we IPI invltlb's in some 659 * cases rather then invl1pg. Actually, I don't even know why it 660 * works under UP because self-referential page table mappings 661 */ 662 #ifdef SMP 663 pgeflag = 0; 664 #else 665 if (cpu_feature & CPUID_PGE) 666 pgeflag = PG_G; 667 #endif 668 669 /* 670 * Initialize the 4MB page size flag 671 */ 672 pseflag = 0; 673 /* 674 * The 4MB page version of the initial 675 * kernel page mapping. 676 */ 677 pdir4mb = 0; 678 679 #if !defined(DISABLE_PSE) 680 if (cpu_feature & CPUID_PSE) { 681 pt_entry_t ptditmp; 682 /* 683 * Note that we have enabled PSE mode 684 */ 685 pseflag = PG_PS; 686 ptditmp = *(PTmap + x86_64_btop(KERNBASE)); 687 ptditmp &= ~(NBPDR - 1); 688 ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; 689 pdir4mb = ptditmp; 690 691 #ifndef SMP 692 /* 693 * Enable the PSE mode. If we are SMP we can't do this 694 * now because the APs will not be able to use it when 695 * they boot up. 696 */ 697 load_cr4(rcr4() | CR4_PSE); 698 699 /* 700 * We can do the mapping here for the single processor 701 * case. We simply ignore the old page table page from 702 * now on. 703 */ 704 /* 705 * For SMP, we still need 4K pages to bootstrap APs, 706 * PSE will be enabled as soon as all APs are up. 707 */ 708 PTD[KPTDI] = (pd_entry_t)ptditmp; 709 cpu_invltlb(); 710 #endif 711 } 712 #endif 713 714 /* 715 * We need to finish setting up the globaldata page for the BSP. 716 * locore has already populated the page table for the mdglobaldata 717 * portion. 718 */ 719 pg = MDGLOBALDATA_BASEALLOC_PAGES; 720 gd = &CPU_prvspace[0].mdglobaldata; 721 722 cpu_invltlb(); 723 } 724 725 #ifdef SMP 726 /* 727 * Set 4mb pdir for mp startup 728 */ 729 void 730 pmap_set_opt(void) 731 { 732 if (pseflag && (cpu_feature & CPUID_PSE)) { 733 load_cr4(rcr4() | CR4_PSE); 734 if (pdir4mb && mycpu->gd_cpuid == 0) { /* only on BSP */ 735 cpu_invltlb(); 736 } 737 } 738 } 739 #endif 740 741 /* 742 * Initialize the pmap module. 743 * Called by vm_init, to initialize any structures that the pmap 744 * system needs to map virtual memory. 745 * pmap_init has been enhanced to support in a fairly consistant 746 * way, discontiguous physical memory. 747 */ 748 void 749 pmap_init(void) 750 { 751 int i; 752 int initial_pvs; 753 754 /* 755 * object for kernel page table pages 756 */ 757 /* JG I think the number can be arbitrary */ 758 kptobj = vm_object_allocate(OBJT_DEFAULT, 5); 759 760 /* 761 * Allocate memory for random pmap data structures. Includes the 762 * pv_head_table. 
763 */ 764 765 for(i = 0; i < vm_page_array_size; i++) { 766 vm_page_t m; 767 768 m = &vm_page_array[i]; 769 TAILQ_INIT(&m->md.pv_list); 770 m->md.pv_list_count = 0; 771 } 772 773 /* 774 * init the pv free list 775 */ 776 initial_pvs = vm_page_array_size; 777 if (initial_pvs < MINPV) 778 initial_pvs = MINPV; 779 pvzone = &pvzone_store; 780 pvinit = (void *)kmem_alloc(&kernel_map, 781 initial_pvs * sizeof (struct pv_entry)); 782 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), 783 pvinit, initial_pvs); 784 785 /* 786 * Now it is safe to enable pv_table recording. 787 */ 788 pmap_initialized = TRUE; 789 } 790 791 /* 792 * Initialize the address space (zone) for the pv_entries. Set a 793 * high water mark so that the system can recover from excessive 794 * numbers of pv entries. 795 */ 796 void 797 pmap_init2(void) 798 { 799 int shpgperproc = PMAP_SHPGPERPROC; 800 int entry_max; 801 802 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 803 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 804 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 805 pv_entry_high_water = 9 * (pv_entry_max / 10); 806 807 /* 808 * Subtract out pages already installed in the zone (hack) 809 */ 810 entry_max = pv_entry_max - vm_page_array_size; 811 if (entry_max <= 0) 812 entry_max = 1; 813 814 zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT, 1); 815 } 816 817 818 /*************************************************** 819 * Low level helper routines..... 820 ***************************************************/ 821 822 #if defined(PMAP_DIAGNOSTIC) 823 824 /* 825 * This code checks for non-writeable/modified pages. 826 * This should be an invalid condition. 827 */ 828 static 829 int 830 pmap_nw_modified(pt_entry_t pte) 831 { 832 if ((pte & (PG_M|PG_RW)) == PG_M) 833 return 1; 834 else 835 return 0; 836 } 837 #endif 838 839 840 /* 841 * this routine defines the region(s) of memory that should 842 * not be tested for the modified bit. 843 */ 844 static __inline 845 int 846 pmap_track_modified(vm_offset_t va) 847 { 848 if ((va < clean_sva) || (va >= clean_eva)) 849 return 1; 850 else 851 return 0; 852 } 853 854 /* 855 * Extract the physical page address associated with the map/VA pair. 856 * 857 * The caller must hold vm_token if non-blocking operation is desired. 858 */ 859 vm_paddr_t 860 pmap_extract(pmap_t pmap, vm_offset_t va) 861 { 862 vm_paddr_t rtval; 863 pt_entry_t *pte; 864 pd_entry_t pde, *pdep; 865 866 lwkt_gettoken(&vm_token); 867 rtval = 0; 868 pdep = pmap_pde(pmap, va); 869 if (pdep != NULL) { 870 pde = *pdep; 871 if (pde) { 872 if ((pde & PG_PS) != 0) { 873 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 874 } else { 875 pte = pmap_pde_to_pte(pdep, va); 876 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); 877 } 878 } 879 } 880 lwkt_reltoken(&vm_token); 881 return rtval; 882 } 883 884 /* 885 * Extract the physical page address associated kernel virtual address. 886 */ 887 vm_paddr_t 888 pmap_kextract(vm_offset_t va) 889 { 890 pd_entry_t pde; 891 vm_paddr_t pa; 892 893 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 894 pa = DMAP_TO_PHYS(va); 895 } else { 896 pde = *vtopde(va); 897 if (pde & PG_PS) { 898 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 899 } else { 900 /* 901 * Beware of a concurrent promotion that changes the 902 * PDE at this point! For example, vtopte() must not 903 * be used to access the PTE because it would use the 904 * new PDE. It is, however, safe to use the old PDE 905 * because the page table page is preserved by the 906 * promotion. 
907 */ 908 pa = *pmap_pde_to_pte(&pde, va); 909 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 910 } 911 } 912 return pa; 913 } 914 915 /*************************************************** 916 * Low level mapping routines..... 917 ***************************************************/ 918 919 /* 920 * Routine: pmap_kenter 921 * Function: 922 * Add a wired page to the KVA 923 * NOTE! note that in order for the mapping to take effect -- you 924 * should do an invltlb after doing the pmap_kenter(). 925 */ 926 void 927 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 928 { 929 pt_entry_t *pte; 930 pt_entry_t npte; 931 pmap_inval_info info; 932 933 pmap_inval_init(&info); 934 npte = pa | PG_RW | PG_V | pgeflag; 935 pte = vtopte(va); 936 pmap_inval_interlock(&info, &kernel_pmap, va); 937 *pte = npte; 938 pmap_inval_deinterlock(&info, &kernel_pmap); 939 pmap_inval_done(&info); 940 } 941 942 /* 943 * Routine: pmap_kenter_quick 944 * Function: 945 * Similar to pmap_kenter(), except we only invalidate the 946 * mapping on the current CPU. 947 */ 948 void 949 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 950 { 951 pt_entry_t *pte; 952 pt_entry_t npte; 953 954 npte = pa | PG_RW | PG_V | pgeflag; 955 pte = vtopte(va); 956 *pte = npte; 957 cpu_invlpg((void *)va); 958 } 959 960 void 961 pmap_kenter_sync(vm_offset_t va) 962 { 963 pmap_inval_info info; 964 965 pmap_inval_init(&info); 966 pmap_inval_interlock(&info, &kernel_pmap, va); 967 pmap_inval_deinterlock(&info, &kernel_pmap); 968 pmap_inval_done(&info); 969 } 970 971 void 972 pmap_kenter_sync_quick(vm_offset_t va) 973 { 974 cpu_invlpg((void *)va); 975 } 976 977 /* 978 * remove a page from the kernel pagetables 979 */ 980 void 981 pmap_kremove(vm_offset_t va) 982 { 983 pt_entry_t *pte; 984 pmap_inval_info info; 985 986 pmap_inval_init(&info); 987 pte = vtopte(va); 988 pmap_inval_interlock(&info, &kernel_pmap, va); 989 *pte = 0; 990 pmap_inval_deinterlock(&info, &kernel_pmap); 991 pmap_inval_done(&info); 992 } 993 994 void 995 pmap_kremove_quick(vm_offset_t va) 996 { 997 pt_entry_t *pte; 998 pte = vtopte(va); 999 *pte = 0; 1000 cpu_invlpg((void *)va); 1001 } 1002 1003 /* 1004 * XXX these need to be recoded. They are not used in any critical path. 1005 */ 1006 void 1007 pmap_kmodify_rw(vm_offset_t va) 1008 { 1009 *vtopte(va) |= PG_RW; 1010 cpu_invlpg((void *)va); 1011 } 1012 1013 void 1014 pmap_kmodify_nc(vm_offset_t va) 1015 { 1016 *vtopte(va) |= PG_N; 1017 cpu_invlpg((void *)va); 1018 } 1019 1020 /* 1021 * Used to map a range of physical addresses into kernel virtual 1022 * address space during the low level boot, typically to map the 1023 * dump bitmap, message buffer, and vm_page_array. 1024 * 1025 * These mappings are typically made at some pointer after the end of the 1026 * kernel text+data. 1027 * 1028 * We could return PHYS_TO_DMAP(start) here and not allocate any 1029 * via (*virtp), but then kmem from userland and kernel dumps won't 1030 * have access to the related pointers. 
1031 */ 1032 vm_offset_t 1033 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 1034 { 1035 vm_offset_t va; 1036 vm_offset_t va_start; 1037 1038 /*return PHYS_TO_DMAP(start);*/ 1039 1040 va_start = *virtp; 1041 va = va_start; 1042 1043 while (start < end) { 1044 pmap_kenter_quick(va, start); 1045 va += PAGE_SIZE; 1046 start += PAGE_SIZE; 1047 } 1048 *virtp = va; 1049 return va_start; 1050 } 1051 1052 1053 /* 1054 * Add a list of wired pages to the kva 1055 * this routine is only used for temporary 1056 * kernel mappings that do not need to have 1057 * page modification or references recorded. 1058 * Note that old mappings are simply written 1059 * over. The page *must* be wired. 1060 */ 1061 void 1062 pmap_qenter(vm_offset_t va, vm_page_t *m, int count) 1063 { 1064 vm_offset_t end_va; 1065 1066 end_va = va + count * PAGE_SIZE; 1067 1068 while (va < end_va) { 1069 pt_entry_t *pte; 1070 1071 pte = vtopte(va); 1072 *pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag; 1073 cpu_invlpg((void *)va); 1074 va += PAGE_SIZE; 1075 m++; 1076 } 1077 smp_invltlb(); 1078 } 1079 1080 /* 1081 * This routine jerks page mappings from the 1082 * kernel -- it is meant only for temporary mappings. 1083 * 1084 * MPSAFE, INTERRUPT SAFE (cluster callback) 1085 */ 1086 void 1087 pmap_qremove(vm_offset_t va, int count) 1088 { 1089 vm_offset_t end_va; 1090 1091 end_va = va + count * PAGE_SIZE; 1092 1093 while (va < end_va) { 1094 pt_entry_t *pte; 1095 1096 pte = vtopte(va); 1097 *pte = 0; 1098 cpu_invlpg((void *)va); 1099 va += PAGE_SIZE; 1100 } 1101 smp_invltlb(); 1102 } 1103 1104 /* 1105 * This routine works like vm_page_lookup() but also blocks as long as the 1106 * page is busy. This routine does not busy the page it returns. 1107 * 1108 * Unless the caller is managing objects whos pages are in a known state, 1109 * the call should be made with a critical section held so the page's object 1110 * association remains valid on return. 1111 */ 1112 static 1113 vm_page_t 1114 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) 1115 { 1116 vm_page_t m; 1117 1118 do { 1119 m = vm_page_lookup(object, pindex); 1120 } while (m && vm_page_sleep_busy(m, FALSE, "pplookp")); 1121 1122 return(m); 1123 } 1124 1125 /* 1126 * Create a new thread and optionally associate it with a (new) process. 1127 * NOTE! the new thread's cpu may not equal the current cpu. 1128 */ 1129 void 1130 pmap_init_thread(thread_t td) 1131 { 1132 /* enforce pcb placement & alignment */ 1133 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 1134 td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF); 1135 td->td_savefpu = &td->td_pcb->pcb_save; 1136 td->td_sp = (char *)td->td_pcb; /* no -16 */ 1137 } 1138 1139 /* 1140 * This routine directly affects the fork perf for a process. 1141 */ 1142 void 1143 pmap_init_proc(struct proc *p) 1144 { 1145 } 1146 1147 /* 1148 * Dispose the UPAGES for a process that has exited. 1149 * This routine directly impacts the exit perf of a process. 1150 */ 1151 void 1152 pmap_dispose_proc(struct proc *p) 1153 { 1154 KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p)); 1155 } 1156 1157 /*************************************************** 1158 * Page table page management routines..... 1159 ***************************************************/ 1160 1161 /* 1162 * This routine unholds page table pages, and if the hold count 1163 * drops to zero, then it decrements the wire count. 
1164 */ 1165 static __inline 1166 int 1167 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 1168 pmap_inval_info_t info) 1169 { 1170 KKASSERT(m->hold_count > 0); 1171 if (m->hold_count > 1) { 1172 vm_page_unhold(m); 1173 return 0; 1174 } else { 1175 return _pmap_unwire_pte_hold(pmap, va, m, info); 1176 } 1177 } 1178 1179 static 1180 int 1181 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 1182 pmap_inval_info_t info) 1183 { 1184 /* 1185 * Wait until we can busy the page ourselves. We cannot have 1186 * any active flushes if we block. We own one hold count on the 1187 * page so it cannot be freed out from under us. 1188 */ 1189 if (m->flags & PG_BUSY) { 1190 while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) 1191 ; 1192 } 1193 KASSERT(m->queue == PQ_NONE, 1194 ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m)); 1195 1196 /* 1197 * This case can occur if new references were acquired while 1198 * we were blocked. 1199 */ 1200 if (m->hold_count > 1) { 1201 KKASSERT(m->hold_count > 1); 1202 vm_page_unhold(m); 1203 return 0; 1204 } 1205 1206 /* 1207 * Unmap the page table page 1208 */ 1209 KKASSERT(m->hold_count == 1); 1210 vm_page_busy(m); 1211 pmap_inval_interlock(info, pmap, -1); 1212 1213 if (m->pindex >= (NUPDE + NUPDPE)) { 1214 /* PDP page */ 1215 pml4_entry_t *pml4; 1216 pml4 = pmap_pml4e(pmap, va); 1217 *pml4 = 0; 1218 } else if (m->pindex >= NUPDE) { 1219 /* PD page */ 1220 pdp_entry_t *pdp; 1221 pdp = pmap_pdpe(pmap, va); 1222 *pdp = 0; 1223 } else { 1224 /* PT page */ 1225 pd_entry_t *pd; 1226 pd = pmap_pde(pmap, va); 1227 *pd = 0; 1228 } 1229 1230 KKASSERT(pmap->pm_stats.resident_count > 0); 1231 --pmap->pm_stats.resident_count; 1232 1233 if (pmap->pm_ptphint == m) 1234 pmap->pm_ptphint = NULL; 1235 pmap_inval_deinterlock(info, pmap); 1236 1237 if (m->pindex < NUPDE) { 1238 /* We just released a PT, unhold the matching PD */ 1239 vm_page_t pdpg; 1240 1241 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 1242 pmap_unwire_pte_hold(pmap, va, pdpg, info); 1243 } 1244 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 1245 /* We just released a PD, unhold the matching PDP */ 1246 vm_page_t pdppg; 1247 1248 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 1249 pmap_unwire_pte_hold(pmap, va, pdppg, info); 1250 } 1251 1252 /* 1253 * This was our last hold, the page had better be unwired 1254 * after we decrement wire_count. 1255 * 1256 * FUTURE NOTE: shared page directory page could result in 1257 * multiple wire counts. 1258 */ 1259 vm_page_unhold(m); 1260 --m->wire_count; 1261 KKASSERT(m->wire_count == 0); 1262 --vmstats.v_wire_count; 1263 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1264 vm_page_flash(m); 1265 vm_page_free_zero(m); 1266 1267 return 1; 1268 } 1269 1270 /* 1271 * After removing a page table entry, this routine is used to 1272 * conditionally free the page, and manage the hold/wire counts. 
1273 */ 1274 static 1275 int 1276 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte, 1277 pmap_inval_info_t info) 1278 { 1279 vm_pindex_t ptepindex; 1280 1281 if (va >= VM_MAX_USER_ADDRESS) 1282 return 0; 1283 1284 if (mpte == NULL) { 1285 ptepindex = pmap_pde_pindex(va); 1286 #if JGHINT 1287 if (pmap->pm_ptphint && 1288 (pmap->pm_ptphint->pindex == ptepindex)) { 1289 mpte = pmap->pm_ptphint; 1290 } else { 1291 #endif 1292 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1293 pmap->pm_ptphint = mpte; 1294 #if JGHINT 1295 } 1296 #endif 1297 } 1298 return pmap_unwire_pte_hold(pmap, va, mpte, info); 1299 } 1300 1301 /* 1302 * Initialize pmap0/vmspace0. This pmap is not added to pmap_list because 1303 * it, and IdlePTD, represents the template used to update all other pmaps. 1304 * 1305 * On architectures where the kernel pmap is not integrated into the user 1306 * process pmap, this pmap represents the process pmap, not the kernel pmap. 1307 * kernel_pmap should be used to directly access the kernel_pmap. 1308 */ 1309 void 1310 pmap_pinit0(struct pmap *pmap) 1311 { 1312 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 1313 pmap->pm_count = 1; 1314 pmap->pm_active = 0; 1315 pmap->pm_ptphint = NULL; 1316 TAILQ_INIT(&pmap->pm_pvlist); 1317 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1318 } 1319 1320 /* 1321 * Initialize a preallocated and zeroed pmap structure, 1322 * such as one in a vmspace structure. 1323 */ 1324 void 1325 pmap_pinit(struct pmap *pmap) 1326 { 1327 vm_page_t ptdpg; 1328 1329 /* 1330 * No need to allocate page table space yet but we do need a valid 1331 * page directory table. 1332 */ 1333 if (pmap->pm_pml4 == NULL) { 1334 pmap->pm_pml4 = 1335 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE); 1336 } 1337 1338 /* 1339 * Allocate an object for the ptes 1340 */ 1341 if (pmap->pm_pteobj == NULL) 1342 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + PML4PML4I + 1); 1343 1344 /* 1345 * Allocate the page directory page, unless we already have 1346 * one cached. If we used the cached page the wire_count will 1347 * already be set appropriately. 1348 */ 1349 if ((ptdpg = pmap->pm_pdirm) == NULL) { 1350 ptdpg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I, 1351 VM_ALLOC_NORMAL | VM_ALLOC_RETRY); 1352 pmap->pm_pdirm = ptdpg; 1353 vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); 1354 ptdpg->valid = VM_PAGE_BITS_ALL; 1355 if (ptdpg->wire_count == 0) 1356 ++vmstats.v_wire_count; 1357 ptdpg->wire_count = 1; 1358 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); 1359 } 1360 if ((ptdpg->flags & PG_ZERO) == 0) 1361 bzero(pmap->pm_pml4, PAGE_SIZE); 1362 #ifdef PMAP_DEBUG 1363 else 1364 pmap_page_assertzero(VM_PAGE_TO_PHYS(ptdpg)); 1365 #endif 1366 1367 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; 1368 pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; 1369 1370 /* install self-referential address mapping entry */ 1371 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M; 1372 1373 pmap->pm_count = 1; 1374 pmap->pm_active = 0; 1375 pmap->pm_ptphint = NULL; 1376 TAILQ_INIT(&pmap->pm_pvlist); 1377 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1378 pmap->pm_stats.resident_count = 1; 1379 } 1380 1381 /* 1382 * Clean up a pmap structure so it can be physically freed. This routine 1383 * is called by the vmspace dtor function. A great deal of pmap data is 1384 * left passively mapped to improve vmspace management so we have a bit 1385 * of cleanup work to do here. 
1386 */ 1387 void 1388 pmap_puninit(pmap_t pmap) 1389 { 1390 vm_page_t p; 1391 1392 KKASSERT(pmap->pm_active == 0); 1393 lwkt_gettoken(&vm_token); 1394 if ((p = pmap->pm_pdirm) != NULL) { 1395 KKASSERT(pmap->pm_pml4 != NULL); 1396 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1397 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1398 p->wire_count--; 1399 vmstats.v_wire_count--; 1400 KKASSERT((p->flags & PG_BUSY) == 0); 1401 vm_page_busy(p); 1402 vm_page_free_zero(p); 1403 pmap->pm_pdirm = NULL; 1404 } 1405 if (pmap->pm_pml4) { 1406 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1407 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1408 pmap->pm_pml4 = NULL; 1409 } 1410 if (pmap->pm_pteobj) { 1411 vm_object_deallocate(pmap->pm_pteobj); 1412 pmap->pm_pteobj = NULL; 1413 } 1414 lwkt_reltoken(&vm_token); 1415 } 1416 1417 /* 1418 * Wire in kernel global address entries. To avoid a race condition 1419 * between pmap initialization and pmap_growkernel, this procedure 1420 * adds the pmap to the master list (which growkernel scans to update), 1421 * then copies the template. 1422 */ 1423 void 1424 pmap_pinit2(struct pmap *pmap) 1425 { 1426 crit_enter(); 1427 lwkt_gettoken(&vm_token); 1428 TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode); 1429 /* XXX copies current process, does not fill in MPPTDI */ 1430 lwkt_reltoken(&vm_token); 1431 crit_exit(); 1432 } 1433 1434 /* 1435 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1436 * 0 on failure (if the procedure had to sleep). 1437 * 1438 * When asked to remove the page directory page itself, we actually just 1439 * leave it cached so we do not have to incur the SMP inval overhead of 1440 * removing the kernel mapping. pmap_puninit() will take care of it. 1441 */ 1442 static 1443 int 1444 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1445 { 1446 /* 1447 * This code optimizes the case of freeing non-busy 1448 * page-table pages. Those pages are zero now, and 1449 * might as well be placed directly into the zero queue. 1450 */ 1451 if (vm_page_sleep_busy(p, FALSE, "pmaprl")) 1452 return 0; 1453 1454 vm_page_busy(p); 1455 1456 /* 1457 * Remove the page table page from the processes address space. 1458 */ 1459 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1460 /* 1461 * We are the pml4 table itself. 1462 */ 1463 /* XXX anything to do here? */ 1464 } else if (p->pindex >= (NUPDE + NUPDPE)) { 1465 /* 1466 * Remove a PDP page from the PML4. We do not maintain 1467 * hold counts on the PML4 page. 1468 */ 1469 pml4_entry_t *pml4; 1470 vm_page_t m4; 1471 int idx; 1472 1473 m4 = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I); 1474 KKASSERT(m4 != NULL); 1475 pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1476 idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG; 1477 KKASSERT(pml4[idx] != 0); 1478 pml4[idx] = 0; 1479 } else if (p->pindex >= NUPDE) { 1480 /* 1481 * Remove a PD page from the PDP and drop the hold count 1482 * on the PDP. The PDP is left cached in the pmap if 1483 * the hold count drops to 0 so the wire count remains 1484 * intact. 
1485 */ 1486 vm_page_t m3; 1487 pdp_entry_t *pdp; 1488 int idx; 1489 1490 m3 = vm_page_lookup(pmap->pm_pteobj, 1491 NUPDE + NUPDPE + (p->pindex - NUPDE) / NPDPEPG); 1492 KKASSERT(m3 != NULL); 1493 pdp = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); 1494 idx = (p->pindex - NUPDE) % NPDPEPG; 1495 KKASSERT(pdp[idx] != 0); 1496 pdp[idx] = 0; 1497 m3->hold_count--; 1498 } else { 1499 /* 1500 * Remove a PT page from the PD and drop the hold count 1501 * on the PD. The PD is left cached in the pmap if 1502 * the hold count drops to 0 so the wire count remains 1503 * intact. 1504 */ 1505 vm_page_t m2; 1506 pd_entry_t *pd; 1507 int idx; 1508 1509 m2 = vm_page_lookup(pmap->pm_pteobj, 1510 NUPDE + p->pindex / NPDEPG); 1511 KKASSERT(m2 != NULL); 1512 pd = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); 1513 idx = p->pindex % NPDEPG; 1514 pd[idx] = 0; 1515 m2->hold_count--; 1516 } 1517 1518 /* 1519 * One fewer mappings in the pmap. p's hold count had better 1520 * be zero. 1521 */ 1522 KKASSERT(pmap->pm_stats.resident_count > 0); 1523 --pmap->pm_stats.resident_count; 1524 if (p->hold_count) 1525 panic("pmap_release: freeing held page table page"); 1526 if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) 1527 pmap->pm_ptphint = NULL; 1528 1529 /* 1530 * We leave the top-level page table page cached, wired, and mapped in 1531 * the pmap until the dtor function (pmap_puninit()) gets called. 1532 * However, still clean it up so we can set PG_ZERO. 1533 */ 1534 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1535 bzero(pmap->pm_pml4, PAGE_SIZE); 1536 vm_page_flag_set(p, PG_ZERO); 1537 vm_page_wakeup(p); 1538 } else { 1539 p->wire_count--; 1540 KKASSERT(p->wire_count == 0); 1541 vmstats.v_wire_count--; 1542 /* JG eventually revert to using vm_page_free_zero() */ 1543 vm_page_free(p); 1544 } 1545 return 1; 1546 } 1547 1548 /* 1549 * This routine is called when various levels in the page table need to 1550 * be populated. This routine cannot fail. 1551 */ 1552 static 1553 vm_page_t 1554 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) 1555 { 1556 vm_page_t m; 1557 1558 /* 1559 * Find or fabricate a new pagetable page. This will busy the page. 1560 */ 1561 m = vm_page_grab(pmap->pm_pteobj, ptepindex, 1562 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); 1563 if ((m->flags & PG_ZERO) == 0) { 1564 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 1565 } 1566 #ifdef PMAP_DEBUG 1567 else { 1568 pmap_page_assertzero(VM_PAGE_TO_PHYS(m)); 1569 } 1570 #endif 1571 1572 KASSERT(m->queue == PQ_NONE, 1573 ("_pmap_allocpte: %p->queue != PQ_NONE", m)); 1574 1575 /* 1576 * Increment the hold count for the page we will be returning to 1577 * the caller. 1578 */ 1579 m->hold_count++; 1580 if (m->wire_count++ == 0) 1581 vmstats.v_wire_count++; 1582 m->valid = VM_PAGE_BITS_ALL; 1583 vm_page_flag_clear(m, PG_ZERO); 1584 1585 /* 1586 * Map the pagetable page into the process address space, if 1587 * it isn't already there. 1588 * 1589 * It is possible that someone else got in and mapped the page 1590 * directory page while we were blocked, if so just unbusy and 1591 * return the held page. 
1592 */ 1593 if (ptepindex >= (NUPDE + NUPDPE)) { 1594 /* 1595 * Wire up a new PDP page in the PML4 1596 */ 1597 vm_pindex_t pml4index; 1598 pml4_entry_t *pml4; 1599 1600 pml4index = ptepindex - (NUPDE + NUPDPE); 1601 pml4 = &pmap->pm_pml4[pml4index]; 1602 if (*pml4 & PG_V) { 1603 if (--m->wire_count == 0) 1604 --vmstats.v_wire_count; 1605 vm_page_wakeup(m); 1606 return(m); 1607 } 1608 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1609 } else if (ptepindex >= NUPDE) { 1610 /* 1611 * Wire up a new PD page in the PDP 1612 */ 1613 vm_pindex_t pml4index; 1614 vm_pindex_t pdpindex; 1615 vm_page_t pdppg; 1616 pml4_entry_t *pml4; 1617 pdp_entry_t *pdp; 1618 1619 pdpindex = ptepindex - NUPDE; 1620 pml4index = pdpindex >> NPML4EPGSHIFT; 1621 1622 pml4 = &pmap->pm_pml4[pml4index]; 1623 if ((*pml4 & PG_V) == 0) { 1624 /* 1625 * Have to allocate a new PDP page, recurse. 1626 * This always succeeds. Returned page will 1627 * be held. 1628 */ 1629 pdppg = _pmap_allocpte(pmap, 1630 NUPDE + NUPDPE + pml4index); 1631 } else { 1632 /* 1633 * Add a held reference to the PDP page. 1634 */ 1635 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 1636 pdppg->hold_count++; 1637 } 1638 1639 /* 1640 * Now find the pdp_entry and map the PDP. If the PDP 1641 * has already been mapped unwind and return the 1642 * already-mapped PDP held. 1643 * 1644 * pdppg is left held (hold_count is incremented for 1645 * each PD in the PDP). 1646 */ 1647 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1648 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1649 if (*pdp & PG_V) { 1650 vm_page_unhold(pdppg); 1651 if (--m->wire_count == 0) 1652 --vmstats.v_wire_count; 1653 vm_page_wakeup(m); 1654 return(m); 1655 } 1656 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1657 } else { 1658 /* 1659 * Wire up the new PT page in the PD 1660 */ 1661 vm_pindex_t pml4index; 1662 vm_pindex_t pdpindex; 1663 pml4_entry_t *pml4; 1664 pdp_entry_t *pdp; 1665 pd_entry_t *pd; 1666 vm_page_t pdpg; 1667 1668 pdpindex = ptepindex >> NPDPEPGSHIFT; 1669 pml4index = pdpindex >> NPML4EPGSHIFT; 1670 1671 /* 1672 * Locate the PDP page in the PML4, then the PD page in 1673 * the PDP. If either does not exist we simply recurse 1674 * to allocate them. 1675 * 1676 * We can just recurse on the PD page as it will recurse 1677 * on the PDP if necessary. 1678 */ 1679 pml4 = &pmap->pm_pml4[pml4index]; 1680 if ((*pml4 & PG_V) == 0) { 1681 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex); 1682 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1683 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1684 } else { 1685 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1686 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1687 if ((*pdp & PG_V) == 0) { 1688 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex); 1689 } else { 1690 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 1691 pdpg->hold_count++; 1692 } 1693 } 1694 1695 /* 1696 * Now fill in the pte in the PD. If the pte already exists 1697 * (again, if we raced the grab), unhold pdpg and unwire 1698 * m, returning a held m. 1699 * 1700 * pdpg is left held (hold_count is incremented for 1701 * each PT in the PD). 
1702 */ 1703 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 1704 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 1705 if (*pd != 0) { 1706 vm_page_unhold(pdpg); 1707 if (--m->wire_count == 0) 1708 --vmstats.v_wire_count; 1709 vm_page_wakeup(m); 1710 return(m); 1711 } 1712 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1713 } 1714 1715 /* 1716 * We successfully loaded a PDP, PD, or PTE. Set the page table hint, 1717 * valid bits, mapped flag, unbusy, and we're done. 1718 */ 1719 pmap->pm_ptphint = m; 1720 ++pmap->pm_stats.resident_count; 1721 1722 #if 0 1723 m->valid = VM_PAGE_BITS_ALL; 1724 vm_page_flag_clear(m, PG_ZERO); 1725 #endif 1726 vm_page_flag_set(m, PG_MAPPED); 1727 vm_page_wakeup(m); 1728 1729 return (m); 1730 } 1731 1732 static 1733 vm_page_t 1734 pmap_allocpte(pmap_t pmap, vm_offset_t va) 1735 { 1736 vm_pindex_t ptepindex; 1737 pd_entry_t *pd; 1738 vm_page_t m; 1739 1740 /* 1741 * Calculate pagetable page index 1742 */ 1743 ptepindex = pmap_pde_pindex(va); 1744 1745 /* 1746 * Get the page directory entry 1747 */ 1748 pd = pmap_pde(pmap, va); 1749 1750 /* 1751 * This supports switching from a 2MB page to a 1752 * normal 4K page. 1753 */ 1754 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 1755 panic("no promotion/demotion yet"); 1756 *pd = 0; 1757 pd = NULL; 1758 cpu_invltlb(); 1759 smp_invltlb(); 1760 } 1761 1762 /* 1763 * If the page table page is mapped, we just increment the 1764 * hold count, and activate it. 1765 */ 1766 if (pd != NULL && (*pd & PG_V) != 0) { 1767 /* YYY hint is used here on i386 */ 1768 m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); 1769 pmap->pm_ptphint = m; 1770 m->hold_count++; 1771 return m; 1772 } 1773 /* 1774 * Here if the pte page isn't mapped, or if it has been deallocated. 1775 */ 1776 return _pmap_allocpte(pmap, ptepindex); 1777 } 1778 1779 1780 /*************************************************** 1781 * Pmap allocation/deallocation routines. 1782 ***************************************************/ 1783 1784 /* 1785 * Release any resources held by the given physical map. 1786 * Called when a pmap initialized by pmap_pinit is being released. 1787 * Should only be called if the map contains no valid mappings. 1788 */ 1789 static int pmap_release_callback(struct vm_page *p, void *data); 1790 1791 void 1792 pmap_release(struct pmap *pmap) 1793 { 1794 vm_object_t object = pmap->pm_pteobj; 1795 struct rb_vm_page_scan_info info; 1796 1797 KASSERT(pmap->pm_active == 0, 1798 ("pmap still active! 
%016jx", (uintmax_t)pmap->pm_active)); 1799 #if defined(DIAGNOSTIC) 1800 if (object->ref_count != 1) 1801 panic("pmap_release: pteobj reference count != 1"); 1802 #endif 1803 1804 info.pmap = pmap; 1805 info.object = object; 1806 crit_enter(); 1807 lwkt_gettoken(&vm_token); 1808 TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode); 1809 crit_exit(); 1810 1811 do { 1812 crit_enter(); 1813 info.error = 0; 1814 info.mpte = NULL; 1815 info.limit = object->generation; 1816 1817 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 1818 pmap_release_callback, &info); 1819 if (info.error == 0 && info.mpte) { 1820 if (!pmap_release_free_page(pmap, info.mpte)) 1821 info.error = 1; 1822 } 1823 crit_exit(); 1824 } while (info.error); 1825 lwkt_reltoken(&vm_token); 1826 } 1827 1828 static 1829 int 1830 pmap_release_callback(struct vm_page *p, void *data) 1831 { 1832 struct rb_vm_page_scan_info *info = data; 1833 1834 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1835 info->mpte = p; 1836 return(0); 1837 } 1838 if (!pmap_release_free_page(info->pmap, p)) { 1839 info->error = 1; 1840 return(-1); 1841 } 1842 if (info->object->generation != info->limit) { 1843 info->error = 1; 1844 return(-1); 1845 } 1846 return(0); 1847 } 1848 1849 /* 1850 * Grow the number of kernel page table entries, if needed. 1851 * 1852 * This routine is always called to validate any address space 1853 * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address 1854 * space below KERNBASE. 1855 */ 1856 void 1857 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 1858 { 1859 vm_paddr_t paddr; 1860 vm_offset_t ptppaddr; 1861 vm_page_t nkpg; 1862 pd_entry_t *pde, newpdir; 1863 pdp_entry_t newpdp; 1864 int update_kernel_vm_end; 1865 1866 crit_enter(); 1867 lwkt_gettoken(&vm_token); 1868 1869 /* 1870 * bootstrap kernel_vm_end on first real VM use 1871 */ 1872 if (kernel_vm_end == 0) { 1873 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 1874 nkpt = 0; 1875 while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & PG_V) != 0) { 1876 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 1877 ~(PAGE_SIZE * NPTEPG - 1); 1878 nkpt++; 1879 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1880 kernel_vm_end = kernel_map.max_offset; 1881 break; 1882 } 1883 } 1884 } 1885 1886 /* 1887 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 1888 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 1889 * do not want to force-fill 128G worth of page tables. 
1890 */ 1891 if (kstart < KERNBASE) { 1892 if (kstart > kernel_vm_end) 1893 kstart = kernel_vm_end; 1894 KKASSERT(kend <= KERNBASE); 1895 update_kernel_vm_end = 1; 1896 } else { 1897 update_kernel_vm_end = 0; 1898 } 1899 1900 kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG); 1901 kend = roundup2(kend, PAGE_SIZE * NPTEPG); 1902 1903 if (kend - 1 >= kernel_map.max_offset) 1904 kend = kernel_map.max_offset; 1905 1906 while (kstart < kend) { 1907 pde = pmap_pde(&kernel_pmap, kstart); 1908 if (pde == NULL) { 1909 /* We need a new PDP entry */ 1910 nkpg = vm_page_alloc(kptobj, nkpt, 1911 VM_ALLOC_NORMAL | 1912 VM_ALLOC_SYSTEM | 1913 VM_ALLOC_INTERRUPT); 1914 if (nkpg == NULL) { 1915 panic("pmap_growkernel: no memory to grow " 1916 "kernel"); 1917 } 1918 paddr = VM_PAGE_TO_PHYS(nkpg); 1919 if ((nkpg->flags & PG_ZERO) == 0) 1920 pmap_zero_page(paddr); 1921 vm_page_flag_clear(nkpg, PG_ZERO); 1922 newpdp = (pdp_entry_t) 1923 (paddr | PG_V | PG_RW | PG_A | PG_M); 1924 *pmap_pdpe(&kernel_pmap, kstart) = newpdp; 1925 nkpt++; 1926 continue; /* try again */ 1927 } 1928 if ((*pde & PG_V) != 0) { 1929 kstart = (kstart + PAGE_SIZE * NPTEPG) & 1930 ~(PAGE_SIZE * NPTEPG - 1); 1931 if (kstart - 1 >= kernel_map.max_offset) { 1932 kstart = kernel_map.max_offset; 1933 break; 1934 } 1935 continue; 1936 } 1937 1938 /* 1939 * This index is bogus, but out of the way 1940 */ 1941 nkpg = vm_page_alloc(kptobj, nkpt, 1942 VM_ALLOC_NORMAL | 1943 VM_ALLOC_SYSTEM | 1944 VM_ALLOC_INTERRUPT); 1945 if (nkpg == NULL) 1946 panic("pmap_growkernel: no memory to grow kernel"); 1947 1948 vm_page_wire(nkpg); 1949 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1950 pmap_zero_page(ptppaddr); 1951 vm_page_flag_clear(nkpg, PG_ZERO); 1952 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 1953 *pmap_pde(&kernel_pmap, kstart) = newpdir; 1954 nkpt++; 1955 1956 kstart = (kstart + PAGE_SIZE * NPTEPG) & 1957 ~(PAGE_SIZE * NPTEPG - 1); 1958 1959 if (kstart - 1 >= kernel_map.max_offset) { 1960 kstart = kernel_map.max_offset; 1961 break; 1962 } 1963 } 1964 1965 /* 1966 * Only update kernel_vm_end for areas below KERNBASE. 1967 */ 1968 if (update_kernel_vm_end && kernel_vm_end < kstart) 1969 kernel_vm_end = kstart; 1970 1971 lwkt_reltoken(&vm_token); 1972 crit_exit(); 1973 } 1974 1975 /* 1976 * Retire the given physical map from service. 1977 * Should only be called if the map contains 1978 * no valid mappings. 1979 */ 1980 void 1981 pmap_destroy(pmap_t pmap) 1982 { 1983 int count; 1984 1985 if (pmap == NULL) 1986 return; 1987 1988 lwkt_gettoken(&vm_token); 1989 count = --pmap->pm_count; 1990 if (count == 0) { 1991 pmap_release(pmap); 1992 panic("destroying a pmap is not yet implemented"); 1993 } 1994 lwkt_reltoken(&vm_token); 1995 } 1996 1997 /* 1998 * Add a reference to the specified pmap. 1999 */ 2000 void 2001 pmap_reference(pmap_t pmap) 2002 { 2003 if (pmap != NULL) { 2004 lwkt_gettoken(&vm_token); 2005 pmap->pm_count++; 2006 lwkt_reltoken(&vm_token); 2007 } 2008 } 2009 2010 /*************************************************** 2011 * page management routines. 2012 ***************************************************/ 2013 2014 /* 2015 * free the pv_entry back to the free list. This function may be 2016 * called from an interrupt. 2017 */ 2018 static __inline 2019 void 2020 free_pv_entry(pv_entry_t pv) 2021 { 2022 pv_entry_count--; 2023 KKASSERT(pv_entry_count >= 0); 2024 zfree(pvzone, pv); 2025 } 2026 2027 /* 2028 * get a new pv_entry, allocating a block from the system 2029 * when needed. This function may be called from an interrupt. 
2030 */ 2031 static 2032 pv_entry_t 2033 get_pv_entry(void) 2034 { 2035 pv_entry_count++; 2036 if (pv_entry_high_water && 2037 (pv_entry_count > pv_entry_high_water) && 2038 (pmap_pagedaemon_waken == 0)) { 2039 pmap_pagedaemon_waken = 1; 2040 wakeup(&vm_pages_needed); 2041 } 2042 return zalloc(pvzone); 2043 } 2044 2045 /* 2046 * This routine is very drastic, but can save the system 2047 * in a pinch. 2048 */ 2049 void 2050 pmap_collect(void) 2051 { 2052 int i; 2053 vm_page_t m; 2054 static int warningdone=0; 2055 2056 if (pmap_pagedaemon_waken == 0) 2057 return; 2058 lwkt_gettoken(&vm_token); 2059 if (warningdone < 5) { 2060 kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); 2061 warningdone++; 2062 } 2063 2064 for(i = 0; i < vm_page_array_size; i++) { 2065 m = &vm_page_array[i]; 2066 if (m->wire_count || m->hold_count || m->busy || 2067 (m->flags & PG_BUSY)) 2068 continue; 2069 pmap_remove_all(m); 2070 } 2071 pmap_pagedaemon_waken = 0; 2072 lwkt_reltoken(&vm_token); 2073 } 2074 2075 2076 /* 2077 * If it is the first entry on the list, it is actually 2078 * in the header and we must copy the following entry up 2079 * to the header. Otherwise we must search the list for 2080 * the entry. In either case we free the now unused entry. 2081 */ 2082 static 2083 int 2084 pmap_remove_entry(struct pmap *pmap, vm_page_t m, 2085 vm_offset_t va, pmap_inval_info_t info) 2086 { 2087 pv_entry_t pv; 2088 int rtval; 2089 2090 crit_enter(); 2091 if (m->md.pv_list_count < pmap->pm_stats.resident_count) { 2092 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2093 if (pmap == pv->pv_pmap && va == pv->pv_va) 2094 break; 2095 } 2096 } else { 2097 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { 2098 if (va == pv->pv_va) 2099 break; 2100 } 2101 } 2102 2103 rtval = 0; 2104 KKASSERT(pv); 2105 2106 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2107 m->md.pv_list_count--; 2108 m->object->agg_pv_list_count--; 2109 KKASSERT(m->md.pv_list_count >= 0); 2110 if (TAILQ_EMPTY(&m->md.pv_list)) 2111 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2112 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 2113 ++pmap->pm_generation; 2114 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info); 2115 free_pv_entry(pv); 2116 2117 crit_exit(); 2118 return rtval; 2119 } 2120 2121 /* 2122 * Create a pv entry for page at pa for 2123 * (pmap, va). 2124 */ 2125 static 2126 void 2127 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) 2128 { 2129 pv_entry_t pv; 2130 2131 crit_enter(); 2132 pv = get_pv_entry(); 2133 pv->pv_va = va; 2134 pv->pv_pmap = pmap; 2135 pv->pv_ptem = mpte; 2136 2137 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); 2138 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2139 ++pmap->pm_generation; 2140 m->md.pv_list_count++; 2141 m->object->agg_pv_list_count++; 2142 2143 crit_exit(); 2144 } 2145 2146 /* 2147 * pmap_remove_pte: do the things to unmap a page in a process 2148 */ 2149 static 2150 int 2151 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, 2152 pmap_inval_info_t info) 2153 { 2154 pt_entry_t oldpte; 2155 vm_page_t m; 2156 2157 pmap_inval_interlock(info, pmap, va); 2158 oldpte = pte_load_clear(ptq); 2159 pmap_inval_deinterlock(info, pmap); 2160 if (oldpte & PG_W) 2161 pmap->pm_stats.wired_count -= 1; 2162 /* 2163 * Machines that don't support invlpg, also don't support 2164 * PG_G. XXX PG_G is disabled for SMP so don't worry about 2165 * the SMP case. 
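 * Global (PG_G) mappings survive a %cr3 reload, so a cleared PG_G pte is
 * flushed explicitly with cpu_invlpg().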
2166 */ 2167 if (oldpte & PG_G) 2168 cpu_invlpg((void *)va); 2169 KKASSERT(pmap->pm_stats.resident_count > 0); 2170 --pmap->pm_stats.resident_count; 2171 if (oldpte & PG_MANAGED) { 2172 m = PHYS_TO_VM_PAGE(oldpte); 2173 if (oldpte & PG_M) { 2174 #if defined(PMAP_DIAGNOSTIC) 2175 if (pmap_nw_modified((pt_entry_t) oldpte)) { 2176 kprintf( 2177 "pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2178 va, oldpte); 2179 } 2180 #endif 2181 if (pmap_track_modified(va)) 2182 vm_page_dirty(m); 2183 } 2184 if (oldpte & PG_A) 2185 vm_page_flag_set(m, PG_REFERENCED); 2186 return pmap_remove_entry(pmap, m, va, info); 2187 } else { 2188 return pmap_unuse_pt(pmap, va, NULL, info); 2189 } 2190 2191 return 0; 2192 } 2193 2194 /* 2195 * pmap_remove_page: 2196 * 2197 * Remove a single page from a process address space. 2198 * 2199 * This function may not be called from an interrupt if the pmap is 2200 * not kernel_pmap. 2201 */ 2202 static 2203 void 2204 pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info) 2205 { 2206 pt_entry_t *pte; 2207 2208 pte = pmap_pte(pmap, va); 2209 if (pte == NULL) 2210 return; 2211 if ((*pte & PG_V) == 0) 2212 return; 2213 pmap_remove_pte(pmap, pte, va, info); 2214 } 2215 2216 /* 2217 * pmap_remove: 2218 * 2219 * Remove the given range of addresses from the specified map. 2220 * 2221 * It is assumed that the start and end are properly 2222 * rounded to the page size. 2223 * 2224 * This function may not be called from an interrupt if the pmap is 2225 * not kernel_pmap. 2226 */ 2227 void 2228 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 2229 { 2230 vm_offset_t va_next; 2231 pml4_entry_t *pml4e; 2232 pdp_entry_t *pdpe; 2233 pd_entry_t ptpaddr, *pde; 2234 pt_entry_t *pte; 2235 struct pmap_inval_info info; 2236 2237 if (pmap == NULL) 2238 return; 2239 2240 lwkt_gettoken(&vm_token); 2241 if (pmap->pm_stats.resident_count == 0) { 2242 lwkt_reltoken(&vm_token); 2243 return; 2244 } 2245 2246 pmap_inval_init(&info); 2247 2248 /* 2249 * special handling of removing one page. a very 2250 * common operation and easy to short circuit some 2251 * code. 2252 */ 2253 if (sva + PAGE_SIZE == eva) { 2254 pde = pmap_pde(pmap, sva); 2255 if (pde && (*pde & PG_PS) == 0) { 2256 pmap_remove_page(pmap, sva, &info); 2257 pmap_inval_done(&info); 2258 lwkt_reltoken(&vm_token); 2259 return; 2260 } 2261 } 2262 2263 for (; sva < eva; sva = va_next) { 2264 pml4e = pmap_pml4e(pmap, sva); 2265 if ((*pml4e & PG_V) == 0) { 2266 va_next = (sva + NBPML4) & ~PML4MASK; 2267 if (va_next < sva) 2268 va_next = eva; 2269 continue; 2270 } 2271 2272 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2273 if ((*pdpe & PG_V) == 0) { 2274 va_next = (sva + NBPDP) & ~PDPMASK; 2275 if (va_next < sva) 2276 va_next = eva; 2277 continue; 2278 } 2279 2280 /* 2281 * Calculate index for next page table. 2282 */ 2283 va_next = (sva + NBPDR) & ~PDRMASK; 2284 if (va_next < sva) 2285 va_next = eva; 2286 2287 pde = pmap_pdpe_to_pde(pdpe, sva); 2288 ptpaddr = *pde; 2289 2290 /* 2291 * Weed out invalid mappings. 2292 */ 2293 if (ptpaddr == 0) 2294 continue; 2295 2296 /* 2297 * Check for large page. 
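 * A valid 2MB (PG_PS) mapping is removed by clearing the pde itself and
 * the resident count drops by NBPDR / PAGE_SIZE pages.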
2298 */ 2299 if ((ptpaddr & PG_PS) != 0) { 2300 /* JG FreeBSD has more complex treatment here */ 2301 pmap_inval_interlock(&info, pmap, -1); 2302 *pde = 0; 2303 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2304 pmap_inval_deinterlock(&info, pmap); 2305 continue; 2306 } 2307 2308 /* 2309 * Limit our scan to either the end of the va represented 2310 * by the current page table page, or to the end of the 2311 * range being removed. 2312 */ 2313 if (va_next > eva) 2314 va_next = eva; 2315 2316 /* 2317 * NOTE: pmap_remove_pte() can block. 2318 */ 2319 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2320 sva += PAGE_SIZE) { 2321 if (*pte == 0) 2322 continue; 2323 if (pmap_remove_pte(pmap, pte, sva, &info)) 2324 break; 2325 } 2326 } 2327 pmap_inval_done(&info); 2328 lwkt_reltoken(&vm_token); 2329 } 2330 2331 /* 2332 * pmap_remove_all: 2333 * 2334 * Removes this physical page from all physical maps in which it resides. 2335 * Reflects back modify bits to the pager. 2336 * 2337 * This routine may not be called from an interrupt. 2338 */ 2339 2340 static 2341 void 2342 pmap_remove_all(vm_page_t m) 2343 { 2344 struct pmap_inval_info info; 2345 pt_entry_t *pte, tpte; 2346 pv_entry_t pv; 2347 2348 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2349 return; 2350 2351 lwkt_gettoken(&vm_token); 2352 pmap_inval_init(&info); 2353 crit_enter(); 2354 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2355 KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0); 2356 --pv->pv_pmap->pm_stats.resident_count; 2357 2358 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 2359 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); 2360 tpte = pte_load_clear(pte); 2361 if (tpte & PG_W) 2362 pv->pv_pmap->pm_stats.wired_count--; 2363 pmap_inval_deinterlock(&info, pv->pv_pmap); 2364 if (tpte & PG_A) 2365 vm_page_flag_set(m, PG_REFERENCED); 2366 2367 /* 2368 * Update the vm_page_t clean and reference bits. 2369 */ 2370 if (tpte & PG_M) { 2371 #if defined(PMAP_DIAGNOSTIC) 2372 if (pmap_nw_modified(tpte)) { 2373 kprintf( 2374 "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2375 pv->pv_va, tpte); 2376 } 2377 #endif 2378 if (pmap_track_modified(pv->pv_va)) 2379 vm_page_dirty(m); 2380 } 2381 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2382 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); 2383 ++pv->pv_pmap->pm_generation; 2384 m->md.pv_list_count--; 2385 m->object->agg_pv_list_count--; 2386 KKASSERT(m->md.pv_list_count >= 0); 2387 if (TAILQ_EMPTY(&m->md.pv_list)) 2388 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2389 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info); 2390 free_pv_entry(pv); 2391 } 2392 crit_exit(); 2393 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 2394 pmap_inval_done(&info); 2395 lwkt_reltoken(&vm_token); 2396 } 2397 2398 /* 2399 * pmap_protect: 2400 * 2401 * Set the physical protection on the specified range of this map 2402 * as requested. 2403 * 2404 * This function may not be called from an interrupt if the map is 2405 * not the kernel_pmap. 
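 *
 * Removing read access degrades to a full pmap_remove() of the range,
 * while a request that includes write access is a no-op because this
 * routine only ever narrows permissions.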
2406 */ 2407 void 2408 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2409 { 2410 vm_offset_t va_next; 2411 pml4_entry_t *pml4e; 2412 pdp_entry_t *pdpe; 2413 pd_entry_t ptpaddr, *pde; 2414 pt_entry_t *pte; 2415 pmap_inval_info info; 2416 2417 /* JG review for NX */ 2418 2419 if (pmap == NULL) 2420 return; 2421 2422 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2423 pmap_remove(pmap, sva, eva); 2424 return; 2425 } 2426 2427 if (prot & VM_PROT_WRITE) 2428 return; 2429 2430 lwkt_gettoken(&vm_token); 2431 pmap_inval_init(&info); 2432 2433 for (; sva < eva; sva = va_next) { 2434 2435 pml4e = pmap_pml4e(pmap, sva); 2436 if ((*pml4e & PG_V) == 0) { 2437 va_next = (sva + NBPML4) & ~PML4MASK; 2438 if (va_next < sva) 2439 va_next = eva; 2440 continue; 2441 } 2442 2443 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2444 if ((*pdpe & PG_V) == 0) { 2445 va_next = (sva + NBPDP) & ~PDPMASK; 2446 if (va_next < sva) 2447 va_next = eva; 2448 continue; 2449 } 2450 2451 va_next = (sva + NBPDR) & ~PDRMASK; 2452 if (va_next < sva) 2453 va_next = eva; 2454 2455 pde = pmap_pdpe_to_pde(pdpe, sva); 2456 ptpaddr = *pde; 2457 2458 /* 2459 * Check for large page. 2460 */ 2461 if ((ptpaddr & PG_PS) != 0) { 2462 pmap_inval_interlock(&info, pmap, -1); 2463 *pde &= ~(PG_M|PG_RW); 2464 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2465 pmap_inval_deinterlock(&info, pmap); 2466 continue; 2467 } 2468 2469 /* 2470 * Weed out invalid mappings. Note: we assume that the page 2471 * directory table is always allocated, and in kernel virtual. 2472 */ 2473 if (ptpaddr == 0) 2474 continue; 2475 2476 if (va_next > eva) 2477 va_next = eva; 2478 2479 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2480 sva += PAGE_SIZE) { 2481 pt_entry_t pbits; 2482 pt_entry_t cbits; 2483 vm_page_t m; 2484 2485 /* 2486 * XXX non-optimal. 2487 */ 2488 pmap_inval_interlock(&info, pmap, sva); 2489 again: 2490 pbits = *pte; 2491 cbits = pbits; 2492 if ((pbits & PG_V) == 0) { 2493 pmap_inval_deinterlock(&info, pmap); 2494 continue; 2495 } 2496 if (pbits & PG_MANAGED) { 2497 m = NULL; 2498 if (pbits & PG_A) { 2499 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2500 vm_page_flag_set(m, PG_REFERENCED); 2501 cbits &= ~PG_A; 2502 } 2503 if (pbits & PG_M) { 2504 if (pmap_track_modified(sva)) { 2505 if (m == NULL) 2506 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2507 vm_page_dirty(m); 2508 cbits &= ~PG_M; 2509 } 2510 } 2511 } 2512 cbits &= ~PG_RW; 2513 if (pbits != cbits && 2514 !atomic_cmpset_long(pte, pbits, cbits)) { 2515 goto again; 2516 } 2517 pmap_inval_deinterlock(&info, pmap); 2518 } 2519 } 2520 pmap_inval_done(&info); 2521 lwkt_reltoken(&vm_token); 2522 } 2523 2524 /* 2525 * Insert the given physical page (p) at 2526 * the specified virtual address (v) in the 2527 * target physical map with the protection requested. 2528 * 2529 * If specified, the page will be wired down, meaning 2530 * that the related pte can not be reclaimed. 2531 * 2532 * NB: This is the only routine which MAY NOT lazy-evaluate 2533 * or lose information. That is, this routine must actually 2534 * insert this page into the given map NOW. 
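 *
 * Hypothetical usage sketch (names are illustrative only, this is not a
 * call site in this file): a fault path holding a busied, valid page 'm'
 * for user address 'va' of lwp 'lp' would do
 *
 *	pmap_enter(vmspace_pmap(lp->lwp_vmspace), va, m,
 *		   VM_PROT_READ | VM_PROT_WRITE, FALSE);
 *
 * after which the pte is valid and, for managed pages, a pv entry links
 * the page to this pmap.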
2535 */ 2536 void 2537 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2538 boolean_t wired) 2539 { 2540 vm_paddr_t pa; 2541 pd_entry_t *pde; 2542 pt_entry_t *pte; 2543 vm_paddr_t opa; 2544 pt_entry_t origpte, newpte; 2545 vm_page_t mpte; 2546 pmap_inval_info info; 2547 2548 if (pmap == NULL) 2549 return; 2550 2551 va = trunc_page(va); 2552 #ifdef PMAP_DIAGNOSTIC 2553 if (va >= KvaEnd) 2554 panic("pmap_enter: toobig"); 2555 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 2556 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); 2557 #endif 2558 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 2559 kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n"); 2560 #ifdef DDB 2561 db_print_backtrace(); 2562 #endif 2563 } 2564 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 2565 kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n"); 2566 #ifdef DDB 2567 db_print_backtrace(); 2568 #endif 2569 } 2570 2571 lwkt_gettoken(&vm_token); 2572 2573 /* 2574 * In the case that a page table page is not 2575 * resident, we are creating it here. 2576 */ 2577 if (va < VM_MAX_USER_ADDRESS) 2578 mpte = pmap_allocpte(pmap, va); 2579 else 2580 mpte = NULL; 2581 2582 pmap_inval_init(&info); 2583 pde = pmap_pde(pmap, va); 2584 if (pde != NULL && (*pde & PG_V) != 0) { 2585 if ((*pde & PG_PS) != 0) 2586 panic("pmap_enter: attempted pmap_enter on 2MB page"); 2587 pte = pmap_pde_to_pte(pde, va); 2588 } else 2589 panic("pmap_enter: invalid page directory va=%#lx", va); 2590 2591 KKASSERT(pte != NULL); 2592 pa = VM_PAGE_TO_PHYS(m); 2593 origpte = *pte; 2594 opa = origpte & PG_FRAME; 2595 2596 /* 2597 * Mapping has not changed, must be protection or wiring change. 2598 */ 2599 if (origpte && (opa == pa)) { 2600 /* 2601 * Wiring change, just update stats. We don't worry about 2602 * wiring PT pages as they remain resident as long as there 2603 * are valid mappings in them. Hence, if a user page is wired, 2604 * the PT page will be also. 2605 */ 2606 if (wired && ((origpte & PG_W) == 0)) 2607 pmap->pm_stats.wired_count++; 2608 else if (!wired && (origpte & PG_W)) 2609 pmap->pm_stats.wired_count--; 2610 2611 #if defined(PMAP_DIAGNOSTIC) 2612 if (pmap_nw_modified(origpte)) { 2613 kprintf( 2614 "pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2615 va, origpte); 2616 } 2617 #endif 2618 2619 /* 2620 * Remove the extra pte reference. Note that we cannot 2621 * optimize the RO->RW case because we have adjusted the 2622 * wiring count above and may need to adjust the wiring 2623 * bits below. 2624 */ 2625 if (mpte) 2626 mpte->hold_count--; 2627 2628 /* 2629 * We might be turning off write access to the page, 2630 * so we go ahead and sense modify status. 2631 */ 2632 if (origpte & PG_MANAGED) { 2633 if ((origpte & PG_M) && pmap_track_modified(va)) { 2634 vm_page_t om; 2635 om = PHYS_TO_VM_PAGE(opa); 2636 vm_page_dirty(om); 2637 } 2638 pa |= PG_MANAGED; 2639 KKASSERT(m->flags & PG_MAPPED); 2640 } 2641 goto validate; 2642 } 2643 /* 2644 * Mapping has changed, invalidate old range and fall through to 2645 * handle validating new mapping. 2646 */ 2647 while (opa) { 2648 int err; 2649 err = pmap_remove_pte(pmap, pte, va, &info); 2650 if (err) 2651 panic("pmap_enter: pte vanished, va: 0x%lx", va); 2652 origpte = *pte; 2653 opa = origpte & PG_FRAME; 2654 if (opa) { 2655 kprintf("pmap_enter: Warning, raced pmap %p va %p\n", 2656 pmap, (void *)va); 2657 } 2658 } 2659 2660 /* 2661 * Enter on the PV list if part of our managed memory. 
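 * Fictitious and unmanaged pages are skipped and never receive a pv entry.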
Note that we 2662 * raise IPL while manipulating pv_table since pmap_enter can be 2663 * called at interrupt time. 2664 */ 2665 if (pmap_initialized && 2666 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 2667 pmap_insert_entry(pmap, va, mpte, m); 2668 pa |= PG_MANAGED; 2669 vm_page_flag_set(m, PG_MAPPED); 2670 } 2671 2672 /* 2673 * Increment counters 2674 */ 2675 ++pmap->pm_stats.resident_count; 2676 if (wired) 2677 pmap->pm_stats.wired_count++; 2678 2679 validate: 2680 /* 2681 * Now validate mapping with desired protection/wiring. 2682 */ 2683 newpte = (pt_entry_t) (pa | pte_prot(pmap, prot) | PG_V); 2684 2685 if (wired) 2686 newpte |= PG_W; 2687 if (va < VM_MAX_USER_ADDRESS) 2688 newpte |= PG_U; 2689 if (pmap == &kernel_pmap) 2690 newpte |= pgeflag; 2691 2692 /* 2693 * if the mapping or permission bits are different, we need 2694 * to update the pte. 2695 */ 2696 if ((origpte & ~(PG_M|PG_A)) != newpte) { 2697 pmap_inval_interlock(&info, pmap, va); 2698 *pte = newpte | PG_A; 2699 pmap_inval_deinterlock(&info, pmap); 2700 if (newpte & PG_RW) 2701 vm_page_flag_set(m, PG_WRITEABLE); 2702 } 2703 KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED)); 2704 pmap_inval_done(&info); 2705 lwkt_reltoken(&vm_token); 2706 } 2707 2708 /* 2709 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 2710 * This code also assumes that the pmap has no pre-existing entry for this 2711 * VA. 2712 * 2713 * This code currently may only be used on user pmaps, not kernel_pmap. 2714 */ 2715 void 2716 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 2717 { 2718 pt_entry_t *pte; 2719 vm_paddr_t pa; 2720 vm_page_t mpte; 2721 vm_pindex_t ptepindex; 2722 pd_entry_t *ptepa; 2723 pmap_inval_info info; 2724 2725 lwkt_gettoken(&vm_token); 2726 pmap_inval_init(&info); 2727 2728 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 2729 kprintf("Warning: pmap_enter_quick called on UVA with kernel_pmap\n"); 2730 #ifdef DDB 2731 db_print_backtrace(); 2732 #endif 2733 } 2734 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 2735 kprintf("Warning: pmap_enter_quick called on KVA without kernel_pmap\n"); 2736 #ifdef DDB 2737 db_print_backtrace(); 2738 #endif 2739 } 2740 2741 KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */ 2742 2743 /* 2744 * Calculate the page table page (mpte), allocating it if necessary. 2745 * 2746 * A held page table page (mpte), or NULL, is passed onto the 2747 * section following. 2748 */ 2749 if (va < VM_MAX_USER_ADDRESS) { 2750 /* 2751 * Calculate pagetable page index 2752 */ 2753 ptepindex = pmap_pde_pindex(va); 2754 2755 do { 2756 /* 2757 * Get the page directory entry 2758 */ 2759 ptepa = pmap_pde(pmap, va); 2760 2761 /* 2762 * If the page table page is mapped, we just increment 2763 * the hold count, and activate it. 2764 */ 2765 if (ptepa && (*ptepa & PG_V) != 0) { 2766 if (*ptepa & PG_PS) 2767 panic("pmap_enter_quick: unexpected mapping into 2MB page"); 2768 // if (pmap->pm_ptphint && 2769 // (pmap->pm_ptphint->pindex == ptepindex)) { 2770 // mpte = pmap->pm_ptphint; 2771 // } else { 2772 mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); 2773 pmap->pm_ptphint = mpte; 2774 // } 2775 if (mpte) 2776 mpte->hold_count++; 2777 } else { 2778 mpte = _pmap_allocpte(pmap, ptepindex); 2779 } 2780 } while (mpte == NULL); 2781 } else { 2782 mpte = NULL; 2783 /* this code path is not yet used */ 2784 } 2785 2786 /* 2787 * With a valid (and held) page directory page, we can just use 2788 * vtopte() to get to the pte. 
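 * (vtopte() resolves the pte through the recursive page table slot, which
 * is presumed valid here because this path only runs on the current user
 * pmap.)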
If the pte is already present 2789 * we do not disturb it. 2790 */ 2791 pte = vtopte(va); 2792 if (*pte & PG_V) { 2793 if (mpte) 2794 pmap_unwire_pte_hold(pmap, va, mpte, &info); 2795 pa = VM_PAGE_TO_PHYS(m); 2796 KKASSERT(((*pte ^ pa) & PG_FRAME) == 0); 2797 pmap_inval_done(&info); 2798 lwkt_reltoken(&vm_token); 2799 return; 2800 } 2801 2802 /* 2803 * Enter on the PV list if part of our managed memory 2804 */ 2805 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 2806 pmap_insert_entry(pmap, va, mpte, m); 2807 vm_page_flag_set(m, PG_MAPPED); 2808 } 2809 2810 /* 2811 * Increment counters 2812 */ 2813 ++pmap->pm_stats.resident_count; 2814 2815 pa = VM_PAGE_TO_PHYS(m); 2816 2817 /* 2818 * Now validate mapping with RO protection 2819 */ 2820 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 2821 *pte = pa | PG_V | PG_U; 2822 else 2823 *pte = pa | PG_V | PG_U | PG_MANAGED; 2824 /* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */ 2825 pmap_inval_done(&info); 2826 lwkt_reltoken(&vm_token); 2827 } 2828 2829 /* 2830 * Make a temporary mapping for a physical address. This is only intended 2831 * to be used for panic dumps. 2832 * 2833 * The caller is responsible for calling smp_invltlb(). 2834 */ 2835 void * 2836 pmap_kenter_temporary(vm_paddr_t pa, long i) 2837 { 2838 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 2839 return ((void *)crashdumpmap); 2840 } 2841 2842 #define MAX_INIT_PT (96) 2843 2844 /* 2845 * This routine preloads the ptes for a given object into the specified pmap. 2846 * This eliminates the blast of soft faults on process startup and 2847 * immediately after an mmap. 2848 */ 2849 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 2850 2851 void 2852 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 2853 vm_object_t object, vm_pindex_t pindex, 2854 vm_size_t size, int limit) 2855 { 2856 struct rb_vm_page_scan_info info; 2857 struct lwp *lp; 2858 vm_size_t psize; 2859 2860 /* 2861 * We can't preinit if read access isn't set or there is no pmap 2862 * or object. 2863 */ 2864 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 2865 return; 2866 2867 /* 2868 * We can't preinit if the pmap is not the current pmap 2869 */ 2870 lp = curthread->td_lwp; 2871 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 2872 return; 2873 2874 psize = x86_64_btop(size); 2875 2876 if ((object->type != OBJT_VNODE) || 2877 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 2878 (object->resident_page_count > MAX_INIT_PT))) { 2879 return; 2880 } 2881 2882 if (psize + pindex > object->size) { 2883 if (object->size < pindex) 2884 return; 2885 psize = object->size - pindex; 2886 } 2887 2888 if (psize == 0) 2889 return; 2890 2891 /* 2892 * Use a red-black scan to traverse the requested range and load 2893 * any valid pages found into the pmap. 2894 * 2895 * We cannot safely scan the object's memq unless we are in a 2896 * critical section since interrupts can remove pages from objects. 
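 * The callback only enters pages that are fully valid and not busy, using
 * pmap_enter_quick() on each one.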
2897 */ 2898 info.start_pindex = pindex; 2899 info.end_pindex = pindex + psize - 1; 2900 info.limit = limit; 2901 info.mpte = NULL; 2902 info.addr = addr; 2903 info.pmap = pmap; 2904 2905 crit_enter(); 2906 lwkt_gettoken(&vm_token); 2907 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 2908 pmap_object_init_pt_callback, &info); 2909 lwkt_reltoken(&vm_token); 2910 crit_exit(); 2911 } 2912 2913 static 2914 int 2915 pmap_object_init_pt_callback(vm_page_t p, void *data) 2916 { 2917 struct rb_vm_page_scan_info *info = data; 2918 vm_pindex_t rel_index; 2919 /* 2920 * don't allow an madvise to blow away our really 2921 * free pages allocating pv entries. 2922 */ 2923 if ((info->limit & MAP_PREFAULT_MADVISE) && 2924 vmstats.v_free_count < vmstats.v_free_reserved) { 2925 return(-1); 2926 } 2927 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 2928 (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { 2929 vm_page_busy(p); 2930 if ((p->queue - p->pc) == PQ_CACHE) 2931 vm_page_deactivate(p); 2932 rel_index = p->pindex - info->start_pindex; 2933 pmap_enter_quick(info->pmap, 2934 info->addr + x86_64_ptob(rel_index), p); 2935 vm_page_wakeup(p); 2936 } 2937 return(0); 2938 } 2939 2940 /* 2941 * Return TRUE if the pmap is in shape to trivially 2942 * pre-fault the specified address. 2943 * 2944 * Returns FALSE if it would be non-trivial or if a 2945 * pte is already loaded into the slot. 2946 */ 2947 int 2948 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 2949 { 2950 pt_entry_t *pte; 2951 pd_entry_t *pde; 2952 int ret; 2953 2954 lwkt_gettoken(&vm_token); 2955 pde = pmap_pde(pmap, addr); 2956 if (pde == NULL || *pde == 0) { 2957 ret = 0; 2958 } else { 2959 pte = vtopte(addr); 2960 ret = (*pte) ? 0 : 1; 2961 } 2962 lwkt_reltoken(&vm_token); 2963 return(ret); 2964 } 2965 2966 /* 2967 * Routine: pmap_change_wiring 2968 * Function: Change the wiring attribute for a map/virtual-address 2969 * pair. 2970 * In/out conditions: 2971 * The mapping must already exist in the pmap. 2972 */ 2973 void 2974 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 2975 { 2976 pt_entry_t *pte; 2977 2978 if (pmap == NULL) 2979 return; 2980 2981 lwkt_gettoken(&vm_token); 2982 pte = pmap_pte(pmap, va); 2983 2984 if (wired && !pmap_pte_w(pte)) 2985 pmap->pm_stats.wired_count++; 2986 else if (!wired && pmap_pte_w(pte)) 2987 pmap->pm_stats.wired_count--; 2988 2989 /* 2990 * Wiring is not a hardware characteristic so there is no need to 2991 * invalidate TLB. However, in an SMP environment we must use 2992 * a locked bus cycle to update the pte (if we are not using 2993 * the pmap_inval_*() API that is)... it's ok to do this for simple 2994 * wiring changes. 2995 */ 2996 #ifdef SMP 2997 if (wired) 2998 atomic_set_long(pte, PG_W); 2999 else 3000 atomic_clear_long(pte, PG_W); 3001 #else 3002 if (wired) 3003 atomic_set_long_nonlocked(pte, PG_W); 3004 else 3005 atomic_clear_long_nonlocked(pte, PG_W); 3006 #endif 3007 lwkt_reltoken(&vm_token); 3008 } 3009 3010 3011 3012 /* 3013 * Copy the range specified by src_addr/len 3014 * from the source map to the range dst_addr/len 3015 * in the destination map. 3016 * 3017 * This routine is only advisory and need not do anything. 
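 *
 * On x86_64 the copy optimization is currently disabled: the function
 * returns immediately and the old i386-derived body is retained below
 * under #if 0 for reference only.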
3018 */ 3019 void 3020 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 3021 vm_size_t len, vm_offset_t src_addr) 3022 { 3023 return; 3024 #if 0 3025 pmap_inval_info info; 3026 vm_offset_t addr; 3027 vm_offset_t end_addr = src_addr + len; 3028 vm_offset_t pdnxt; 3029 pd_entry_t src_frame, dst_frame; 3030 vm_page_t m; 3031 3032 if (dst_addr != src_addr) 3033 return; 3034 #if JGPMAP32 3035 src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME; 3036 if (src_frame != (PTDpde & PG_FRAME)) { 3037 return; 3038 } 3039 3040 dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME; 3041 if (dst_frame != (APTDpde & PG_FRAME)) { 3042 APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); 3043 /* The page directory is not shared between CPUs */ 3044 cpu_invltlb(); 3045 } 3046 #endif 3047 pmap_inval_init(&info); 3048 pmap_inval_add(&info, dst_pmap, -1); 3049 pmap_inval_add(&info, src_pmap, -1); 3050 3051 /* 3052 * critical section protection is required to maintain the page/object 3053 * association, interrupts can free pages and remove them from 3054 * their objects. 3055 */ 3056 crit_enter(); 3057 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 3058 pt_entry_t *src_pte, *dst_pte; 3059 vm_page_t dstmpte, srcmpte; 3060 vm_offset_t srcptepaddr; 3061 vm_pindex_t ptepindex; 3062 3063 if (addr >= UPT_MIN_ADDRESS) 3064 panic("pmap_copy: invalid to pmap_copy page tables\n"); 3065 3066 /* 3067 * Don't let optional prefaulting of pages make us go 3068 * way below the low water mark of free pages or way 3069 * above high water mark of used pv entries. 3070 */ 3071 if (vmstats.v_free_count < vmstats.v_free_reserved || 3072 pv_entry_count > pv_entry_high_water) 3073 break; 3074 3075 pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); 3076 ptepindex = addr >> PDRSHIFT; 3077 3078 #if JGPMAP32 3079 srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; 3080 #endif 3081 if (srcptepaddr == 0) 3082 continue; 3083 3084 if (srcptepaddr & PG_PS) { 3085 #if JGPMAP32 3086 if (dst_pmap->pm_pdir[ptepindex] == 0) { 3087 dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; 3088 dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3089 } 3090 #endif 3091 continue; 3092 } 3093 3094 srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); 3095 if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || 3096 (srcmpte->flags & PG_BUSY)) { 3097 continue; 3098 } 3099 3100 if (pdnxt > end_addr) 3101 pdnxt = end_addr; 3102 3103 src_pte = vtopte(addr); 3104 #if JGPMAP32 3105 dst_pte = avtopte(addr); 3106 #endif 3107 while (addr < pdnxt) { 3108 pt_entry_t ptetemp; 3109 3110 ptetemp = *src_pte; 3111 /* 3112 * we only virtual copy managed pages 3113 */ 3114 if ((ptetemp & PG_MANAGED) != 0) { 3115 /* 3116 * We have to check after allocpte for the 3117 * pte still being around... allocpte can 3118 * block. 3119 * 3120 * pmap_allocpte() can block. If we lose 3121 * our page directory mappings we stop. 3122 */ 3123 dstmpte = pmap_allocpte(dst_pmap, addr); 3124 3125 #if JGPMAP32 3126 if (src_frame != (PTDpde & PG_FRAME) || 3127 dst_frame != (APTDpde & PG_FRAME) 3128 ) { 3129 kprintf("WARNING: pmap_copy: detected and corrected race\n"); 3130 pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); 3131 goto failed; 3132 } else if ((*dst_pte == 0) && 3133 (ptetemp = *src_pte) != 0 && 3134 (ptetemp & PG_MANAGED)) { 3135 /* 3136 * Clear the modified and 3137 * accessed (referenced) bits 3138 * during the copy. 
3139 */ 3140 m = PHYS_TO_VM_PAGE(ptetemp); 3141 *dst_pte = ptetemp & ~(PG_M | PG_A); 3142 ++dst_pmap->pm_stats.resident_count; 3143 pmap_insert_entry(dst_pmap, addr, 3144 dstmpte, m); 3145 KKASSERT(m->flags & PG_MAPPED); 3146 } else { 3147 kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n"); 3148 pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); 3149 goto failed; 3150 } 3151 #endif 3152 if (dstmpte->hold_count >= srcmpte->hold_count) 3153 break; 3154 } 3155 addr += PAGE_SIZE; 3156 src_pte++; 3157 dst_pte++; 3158 } 3159 } 3160 failed: 3161 crit_exit(); 3162 pmap_inval_done(&info); 3163 #endif 3164 } 3165 3166 /* 3167 * pmap_zero_page: 3168 * 3169 * Zero the specified physical page. 3170 * 3171 * This function may be called from an interrupt and no locking is 3172 * required. 3173 */ 3174 void 3175 pmap_zero_page(vm_paddr_t phys) 3176 { 3177 vm_offset_t va = PHYS_TO_DMAP(phys); 3178 3179 pagezero((void *)va); 3180 } 3181 3182 /* 3183 * pmap_page_assertzero: 3184 * 3185 * Assert that a page is empty, panic if it isn't. 3186 */ 3187 void 3188 pmap_page_assertzero(vm_paddr_t phys) 3189 { 3190 vm_offset_t va = PHYS_TO_DMAP(phys); 3191 size_t i; 3192 3193 for (i = 0; i < PAGE_SIZE; i += sizeof(long)) { 3194 if (*(long *)((char *)va + i) != 0) { 3195 panic("pmap_page_assertzero() @ %p not zero!\n", 3196 (void *)(intptr_t)va); 3197 } 3198 } 3199 } 3200 3201 /* 3202 * pmap_zero_page: 3203 * 3204 * Zero part of a physical page by mapping it into memory and clearing 3205 * its contents with bzero. 3206 * 3207 * off and size may not cover an area beyond a single hardware page. 3208 */ 3209 void 3210 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 3211 { 3212 vm_offset_t virt = PHYS_TO_DMAP(phys); 3213 3214 bzero((char *)virt + off, size); 3215 } 3216 3217 /* 3218 * pmap_copy_page: 3219 * 3220 * Copy the physical page from the source PA to the target PA. 3221 * This function may be called from an interrupt. No locking 3222 * is required. 3223 */ 3224 void 3225 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 3226 { 3227 vm_offset_t src_virt, dst_virt; 3228 3229 src_virt = PHYS_TO_DMAP(src); 3230 dst_virt = PHYS_TO_DMAP(dst); 3231 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 3232 } 3233 3234 /* 3235 * pmap_copy_page_frag: 3236 * 3237 * Copy the physical page from the source PA to the target PA. 3238 * This function may be called from an interrupt. No locking 3239 * is required. 3240 */ 3241 void 3242 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 3243 { 3244 vm_offset_t src_virt, dst_virt; 3245 3246 src_virt = PHYS_TO_DMAP(src); 3247 dst_virt = PHYS_TO_DMAP(dst); 3248 3249 bcopy((char *)src_virt + (src & PAGE_MASK), 3250 (char *)dst_virt + (dst & PAGE_MASK), 3251 bytes); 3252 } 3253 3254 /* 3255 * Returns true if the pmap's pv is one of the first 3256 * 16 pvs linked to from this page. This count may 3257 * be changed upwards or downwards in the future; it 3258 * is only necessary that true be returned for a small 3259 * subset of pmaps for proper page aging. 
3260 */ 3261 boolean_t 3262 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3263 { 3264 pv_entry_t pv; 3265 int loops = 0; 3266 3267 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3268 return FALSE; 3269 3270 crit_enter(); 3271 lwkt_gettoken(&vm_token); 3272 3273 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3274 if (pv->pv_pmap == pmap) { 3275 lwkt_reltoken(&vm_token); 3276 crit_exit(); 3277 return TRUE; 3278 } 3279 loops++; 3280 if (loops >= 16) 3281 break; 3282 } 3283 lwkt_reltoken(&vm_token); 3284 crit_exit(); 3285 return (FALSE); 3286 } 3287 3288 /* 3289 * Remove all pages from specified address space 3290 * this aids process exit speeds. Also, this code 3291 * is special cased for current process only, but 3292 * can have the more generic (and slightly slower) 3293 * mode enabled. This is much faster than pmap_remove 3294 * in the case of running down an entire address space. 3295 */ 3296 void 3297 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3298 { 3299 struct lwp *lp; 3300 pt_entry_t *pte, tpte; 3301 pv_entry_t pv, npv; 3302 vm_page_t m; 3303 pmap_inval_info info; 3304 int iscurrentpmap; 3305 int save_generation; 3306 3307 lp = curthread->td_lwp; 3308 if (lp && pmap == vmspace_pmap(lp->lwp_vmspace)) 3309 iscurrentpmap = 1; 3310 else 3311 iscurrentpmap = 0; 3312 3313 lwkt_gettoken(&vm_token); 3314 pmap_inval_init(&info); 3315 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 3316 if (pv->pv_va >= eva || pv->pv_va < sva) { 3317 npv = TAILQ_NEXT(pv, pv_plist); 3318 continue; 3319 } 3320 3321 KKASSERT(pmap == pv->pv_pmap); 3322 3323 if (iscurrentpmap) 3324 pte = vtopte(pv->pv_va); 3325 else 3326 pte = pmap_pte_quick(pmap, pv->pv_va); 3327 pmap_inval_interlock(&info, pmap, pv->pv_va); 3328 3329 /* 3330 * We cannot remove wired pages from a process' mapping 3331 * at this time 3332 */ 3333 if (*pte & PG_W) { 3334 pmap_inval_deinterlock(&info, pmap); 3335 npv = TAILQ_NEXT(pv, pv_plist); 3336 continue; 3337 } 3338 tpte = pte_load_clear(pte); 3339 3340 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3341 3342 KASSERT(m < &vm_page_array[vm_page_array_size], 3343 ("pmap_remove_pages: bad tpte %lx", tpte)); 3344 3345 KKASSERT(pmap->pm_stats.resident_count > 0); 3346 --pmap->pm_stats.resident_count; 3347 pmap_inval_deinterlock(&info, pmap); 3348 3349 /* 3350 * Update the vm_page_t clean and reference bits. 3351 */ 3352 if (tpte & PG_M) { 3353 vm_page_dirty(m); 3354 } 3355 3356 npv = TAILQ_NEXT(pv, pv_plist); 3357 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 3358 save_generation = ++pmap->pm_generation; 3359 3360 m->md.pv_list_count--; 3361 m->object->agg_pv_list_count--; 3362 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3363 if (TAILQ_EMPTY(&m->md.pv_list)) 3364 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3365 3366 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info); 3367 free_pv_entry(pv); 3368 3369 /* 3370 * Restart the scan if we blocked during the unuse or free 3371 * calls and other removals were made. 3372 */ 3373 if (save_generation != pmap->pm_generation) { 3374 kprintf("Warning: pmap_remove_pages race-A avoided\n"); 3375 npv = TAILQ_FIRST(&pmap->pm_pvlist); 3376 } 3377 } 3378 pmap_inval_done(&info); 3379 lwkt_reltoken(&vm_token); 3380 } 3381 3382 /* 3383 * pmap_testbit tests bits in pte's 3384 * note that the testbit/clearbit routines are inline, 3385 * and a lot of things compile-time evaluate. 
3386 */ 3387 static 3388 boolean_t 3389 pmap_testbit(vm_page_t m, int bit) 3390 { 3391 pv_entry_t pv; 3392 pt_entry_t *pte; 3393 3394 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3395 return FALSE; 3396 3397 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 3398 return FALSE; 3399 3400 crit_enter(); 3401 3402 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3403 /* 3404 * if the bit being tested is the modified bit, then 3405 * mark clean_map and ptes as never 3406 * modified. 3407 */ 3408 if (bit & (PG_A|PG_M)) { 3409 if (!pmap_track_modified(pv->pv_va)) 3410 continue; 3411 } 3412 3413 #if defined(PMAP_DIAGNOSTIC) 3414 if (pv->pv_pmap == NULL) { 3415 kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); 3416 continue; 3417 } 3418 #endif 3419 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3420 if (*pte & bit) { 3421 crit_exit(); 3422 return TRUE; 3423 } 3424 } 3425 crit_exit(); 3426 return (FALSE); 3427 } 3428 3429 /* 3430 * this routine is used to modify bits in ptes 3431 */ 3432 static __inline 3433 void 3434 pmap_clearbit(vm_page_t m, int bit) 3435 { 3436 struct pmap_inval_info info; 3437 pv_entry_t pv; 3438 pt_entry_t *pte; 3439 pt_entry_t pbits; 3440 3441 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3442 return; 3443 3444 pmap_inval_init(&info); 3445 3446 /* 3447 * Loop over all current mappings setting/clearing as appropos If 3448 * setting RO do we need to clear the VAC? 3449 */ 3450 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3451 /* 3452 * don't write protect pager mappings 3453 */ 3454 if (bit == PG_RW) { 3455 if (!pmap_track_modified(pv->pv_va)) 3456 continue; 3457 } 3458 3459 #if defined(PMAP_DIAGNOSTIC) 3460 if (pv->pv_pmap == NULL) { 3461 kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); 3462 continue; 3463 } 3464 #endif 3465 3466 /* 3467 * Careful here. We can use a locked bus instruction to 3468 * clear PG_A or PG_M safely but we need to synchronize 3469 * with the target cpus when we mess with PG_RW. 3470 * 3471 * We do not have to force synchronization when clearing 3472 * PG_M even for PTEs generated via virtual memory maps, 3473 * because the virtual kernel will invalidate the pmap 3474 * entry when/if it needs to resynchronize the Modify bit. 3475 */ 3476 if (bit & PG_RW) 3477 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); 3478 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3479 again: 3480 pbits = *pte; 3481 if (pbits & bit) { 3482 if (bit == PG_RW) { 3483 if (pbits & PG_M) { 3484 vm_page_dirty(m); 3485 atomic_clear_long(pte, PG_M|PG_RW); 3486 } else { 3487 /* 3488 * The cpu may be trying to set PG_M 3489 * simultaniously with our clearing 3490 * of PG_RW. 3491 */ 3492 if (!atomic_cmpset_long(pte, pbits, 3493 pbits & ~PG_RW)) 3494 goto again; 3495 } 3496 } else if (bit == PG_M) { 3497 /* 3498 * We could also clear PG_RW here to force 3499 * a fault on write to redetect PG_M for 3500 * virtual kernels, but it isn't necessary 3501 * since virtual kernels invalidate the pte 3502 * when they clear the VPTE_M bit in their 3503 * virtual page tables. 3504 */ 3505 atomic_clear_long(pte, PG_M); 3506 } else { 3507 atomic_clear_long(pte, bit); 3508 } 3509 } 3510 if (bit & PG_RW) 3511 pmap_inval_deinterlock(&info, pv->pv_pmap); 3512 } 3513 pmap_inval_done(&info); 3514 } 3515 3516 /* 3517 * pmap_page_protect: 3518 * 3519 * Lower the permission for all mappings to a given page. 3520 */ 3521 void 3522 pmap_page_protect(vm_page_t m, vm_prot_t prot) 3523 { 3524 /* JG NX support? 
*/ 3525 if ((prot & VM_PROT_WRITE) == 0) { 3526 lwkt_gettoken(&vm_token); 3527 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 3528 pmap_clearbit(m, PG_RW); 3529 vm_page_flag_clear(m, PG_WRITEABLE); 3530 } else { 3531 pmap_remove_all(m); 3532 } 3533 lwkt_reltoken(&vm_token); 3534 } 3535 } 3536 3537 vm_paddr_t 3538 pmap_phys_address(vm_pindex_t ppn) 3539 { 3540 return (x86_64_ptob(ppn)); 3541 } 3542 3543 /* 3544 * pmap_ts_referenced: 3545 * 3546 * Return a count of reference bits for a page, clearing those bits. 3547 * It is not necessary for every reference bit to be cleared, but it 3548 * is necessary that 0 only be returned when there are truly no 3549 * reference bits set. 3550 * 3551 * XXX: The exact number of bits to check and clear is a matter that 3552 * should be tested and standardized at some point in the future for 3553 * optimal aging of shared pages. 3554 */ 3555 int 3556 pmap_ts_referenced(vm_page_t m) 3557 { 3558 pv_entry_t pv, pvf, pvn; 3559 pt_entry_t *pte; 3560 int rtval = 0; 3561 3562 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3563 return (rtval); 3564 3565 crit_enter(); 3566 lwkt_gettoken(&vm_token); 3567 3568 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3569 3570 pvf = pv; 3571 3572 do { 3573 pvn = TAILQ_NEXT(pv, pv_list); 3574 3575 crit_enter(); 3576 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3577 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3578 crit_exit(); 3579 3580 if (!pmap_track_modified(pv->pv_va)) 3581 continue; 3582 3583 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3584 3585 if (pte && (*pte & PG_A)) { 3586 #ifdef SMP 3587 atomic_clear_long(pte, PG_A); 3588 #else 3589 atomic_clear_long_nonlocked(pte, PG_A); 3590 #endif 3591 rtval++; 3592 if (rtval > 4) { 3593 break; 3594 } 3595 } 3596 } while ((pv = pvn) != NULL && pv != pvf); 3597 } 3598 lwkt_reltoken(&vm_token); 3599 crit_exit(); 3600 3601 return (rtval); 3602 } 3603 3604 /* 3605 * pmap_is_modified: 3606 * 3607 * Return whether or not the specified physical page was modified 3608 * in any physical maps. 3609 */ 3610 boolean_t 3611 pmap_is_modified(vm_page_t m) 3612 { 3613 boolean_t res; 3614 3615 lwkt_gettoken(&vm_token); 3616 res = pmap_testbit(m, PG_M); 3617 lwkt_reltoken(&vm_token); 3618 return (res); 3619 } 3620 3621 /* 3622 * Clear the modify bits on the specified physical page. 3623 */ 3624 void 3625 pmap_clear_modify(vm_page_t m) 3626 { 3627 lwkt_gettoken(&vm_token); 3628 pmap_clearbit(m, PG_M); 3629 lwkt_reltoken(&vm_token); 3630 } 3631 3632 /* 3633 * pmap_clear_reference: 3634 * 3635 * Clear the reference bit on the specified physical page. 3636 */ 3637 void 3638 pmap_clear_reference(vm_page_t m) 3639 { 3640 lwkt_gettoken(&vm_token); 3641 pmap_clearbit(m, PG_A); 3642 lwkt_reltoken(&vm_token); 3643 } 3644 3645 /* 3646 * Miscellaneous support routines follow 3647 */ 3648 3649 static 3650 void 3651 i386_protection_init(void) 3652 { 3653 int *kp, prot; 3654 3655 /* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */ 3656 kp = protection_codes; 3657 for (prot = 0; prot < 8; prot++) { 3658 switch (prot) { 3659 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 3660 /* 3661 * Read access is also 0. There isn't any execute bit, 3662 * so just make it readable. 
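 * Without NX support only two protection codes result: 0 for any
 * read/execute combination and PG_RW whenever write access is requested.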
3663 */ 3664 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 3665 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 3666 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 3667 *kp++ = 0; 3668 break; 3669 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 3670 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 3671 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 3672 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 3673 *kp++ = PG_RW; 3674 break; 3675 } 3676 } 3677 } 3678 3679 /* 3680 * Map a set of physical memory pages into the kernel virtual 3681 * address space. Return a pointer to where it is mapped. This 3682 * routine is intended to be used for mapping device memory, 3683 * NOT real memory. 3684 * 3685 * NOTE: we can't use pgeflag unless we invalidate the pages one at 3686 * a time. 3687 */ 3688 void * 3689 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 3690 { 3691 vm_offset_t va, tmpva, offset; 3692 pt_entry_t *pte; 3693 3694 offset = pa & PAGE_MASK; 3695 size = roundup(offset + size, PAGE_SIZE); 3696 3697 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3698 if (va == 0) 3699 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3700 3701 pa = pa & ~PAGE_MASK; 3702 for (tmpva = va; size > 0;) { 3703 pte = vtopte(tmpva); 3704 *pte = pa | PG_RW | PG_V; /* | pgeflag; */ 3705 size -= PAGE_SIZE; 3706 tmpva += PAGE_SIZE; 3707 pa += PAGE_SIZE; 3708 } 3709 cpu_invltlb(); 3710 smp_invltlb(); 3711 3712 return ((void *)(va + offset)); 3713 } 3714 3715 void * 3716 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 3717 { 3718 vm_offset_t va, tmpva, offset; 3719 pt_entry_t *pte; 3720 3721 offset = pa & PAGE_MASK; 3722 size = roundup(offset + size, PAGE_SIZE); 3723 3724 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3725 if (va == 0) 3726 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3727 3728 pa = pa & ~PAGE_MASK; 3729 for (tmpva = va; size > 0;) { 3730 pte = vtopte(tmpva); 3731 *pte = pa | PG_RW | PG_V | PG_N; /* | pgeflag; */ 3732 size -= PAGE_SIZE; 3733 tmpva += PAGE_SIZE; 3734 pa += PAGE_SIZE; 3735 } 3736 cpu_invltlb(); 3737 smp_invltlb(); 3738 3739 return ((void *)(va + offset)); 3740 } 3741 3742 void 3743 pmap_unmapdev(vm_offset_t va, vm_size_t size) 3744 { 3745 vm_offset_t base, offset; 3746 3747 base = va & ~PAGE_MASK; 3748 offset = va & PAGE_MASK; 3749 size = roundup(offset + size, PAGE_SIZE); 3750 pmap_qremove(va, size >> PAGE_SHIFT); 3751 kmem_free(&kernel_map, base, size); 3752 } 3753 3754 /* 3755 * perform the pmap work for mincore 3756 */ 3757 int 3758 pmap_mincore(pmap_t pmap, vm_offset_t addr) 3759 { 3760 pt_entry_t *ptep, pte; 3761 vm_page_t m; 3762 int val = 0; 3763 3764 lwkt_gettoken(&vm_token); 3765 ptep = pmap_pte(pmap, addr); 3766 3767 if (ptep && (pte = *ptep) != 0) { 3768 vm_offset_t pa; 3769 3770 val = MINCORE_INCORE; 3771 if ((pte & PG_MANAGED) == 0) 3772 goto done; 3773 3774 pa = pte & PG_FRAME; 3775 3776 m = PHYS_TO_VM_PAGE(pa); 3777 3778 /* 3779 * Modified by us 3780 */ 3781 if (pte & PG_M) 3782 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 3783 /* 3784 * Modified by someone 3785 */ 3786 else if (m->dirty || pmap_is_modified(m)) 3787 val |= MINCORE_MODIFIED_OTHER; 3788 /* 3789 * Referenced by us 3790 */ 3791 if (pte & PG_A) 3792 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 3793 3794 /* 3795 * Referenced by someone 3796 */ 3797 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { 3798 val |= MINCORE_REFERENCED_OTHER; 3799 vm_page_flag_set(m, PG_REFERENCED); 3800 } 3801 } 3802 done: 3803 
lwkt_reltoken(&vm_token); 3804 return val; 3805 } 3806 3807 /* 3808 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 3809 * vmspace will be ref'd and the old one will be deref'd. 3810 * 3811 * The vmspace for all lwps associated with the process will be adjusted 3812 * and cr3 will be reloaded if any lwp is the current lwp. 3813 */ 3814 void 3815 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 3816 { 3817 struct vmspace *oldvm; 3818 struct lwp *lp; 3819 3820 crit_enter(); 3821 oldvm = p->p_vmspace; 3822 if (oldvm != newvm) { 3823 p->p_vmspace = newvm; 3824 KKASSERT(p->p_nthreads == 1); 3825 lp = RB_ROOT(&p->p_lwp_tree); 3826 pmap_setlwpvm(lp, newvm); 3827 if (adjrefs) { 3828 sysref_get(&newvm->vm_sysref); 3829 sysref_put(&oldvm->vm_sysref); 3830 } 3831 } 3832 crit_exit(); 3833 } 3834 3835 /* 3836 * Set the vmspace for a LWP. The vmspace is almost universally set the 3837 * same as the process vmspace, but virtual kernels need to swap out contexts 3838 * on a per-lwp basis. 3839 */ 3840 void 3841 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 3842 { 3843 struct vmspace *oldvm; 3844 struct pmap *pmap; 3845 3846 crit_enter(); 3847 oldvm = lp->lwp_vmspace; 3848 3849 if (oldvm != newvm) { 3850 lp->lwp_vmspace = newvm; 3851 if (curthread->td_lwp == lp) { 3852 pmap = vmspace_pmap(newvm); 3853 #if defined(SMP) 3854 atomic_set_cpumask(&pmap->pm_active, mycpu->gd_cpumask); 3855 if (pmap->pm_active & CPUMASK_LOCK) 3856 pmap_interlock_wait(newvm); 3857 #else 3858 pmap->pm_active |= 1; 3859 #endif 3860 #if defined(SWTCH_OPTIM_STATS) 3861 tlb_flush_count++; 3862 #endif 3863 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 3864 curthread->td_pcb->pcb_cr3 |= PG_RW | PG_U | PG_V; 3865 load_cr3(curthread->td_pcb->pcb_cr3); 3866 pmap = vmspace_pmap(oldvm); 3867 #if defined(SMP) 3868 atomic_clear_cpumask(&pmap->pm_active, mycpu->gd_cpumask); 3869 #else 3870 pmap->pm_active &= ~(cpumask_t)1; 3871 #endif 3872 } 3873 } 3874 crit_exit(); 3875 } 3876 3877 #ifdef SMP 3878 3879 /* 3880 * Called when switching to a locked pmap 3881 */ 3882 void 3883 pmap_interlock_wait(struct vmspace *vm) 3884 { 3885 struct pmap *pmap = &vm->vm_pmap; 3886 3887 if (pmap->pm_active & CPUMASK_LOCK) { 3888 DEBUG_PUSH_INFO("pmap_interlock_wait"); 3889 while (pmap->pm_active & CPUMASK_LOCK) { 3890 cpu_pause(); 3891 cpu_ccfence(); 3892 lwkt_process_ipiq(); 3893 } 3894 DEBUG_POP_INFO(); 3895 } 3896 } 3897 3898 #endif 3899 3900 vm_offset_t 3901 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 3902 { 3903 3904 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 3905 return addr; 3906 } 3907 3908 addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); 3909 return addr; 3910 } 3911 3912 /* 3913 * Used by kmalloc/kfree, page already exists at va 3914 */ 3915 vm_page_t 3916 pmap_kvtom(vm_offset_t va) 3917 { 3918 return(PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME)); 3919 } 3920