1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * Copyright (c) 1994 John S. Dyson 6 * Copyright (c) 1994 David Greenman 7 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 8 * Copyright (c) 2014-2016 Svatopluk Kraus <skra@FreeBSD.org> 9 * Copyright (c) 2014-2016 Michal Meloun <mmel@FreeBSD.org> 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 41 */ 42 /*- 43 * Copyright (c) 2003 Networks Associates Technology, Inc. 44 * All rights reserved. 45 * 46 * This software was developed for the FreeBSD Project by Jake Burkholder, 47 * Safeport Network Services, and Network Associates Laboratories, the 48 * Security Research Division of Network Associates, Inc. under 49 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 50 * CHATS research program. 51 * 52 * Redistribution and use in source and binary forms, with or without 53 * modification, are permitted provided that the following conditions 54 * are met: 55 * 1. Redistributions of source code must retain the above copyright 56 * notice, this list of conditions and the following disclaimer. 57 * 2. Redistributions in binary form must reproduce the above copyright 58 * notice, this list of conditions and the following disclaimer in the 59 * documentation and/or other materials provided with the distribution. 60 * 61 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 64 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidation or protection-reduction
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and of when physical maps must be made correct.
 */

#include "opt_vm.h"
#include "opt_pmap.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/sched.h>
#include <sys/sysctl.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <machine/physmem.h>

#include <vm/vm.h>
#include <vm/uma.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
#include <vm/vm_extern.h>
#include <vm/vm_reserv.h>
#include <sys/mutex.h>

#include <machine/md_var.h>
#include <machine/pmap_var.h>
#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/sf_buf.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifndef PMAP_SHPGPERPROC
#define	PMAP_SHPGPERPROC	200
#endif

#ifndef DIAGNOSTIC
#define	PMAP_INLINE	__inline
#else
#define	PMAP_INLINE
#endif

#ifdef PMAP_DEBUG
static void pmap_zero_page_check(vm_page_t m);
void pmap_debug(int level);
int pmap_pid_dump(int pid);

#define	PDEBUG(_lev_,_stat_)						\
	if (pmap_debug_level >= (_lev_))				\
		((_stat_))
#define	dprintf printf
int pmap_debug_level = 1;
#else	/* PMAP_DEBUG */
#define	PDEBUG(_lev_,_stat_)	/* Nothing */
#define	dprintf(x, arg...)
#endif	/* PMAP_DEBUG */

/*
 * Level 2 page tables map definition ('max' is excluded).
 */

#define	PT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define	PT2V_MAX_ADDRESS	((vm_offset_t)PT2MAP + PT2MAP_SIZE)

#define	UPT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define	UPT2V_MAX_ADDRESS \
    ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT)))

/*
 * Promotion to a 1MB (PTE1) page mapping requires that the corresponding
 * 4KB (PTE2) page mappings have identical settings for the following fields:
 */
#define	PTE2_PROMOTE	(PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG |	\
			 PTE2_NX | PTE2_RO | PTE2_U | PTE2_W |		\
			 PTE2_ATTR_MASK)

#define	PTE1_PROMOTE	(PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG |	\
			 PTE1_NX | PTE1_RO | PTE1_U | PTE1_W |		\
			 PTE1_ATTR_MASK)

#define	ATTR_TO_L1(l2_attr)	((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \
				 (((l2_attr) & L2_C) ? L1_S_C : 0) |	\
				 (((l2_attr) & L2_B) ? L1_S_B : 0) |	\
				 (((l2_attr) & PTE2_A) ? PTE1_A : 0) |	\
				 (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \
				 (((l2_attr) & PTE2_S) ? PTE1_S : 0) |	\
				 (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \
				 (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \
				 (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \
				 (((l2_attr) & PTE2_U) ? PTE1_U : 0) |	\
				 (((l2_attr) & PTE2_W) ? PTE1_W : 0))

#define	ATTR_TO_L2(l1_attr)	((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \
				 (((l1_attr) & L1_S_C) ? L2_C : 0) |	\
				 (((l1_attr) & L1_S_B) ? L2_B : 0) |	\
				 (((l1_attr) & PTE1_A) ? PTE2_A : 0) |	\
				 (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \
				 (((l1_attr) & PTE1_S) ? PTE2_S : 0) |	\
				 (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \
				 (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \
				 (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \
				 (((l1_attr) & PTE1_U) ? PTE2_U : 0) |	\
				 (((l1_attr) & PTE1_W) ? PTE2_W : 0))

/*
 * PTE2 descriptors creation macros.
 */
#define	PTE2_ATTR_DEFAULT	vm_memattr_to_pte2(VM_MEMATTR_DEFAULT)
#define	PTE2_ATTR_PT		vm_memattr_to_pte2(pt_memattr)

#define	PTE2_KPT(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT)
#define	PTE2_KPT_NG(pa)	PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT)

#define	PTE2_KRW(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT)
#define	PTE2_KRO(pa)	PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT)

#define	PV_STATS
#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#else
#define	PV_STAT(x)	do { } while (0)
#endif

/*
 * The boot_pt1 is used temporarily in the very early boot stage as the L1
 * page table.  We can init many things with no memory allocation thanks to
 * its static allocation and this brings two main advantages:
 * (1) other cores can be started very simply,
 * (2) various boot loaders can be supported as their arguments can be
 *     processed in virtual address space and can be moved to a safe
 *     location before the first allocation happens.
 * The only disadvantage is that boot_pt1 is used only in this very early
 * boot stage.  However, the table is uninitialized and so lies in bss;
 * therefore the kernel image size is not affected.
 *
 * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and
 *      CPU suspend/resume game.
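 *
 * Conceptually (an illustrative sketch only; the real definition is in the
 * early assembly boot code, not here), the static allocation amounts to:
 *
 *	pt1_entry_t boot_pt1[NPTE1_IN_PT1] __aligned(NB_IN_PT1);  (in .bss)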
250 */ 251 extern pt1_entry_t boot_pt1[]; 252 253 vm_paddr_t base_pt1; 254 pt1_entry_t *kern_pt1; 255 pt2_entry_t *kern_pt2tab; 256 pt2_entry_t *PT2MAP; 257 258 static uint32_t ttb_flags; 259 static vm_memattr_t pt_memattr; 260 ttb_entry_t pmap_kern_ttb; 261 262 struct pmap kernel_pmap_store; 263 LIST_HEAD(pmaplist, pmap); 264 static struct pmaplist allpmaps; 265 static struct mtx allpmaps_lock; 266 267 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 268 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 269 270 static vm_offset_t kernel_vm_end_new; 271 vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; 272 vm_offset_t vm_max_kernel_address; 273 vm_paddr_t kernel_l1pa; 274 275 static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; 276 277 /* 278 * Data for the pv entry allocation mechanism 279 */ 280 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 281 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 282 static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ 283 static int shpgperproc = PMAP_SHPGPERPROC; 284 285 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 286 int pv_maxchunks; /* How many chunks we have KVA for */ 287 vm_offset_t pv_vafree; /* freelist stored in the PTE */ 288 289 vm_paddr_t first_managed_pa; 290 #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) 291 292 /* 293 * All those kernel PT submaps that BSD is so fond of 294 */ 295 caddr_t _tmppt = 0; 296 297 /* 298 * Crashdump maps. 299 */ 300 static caddr_t crashdumpmap; 301 302 static pt2_entry_t *PMAP1 = NULL, *PMAP2; 303 static pt2_entry_t *PADDR1 = NULL, *PADDR2; 304 #ifdef DDB 305 static pt2_entry_t *PMAP3; 306 static pt2_entry_t *PADDR3; 307 static int PMAP3cpu __unused; /* for SMP only */ 308 #endif 309 #ifdef SMP 310 static int PMAP1cpu; 311 static int PMAP1changedcpu; 312 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 313 &PMAP1changedcpu, 0, 314 "Number of times pmap_pte2_quick changed CPU with same PMAP1"); 315 #endif 316 static int PMAP1changed; 317 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 318 &PMAP1changed, 0, 319 "Number of times pmap_pte2_quick changed PMAP1"); 320 static int PMAP1unchanged; 321 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 322 &PMAP1unchanged, 0, 323 "Number of times pmap_pte2_quick didn't change PMAP1"); 324 static struct mtx PMAP2mutex; 325 326 static __inline void pt2_wirecount_init(vm_page_t m); 327 static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, 328 vm_offset_t va); 329 void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); 330 331 /* 332 * Function to set the debug level of the pmap code. 333 */ 334 #ifdef PMAP_DEBUG 335 void 336 pmap_debug(int level) 337 { 338 339 pmap_debug_level = level; 340 dprintf("pmap_debug: level=%d\n", pmap_debug_level); 341 } 342 #endif /* PMAP_DEBUG */ 343 344 /* 345 * This table must corespond with memory attribute configuration in vm.h. 346 * First entry is used for normal system mapping. 347 * 348 * Device memory is always marked as shared. 349 * Normal memory is shared only in SMP . 350 * Not outer shareable bits are not used yet. 351 * Class 6 cannot be used on ARM11. 
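 *
 * As an illustration of the encoding used below (derived from the TEXDEF_*
 * shifts and masks that follow, not a normative description), each
 * tex_class[] entry packs:
 *
 *	bits [1:0] ... memory type (PRRR_*)
 *	bits [3:2] ... inner cache mode (NMRR_*)
 *	bits [5:4] ... outer cache mode (NMRR_*)
 *	bit  [6]   ... not outer shareable (NOS)
 *
 * so, e.g., class 0 (ATTR_WB_WA) is TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0).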
352 */ 353 #define TEXDEF_TYPE_SHIFT 0 354 #define TEXDEF_TYPE_MASK 0x3 355 #define TEXDEF_INNER_SHIFT 2 356 #define TEXDEF_INNER_MASK 0x3 357 #define TEXDEF_OUTER_SHIFT 4 358 #define TEXDEF_OUTER_MASK 0x3 359 #define TEXDEF_NOS_SHIFT 6 360 #define TEXDEF_NOS_MASK 0x1 361 362 #define TEX(t, i, o, s) \ 363 ((t) << TEXDEF_TYPE_SHIFT) | \ 364 ((i) << TEXDEF_INNER_SHIFT) | \ 365 ((o) << TEXDEF_OUTER_SHIFT | \ 366 ((s) << TEXDEF_NOS_SHIFT)) 367 368 static uint32_t tex_class[8] = { 369 /* type inner cache outer cache */ 370 TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ 371 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ 372 TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ 373 TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ 374 TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ 375 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ 376 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ 377 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ 378 }; 379 #undef TEX 380 381 static uint32_t pte2_attr_tab[8] = { 382 PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ 383 PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ 384 PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ 385 PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ 386 PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 387 0, /* 5 - NOT USED YET */ 388 0, /* 6 - NOT USED YET */ 389 0 /* 7 - NOT USED YET */ 390 }; 391 CTASSERT(VM_MEMATTR_WB_WA == 0); 392 CTASSERT(VM_MEMATTR_NOCACHE == 1); 393 CTASSERT(VM_MEMATTR_DEVICE == 2); 394 CTASSERT(VM_MEMATTR_SO == 3); 395 CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); 396 397 static inline uint32_t 398 vm_memattr_to_pte2(vm_memattr_t ma) 399 { 400 401 KASSERT((u_int)ma < 5, ("%s: bad vm_memattr_t %d", __func__, ma)); 402 return (pte2_attr_tab[(u_int)ma]); 403 } 404 405 static inline uint32_t 406 vm_page_pte2_attr(vm_page_t m) 407 { 408 409 return (vm_memattr_to_pte2(m->md.pat_mode)); 410 } 411 412 /* 413 * Convert TEX definition entry to TTB flags. 414 */ 415 static uint32_t 416 encode_ttb_flags(int idx) 417 { 418 uint32_t inner, outer, nos, reg; 419 420 inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & 421 TEXDEF_INNER_MASK; 422 outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & 423 TEXDEF_OUTER_MASK; 424 nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & 425 TEXDEF_NOS_MASK; 426 427 reg = nos << 5; 428 reg |= outer << 3; 429 if (cpuinfo.coherent_walk) 430 reg |= (inner & 0x1) << 6; 431 reg |= (inner & 0x2) >> 1; 432 #ifdef SMP 433 ARM_SMP_UP( 434 reg |= 1 << 1, 435 ); 436 #endif 437 return reg; 438 } 439 440 /* 441 * Set TEX remapping registers in current CPU. 442 */ 443 void 444 pmap_set_tex(void) 445 { 446 uint32_t prrr, nmrr; 447 uint32_t type, inner, outer, nos; 448 int i; 449 450 #ifdef PMAP_PTE_NOCACHE 451 /* XXX fixme */ 452 if (cpuinfo.coherent_walk) { 453 pt_memattr = VM_MEMATTR_WB_WA; 454 ttb_flags = encode_ttb_flags(0); 455 } 456 else { 457 pt_memattr = VM_MEMATTR_NOCACHE; 458 ttb_flags = encode_ttb_flags(1); 459 } 460 #else 461 pt_memattr = VM_MEMATTR_WB_WA; 462 ttb_flags = encode_ttb_flags(0); 463 #endif 464 465 prrr = 0; 466 nmrr = 0; 467 468 /* Build remapping register from TEX classes. 
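	 * The packing mirrors the shifts used in the loop below (see the ARM
	 * PRRR/NMRR definitions for the architectural meaning): for TEX
	 * class 'i',
	 *
	 *	PRRR[2i+1:2i]     = memory type
	 *	PRRR[24+i]        = NOS bit
	 *	NMRR[2i+1:2i]     = inner cache mode
	 *	NMRR[2i+17:2i+16] = outer cache mode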
*/ 469 for (i = 0; i < 8; i++) { 470 type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & 471 TEXDEF_TYPE_MASK; 472 inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & 473 TEXDEF_INNER_MASK; 474 outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & 475 TEXDEF_OUTER_MASK; 476 nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & 477 TEXDEF_NOS_MASK; 478 479 prrr |= type << (i * 2); 480 prrr |= nos << (i + 24); 481 nmrr |= inner << (i * 2); 482 nmrr |= outer << (i * 2 + 16); 483 } 484 /* Add shareable bits for device memory. */ 485 prrr |= PRRR_DS0 | PRRR_DS1; 486 487 /* Add shareable bits for normal memory in SMP case. */ 488 #ifdef SMP 489 ARM_SMP_UP( 490 prrr |= PRRR_NS1, 491 ); 492 #endif 493 cp15_prrr_set(prrr); 494 cp15_nmrr_set(nmrr); 495 496 /* Caches are disabled, so full TLB flush should be enough. */ 497 tlb_flush_all_local(); 498 } 499 500 /* 501 * Remap one vm_meattr class to another one. This can be useful as 502 * workaround for SOC errata, e.g. if devices must be accessed using 503 * SO memory class. 504 * 505 * !!! Please note that this function is absolutely last resort thing. 506 * It should not be used under normal circumstances. !!! 507 * 508 * Usage rules: 509 * - it shall be called after pmap_bootstrap_prepare() and before 510 * cpu_mp_start() (thus only on boot CPU). In practice, it's expected 511 * to be called from platform_attach() or platform_late_init(). 512 * 513 * - if remapping doesn't change caching mode, or until uncached class 514 * is remapped to any kind of cached one, then no other restriction exists. 515 * 516 * - if pmap_remap_vm_attr() changes caching mode, but both (original and 517 * remapped) remain cached, then caller is resposible for calling 518 * of dcache_wbinv_poc_all(). 519 * 520 * - remapping of any kind of cached class to uncached is not permitted. 521 */ 522 void 523 pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr) 524 { 525 int old_idx, new_idx; 526 527 /* Map VM memattrs to indexes to tex_class table. */ 528 old_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)old_attr]); 529 new_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)new_attr]); 530 531 /* Replace TEX attribute and apply it. */ 532 tex_class[old_idx] = tex_class[new_idx]; 533 pmap_set_tex(); 534 } 535 536 /* 537 * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words, 538 * KERNBASE is mapped by first L2 page table in L2 page table page. It 539 * meets same constrain due to PT2MAP being placed just under KERNBASE. 540 */ 541 CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0); 542 CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE); 543 544 /* 545 * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general. 546 * For now, anyhow, the following check must be fulfilled. 547 */ 548 CTASSERT(PAGE_SIZE == PTE2_SIZE); 549 /* 550 * We don't want to mess up MI code with all MMU and PMAP definitions, 551 * so some things, which depend on other ones, are defined independently. 552 * Now, it is time to check that we don't screw up something. 553 */ 554 CTASSERT(PDRSHIFT == PTE1_SHIFT); 555 /* 556 * Check L1 and L2 page table entries definitions consistency. 557 */ 558 CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1)); 559 CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2)); 560 /* 561 * Check L2 page tables page consistency. 562 */ 563 CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2)); 564 CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG); 565 /* 566 * Check PT2TAB consistency. 567 * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG. 
568 * This should be done without remainder. 569 */ 570 CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG)); 571 572 /* 573 * A PT2MAP magic. 574 * 575 * All level 2 page tables (PT2s) are mapped continuously and accordingly 576 * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can 577 * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page 578 * must be used together, but not necessary at once. The first PT2 in a page 579 * must map things on correctly aligned address and the others must follow 580 * in right order. 581 */ 582 #define NB_IN_PT2TAB (PT2TAB_ENTRIES * sizeof(pt2_entry_t)) 583 #define NPT2_IN_PT2TAB (NB_IN_PT2TAB / NB_IN_PT2) 584 #define NPG_IN_PT2TAB (NB_IN_PT2TAB / PAGE_SIZE) 585 586 /* 587 * Check PT2TAB consistency. 588 * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2. 589 * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE. 590 * The both should be done without remainder. 591 */ 592 CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2)); 593 CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE)); 594 /* 595 * The implementation was made general, however, with the assumption 596 * bellow in mind. In case of another value of NPG_IN_PT2TAB, 597 * the code should be once more rechecked. 598 */ 599 CTASSERT(NPG_IN_PT2TAB == 1); 600 601 /* 602 * Get offset of PT2 in a page 603 * associated with given PT1 index. 604 */ 605 static __inline u_int 606 page_pt2off(u_int pt1_idx) 607 { 608 609 return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2); 610 } 611 612 /* 613 * Get physical address of PT2 614 * associated with given PT2s page and PT1 index. 615 */ 616 static __inline vm_paddr_t 617 page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx) 618 { 619 620 return (pgpa + page_pt2off(pt1_idx)); 621 } 622 623 /* 624 * Get first entry of PT2 625 * associated with given PT2s page and PT1 index. 626 */ 627 static __inline pt2_entry_t * 628 page_pt2(vm_offset_t pgva, u_int pt1_idx) 629 { 630 631 return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx))); 632 } 633 634 /* 635 * Get virtual address of PT2s page (mapped in PT2MAP) 636 * which holds PT2 which holds entry which maps given virtual address. 
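 *
 * Illustrative note (assuming the usual ARMv6/v7 sizes, i.e. 4 KB pages,
 * 1 MB sections and thus NPT2_IN_PG == 4): one PT2s page covers
 * NPT2_IN_PG * PTE1_SIZE = 4 MB of virtual space, so the given address is
 * simply rounded down to that boundary before being looked up in PT2MAP.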
 */
static __inline vm_offset_t
pt2map_pt2pg(vm_offset_t va)
{

	va &= ~(NPT2_IN_PG * PTE1_SIZE - 1);
	return ((vm_offset_t)pt2map_entry(va));
}

/*****************************************************************************
 *
 * THREE pmap initialization milestones exist:
 *
 * locore.S
 *    -> fundamental init (including MMU) in ASM
 *
 * initarm()
 *    -> fundamental init continues in C
 *    -> first available physical address is known
 *
 * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins)
 *    -> basic (safe) interface for physical address allocation is made
 *    -> basic (safe) interface for virtual mapping is made
 *    -> limited (not SMP coherent) work is possible
 *
 *    -> more fundamental init continues in C
 *    -> locks and some more things are available
 *    -> all fundamental allocations and mappings are done
 *
 * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins)
 *    -> phys_avail[] and virtual_avail are set
 *    -> control is passed to the vm subsystem
 *    -> physical and virtual address allocation are off limits
 *    -> low level mapping functions, some SMP coherent,
 *       are available, which cannot be used before the vm subsystem
 *       is initialized
 *
 * mi_startup()
 *    -> vm subsystem is being initialized
 *
 * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins)
 *    -> pmap is fully initialized
 *
 *****************************************************************************/

/*****************************************************************************
 *
 * PMAP first stage initialization and utility functions
 * for pre-bootstrap epoch.
 *
 * After pmap_bootstrap_prepare() is called, the following functions
 * can be used:
 *
 * (1) functions for physical page allocation, virtual space allocation,
 *     and mapping, strictly for this stage only:
 *
 * vm_paddr_t pmap_preboot_get_pages(u_int num);
 * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num);
 * vm_offset_t pmap_preboot_reserve_pages(u_int num);
 * vm_offset_t pmap_preboot_get_vpages(u_int num);
 * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size,
 *     vm_prot_t prot, vm_memattr_t attr);
 *
 * (2) for all stages:
 *
 * vm_paddr_t pmap_kextract(vm_offset_t va);
 *
 * NOTE: This stage is not SMP coherent.
 *
 *****************************************************************************/

#define	KERNEL_P2V(pa) \
    ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR))
#define	KERNEL_V2P(va) \
    ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr))

static vm_paddr_t last_paddr;

/*
 * Pre-bootstrap epoch page allocator.
 */
vm_paddr_t
pmap_preboot_get_pages(u_int num)
{
	vm_paddr_t ret;

	ret = last_paddr;
	last_paddr += num * PAGE_SIZE;

	return (ret);
}

/*
 * The fundamental initialization of PMAP stuff.
 *
 * Some things already happened in locore.S and some things could happen
 * before pmap_bootstrap_prepare() is called, so let's recall what is done:
 * 1. Caches are disabled.
 * 2. We are running on virtual addresses already with 'boot_pt1'
 *    as L1 page table.
 * 3. So far, all virtual addresses can be converted to physical ones and
 *    vice versa by the following macros:
 *      KERNEL_P2V(pa) .... physical to virtual ones,
 *      KERNEL_V2P(va) ....
virtual to physical ones. 741 * 742 * What is done herein: 743 * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. 744 * 2. PT2MAP magic is brought to live. 745 * 3. Basic preboot functions for page allocations and mappings can be used. 746 * 4. Everything is prepared for L1 cache enabling. 747 * 748 * Variations: 749 * 1. To use second TTB register, so kernel and users page tables will be 750 * separated. This way process forking - pmap_pinit() - could be faster, 751 * it saves physical pages and KVA per a process, and it's simple change. 752 * However, it will lead, due to hardware matter, to the following: 753 * (a) 2G space for kernel and 2G space for users. 754 * (b) 1G space for kernel in low addresses and 3G for users above it. 755 * A question is: Is the case (b) really an option? Note that case (b) 756 * does save neither physical memory and KVA. 757 */ 758 void 759 pmap_bootstrap_prepare(vm_paddr_t last) 760 { 761 vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; 762 vm_offset_t pt2pg_va; 763 pt1_entry_t *pte1p; 764 pt2_entry_t *pte2p; 765 u_int i; 766 uint32_t l1_attr; 767 768 /* 769 * Now, we are going to make real kernel mapping. Note that we are 770 * already running on some mapping made in locore.S and we expect 771 * that it's large enough to ensure nofault access to physical memory 772 * allocated herein before switch. 773 * 774 * As kernel image and everything needed before are and will be mapped 775 * by section mappings, we align last physical address to PTE1_SIZE. 776 */ 777 last_paddr = pte1_roundup(last); 778 779 /* 780 * Allocate and zero page(s) for kernel L1 page table. 781 * 782 * Note that it's first allocation on space which was PTE1_SIZE 783 * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. 784 */ 785 base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); 786 kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); 787 bzero((void*)kern_pt1, NB_IN_PT1); 788 pte1_sync_range(kern_pt1, NB_IN_PT1); 789 790 /* Allocate and zero page(s) for kernel PT2TAB. */ 791 pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); 792 kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); 793 bzero(kern_pt2tab, NB_IN_PT2TAB); 794 pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); 795 796 /* Allocate and zero page(s) for kernel L2 page tables. */ 797 pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); 798 pt2pg_va = KERNEL_P2V(pt2pg_pa); 799 size = NKPT2PG * PAGE_SIZE; 800 bzero((void*)pt2pg_va, size); 801 pte2_sync_range((pt2_entry_t *)pt2pg_va, size); 802 803 /* 804 * Add a physical memory segment (vm_phys_seg) corresponding to the 805 * preallocated pages for kernel L2 page tables so that vm_page 806 * structures representing these pages will be created. The vm_page 807 * structures are required for promotion of the corresponding kernel 808 * virtual addresses to section mappings. 809 */ 810 vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); 811 812 /* 813 * Insert allocated L2 page table pages to PT2TAB and make 814 * link to all PT2s in L1 page table. See how kernel_vm_end 815 * is initialized. 816 * 817 * We play simple and safe. So every KVA will have underlaying 818 * L2 page table, even kernel image mapped by sections. 819 */ 820 pte2p = kern_pt2tab_entry(KERNBASE); 821 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) 822 pt2tab_store(pte2p++, PTE2_KPT(pa)); 823 824 pte1p = kern_pte1(KERNBASE); 825 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) 826 pte1_store(pte1p++, PTE1_LINK(pa)); 827 828 /* Make section mappings for kernel. 
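	 * These PTE1_KERN section entries overwrite the PTE1_LINK entries
	 * installed just above for the kernel image range.  The backing PT2s
	 * remain reachable through PT2TAB (see the "simple and safe" note
	 * above), so the image range still has L2 page tables behind it.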
*/ 829 l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); 830 pte1p = kern_pte1(KERNBASE); 831 for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) 832 pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); 833 834 /* 835 * Get free and aligned space for PT2MAP and make L1 page table links 836 * to L2 page tables held in PT2TAB. 837 * 838 * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t 839 * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus 840 * each entry in PT2TAB maps all PT2s in a page. This implies that 841 * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. 842 */ 843 PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); 844 pte1p = kern_pte1((vm_offset_t)PT2MAP); 845 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 846 pte1_store(pte1p++, PTE1_LINK(pa)); 847 } 848 849 /* 850 * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. 851 * Each pmap will hold own PT2TAB, so the mapping should be not global. 852 */ 853 pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); 854 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 855 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 856 } 857 858 /* 859 * Choose correct L2 page table and make mappings for allocations 860 * made herein which replaces temporary locore.S mappings after a while. 861 * Note that PT2MAP cannot be used until we switch to kern_pt1. 862 * 863 * Note, that these allocations started aligned on 1M section and 864 * kernel PT1 was allocated first. Making of mappings must follow 865 * order of physical allocations as we've used KERNEL_P2V() macro 866 * for virtual addresses resolution. 867 */ 868 pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); 869 pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); 870 871 pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); 872 873 /* Make mapping for kernel L1 page table. */ 874 for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) 875 pte2_store(pte2p++, PTE2_KPT(pa)); 876 877 /* Make mapping for kernel PT2TAB. */ 878 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) 879 pte2_store(pte2p++, PTE2_KPT(pa)); 880 881 /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ 882 pmap_kern_ttb = base_pt1 | ttb_flags; 883 cpuinfo_reinit_mmu(pmap_kern_ttb); 884 /* 885 * Initialize the first available KVA. As kernel image is mapped by 886 * sections, we are leaving some gap behind. 887 */ 888 virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; 889 } 890 891 /* 892 * Setup L2 page table page for given KVA. 893 * Used in pre-bootstrap epoch. 894 * 895 * Note that we have allocated NKPT2PG pages for L2 page tables in advance 896 * and used them for mapping KVA starting from KERNBASE. However, this is not 897 * enough. Vectors and devices need L2 page tables too. Note that they are 898 * even above VM_MAX_KERNEL_ADDRESS. 899 */ 900 static __inline vm_paddr_t 901 pmap_preboot_pt2pg_setup(vm_offset_t va) 902 { 903 pt2_entry_t *pte2p, pte2; 904 vm_paddr_t pt2pg_pa; 905 906 /* Get associated entry in PT2TAB. */ 907 pte2p = kern_pt2tab_entry(va); 908 909 /* Just return, if PT2s page exists already. */ 910 pte2 = pt2tab_load(pte2p); 911 if (pte2_is_valid(pte2)) 912 return (pte2_pa(pte2)); 913 914 KASSERT(va >= VM_MAX_KERNEL_ADDRESS, 915 ("%s: NKPT2PG too small", __func__)); 916 917 /* 918 * Allocate page for PT2s and insert it to PT2TAB. 919 * In other words, map it into PT2MAP space. 
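	 * The new page is zeroed through its PT2MAP window just below, which
	 * only works once the PT2TAB entry has been stored.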
920 */ 921 pt2pg_pa = pmap_preboot_get_pages(1); 922 pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); 923 924 /* Zero all PT2s in allocated page. */ 925 bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); 926 pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); 927 928 return (pt2pg_pa); 929 } 930 931 /* 932 * Setup L2 page table for given KVA. 933 * Used in pre-bootstrap epoch. 934 */ 935 static void 936 pmap_preboot_pt2_setup(vm_offset_t va) 937 { 938 pt1_entry_t *pte1p; 939 vm_paddr_t pt2pg_pa, pt2_pa; 940 941 /* Setup PT2's page. */ 942 pt2pg_pa = pmap_preboot_pt2pg_setup(va); 943 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); 944 945 /* Insert PT2 to PT1. */ 946 pte1p = kern_pte1(va); 947 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 948 } 949 950 /* 951 * Get L2 page entry associated with given KVA. 952 * Used in pre-bootstrap epoch. 953 */ 954 static __inline pt2_entry_t* 955 pmap_preboot_vtopte2(vm_offset_t va) 956 { 957 pt1_entry_t *pte1p; 958 959 /* Setup PT2 if needed. */ 960 pte1p = kern_pte1(va); 961 if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ 962 pmap_preboot_pt2_setup(va); 963 964 return (pt2map_entry(va)); 965 } 966 967 /* 968 * Pre-bootstrap epoch page(s) mapping(s). 969 */ 970 void 971 pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) 972 { 973 u_int i; 974 pt2_entry_t *pte2p; 975 976 /* Map all the pages. */ 977 for (i = 0; i < num; i++) { 978 pte2p = pmap_preboot_vtopte2(va); 979 pte2_store(pte2p, PTE2_KRW(pa)); 980 va += PAGE_SIZE; 981 pa += PAGE_SIZE; 982 } 983 } 984 985 /* 986 * Pre-bootstrap epoch virtual space alocator. 987 */ 988 vm_offset_t 989 pmap_preboot_reserve_pages(u_int num) 990 { 991 u_int i; 992 vm_offset_t start, va; 993 pt2_entry_t *pte2p; 994 995 /* Allocate virtual space. */ 996 start = va = virtual_avail; 997 virtual_avail += num * PAGE_SIZE; 998 999 /* Zero the mapping. */ 1000 for (i = 0; i < num; i++) { 1001 pte2p = pmap_preboot_vtopte2(va); 1002 pte2_store(pte2p, 0); 1003 va += PAGE_SIZE; 1004 } 1005 1006 return (start); 1007 } 1008 1009 /* 1010 * Pre-bootstrap epoch page(s) allocation and mapping(s). 1011 */ 1012 vm_offset_t 1013 pmap_preboot_get_vpages(u_int num) 1014 { 1015 vm_paddr_t pa; 1016 vm_offset_t va; 1017 1018 /* Allocate physical page(s). */ 1019 pa = pmap_preboot_get_pages(num); 1020 1021 /* Allocate virtual space. */ 1022 va = virtual_avail; 1023 virtual_avail += num * PAGE_SIZE; 1024 1025 /* Map and zero all. */ 1026 pmap_preboot_map_pages(pa, va, num); 1027 bzero((void *)va, num * PAGE_SIZE); 1028 1029 return (va); 1030 } 1031 1032 /* 1033 * Pre-bootstrap epoch page mapping(s) with attributes. 1034 */ 1035 void 1036 pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 1037 vm_prot_t prot, vm_memattr_t attr) 1038 { 1039 u_int num; 1040 u_int l1_attr, l1_prot, l2_prot, l2_attr; 1041 pt1_entry_t *pte1p; 1042 pt2_entry_t *pte2p; 1043 1044 l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; 1045 l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1046 l2_attr = vm_memattr_to_pte2(attr); 1047 l1_prot = ATTR_TO_L1(l2_prot); 1048 l1_attr = ATTR_TO_L1(l2_attr); 1049 1050 /* Map all the pages. 
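	 * Note that 'num' counts remaining bytes, not pages.  A 1 MB section
	 * mapping (PTE1) is used whenever both 'va' and 'pa' are PTE1-aligned
	 * and at least PTE1_SIZE bytes remain; otherwise ordinary 4 KB PTE2
	 * mappings are installed.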
*/ 1051 num = round_page(size); 1052 while (num > 0) { 1053 if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { 1054 pte1p = kern_pte1(va); 1055 pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); 1056 va += PTE1_SIZE; 1057 pa += PTE1_SIZE; 1058 num -= PTE1_SIZE; 1059 } else { 1060 pte2p = pmap_preboot_vtopte2(va); 1061 pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); 1062 va += PAGE_SIZE; 1063 pa += PAGE_SIZE; 1064 num -= PAGE_SIZE; 1065 } 1066 } 1067 } 1068 1069 /* 1070 * Extract from the kernel page table the physical address 1071 * that is mapped by the given virtual address "va". 1072 */ 1073 vm_paddr_t 1074 pmap_kextract(vm_offset_t va) 1075 { 1076 vm_paddr_t pa; 1077 pt1_entry_t pte1; 1078 pt2_entry_t pte2; 1079 1080 pte1 = pte1_load(kern_pte1(va)); 1081 if (pte1_is_section(pte1)) { 1082 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1083 } else if (pte1_is_link(pte1)) { 1084 /* 1085 * We should beware of concurrent promotion that changes 1086 * pte1 at this point. However, it's not a problem as PT2 1087 * page is preserved by promotion in PT2TAB. So even if 1088 * it happens, using of PT2MAP is still safe. 1089 * 1090 * QQQ: However, concurrent removing is a problem which 1091 * ends in abort on PT2MAP space. Locking must be used 1092 * to deal with this. 1093 */ 1094 pte2 = pte2_load(pt2map_entry(va)); 1095 pa = pte2_pa(pte2) | (va & PTE2_OFFSET); 1096 } 1097 else { 1098 panic("%s: va %#x pte1 %#x", __func__, va, pte1); 1099 } 1100 return (pa); 1101 } 1102 1103 /* 1104 * Extract from the kernel page table the physical address 1105 * that is mapped by the given virtual address "va". Also 1106 * return L2 page table entry which maps the address. 1107 * 1108 * This is only intended to be used for panic dumps. 1109 */ 1110 vm_paddr_t 1111 pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) 1112 { 1113 vm_paddr_t pa; 1114 pt1_entry_t pte1; 1115 pt2_entry_t pte2; 1116 1117 pte1 = pte1_load(kern_pte1(va)); 1118 if (pte1_is_section(pte1)) { 1119 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1120 pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; 1121 } else if (pte1_is_link(pte1)) { 1122 pte2 = pte2_load(pt2map_entry(va)); 1123 pa = pte2_pa(pte2); 1124 } else { 1125 pte2 = 0; 1126 pa = 0; 1127 } 1128 if (pte2p != NULL) 1129 *pte2p = pte2; 1130 return (pa); 1131 } 1132 1133 /***************************************************************************** 1134 * 1135 * PMAP second stage initialization and utility functions 1136 * for bootstrap epoch. 1137 * 1138 * After pmap_bootstrap() is called, the following functions for 1139 * mappings can be used: 1140 * 1141 * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); 1142 * void pmap_kremove(vm_offset_t va); 1143 * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, 1144 * int prot); 1145 * 1146 * NOTE: This is not SMP coherent stage. And physical page allocation is not 1147 * allowed during this stage. 1148 * 1149 *****************************************************************************/ 1150 1151 /* 1152 * Initialize kernel PMAP locks and lists, kernel_pmap itself, and 1153 * reserve various virtual spaces for temporary mappings. 1154 */ 1155 void 1156 pmap_bootstrap(vm_offset_t firstaddr) 1157 { 1158 pt2_entry_t *unused __unused; 1159 struct pcpu *pc; 1160 1161 /* 1162 * Initialize the kernel pmap (which is statically allocated). 
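	 * It simply points at the kern_pt1 and kern_pt2tab tables that were
	 * built in pmap_bootstrap_prepare().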
1163 */ 1164 PMAP_LOCK_INIT(kernel_pmap); 1165 kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ 1166 kernel_pmap->pm_pt1 = kern_pt1; 1167 kernel_pmap->pm_pt2tab = kern_pt2tab; 1168 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1169 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1170 1171 /* 1172 * Initialize the global pv list lock. 1173 */ 1174 rw_init(&pvh_global_lock, "pmap pv global"); 1175 1176 LIST_INIT(&allpmaps); 1177 1178 /* 1179 * Request a spin mutex so that changes to allpmaps cannot be 1180 * preempted by smp_rendezvous_cpus(). 1181 */ 1182 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 1183 mtx_lock_spin(&allpmaps_lock); 1184 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 1185 mtx_unlock_spin(&allpmaps_lock); 1186 1187 /* 1188 * Reserve some special page table entries/VA space for temporary 1189 * mapping of pages. 1190 */ 1191 #define SYSMAP(c, p, v, n) do { \ 1192 v = (c)pmap_preboot_reserve_pages(n); \ 1193 p = pt2map_entry((vm_offset_t)v); \ 1194 } while (0) 1195 1196 /* 1197 * Local CMAP1/CMAP2 are used for zeroing and copying pages. 1198 * Local CMAP2 is also used for data cache cleaning. 1199 */ 1200 pc = get_pcpu(); 1201 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1202 SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); 1203 SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); 1204 SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); 1205 1206 /* 1207 * Crashdump maps. 1208 */ 1209 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); 1210 1211 /* 1212 * _tmppt is used for reading arbitrary physical pages via /dev/mem. 1213 */ 1214 SYSMAP(caddr_t, unused, _tmppt, 1); 1215 1216 /* 1217 * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), 1218 * respectively. PADDR3 is used by pmap_pte2_ddb(). 1219 */ 1220 SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); 1221 SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); 1222 #ifdef DDB 1223 SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); 1224 #endif 1225 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 1226 1227 /* 1228 * Note that in very short time in initarm(), we are going to 1229 * initialize phys_avail[] array and no further page allocation 1230 * can happen after that until vm subsystem will be initialized. 1231 */ 1232 kernel_vm_end_new = kernel_vm_end; 1233 virtual_end = vm_max_kernel_address; 1234 } 1235 1236 static void 1237 pmap_init_reserved_pages(void) 1238 { 1239 struct pcpu *pc; 1240 vm_offset_t pages; 1241 int i; 1242 1243 CPU_FOREACH(i) { 1244 pc = pcpu_find(i); 1245 /* 1246 * Skip if the mapping has already been initialized, 1247 * i.e. this is the BSP. 1248 */ 1249 if (pc->pc_cmap1_addr != 0) 1250 continue; 1251 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1252 pages = kva_alloc(PAGE_SIZE * 3); 1253 if (pages == 0) 1254 panic("%s: unable to allocate KVA", __func__); 1255 pc->pc_cmap1_pte2p = pt2map_entry(pages); 1256 pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE); 1257 pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2)); 1258 pc->pc_cmap1_addr = (caddr_t)pages; 1259 pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE); 1260 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); 1261 } 1262 } 1263 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 1264 1265 /* 1266 * The function can already be use in second initialization stage. 1267 * As such, the function DOES NOT call pmap_growkernel() where PT2 1268 * allocation can happen. So if used, be sure that PT2 for given 1269 * virtual address is allocated already! 
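 *
 * A minimal sketch of safe use (illustrative only): reserve the VA while the
 * pre-bootstrap reservation interface is available, which sets up the PT2,
 * and map into it later:
 *
 *	va = pmap_preboot_reserve_pages(1);	(PT2 is allocated here)
 *	...
 *	pmap_kenter(va, pa);			(safe: PT2 already exists)
 *
 * Compare how crashdumpmap is reserved via SYSMAP() in pmap_bootstrap() and
 * later filled by pmap_kenter_temporary().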
1270 * 1271 * Add a wired page to the kva. 1272 * Note: not SMP coherent. 1273 */ 1274 static __inline void 1275 pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, 1276 uint32_t attr) 1277 { 1278 pt1_entry_t *pte1p; 1279 pt2_entry_t *pte2p; 1280 1281 pte1p = kern_pte1(va); 1282 if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ 1283 /* 1284 * This is a very low level function, so PT2 and particularly 1285 * PT2PG associated with given virtual address must be already 1286 * allocated. It's a pain mainly during pmap initialization 1287 * stage. However, called after pmap initialization with 1288 * virtual address not under kernel_vm_end will lead to 1289 * the same misery. 1290 */ 1291 if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) 1292 panic("%s: kernel PT2 not allocated!", __func__); 1293 } 1294 1295 pte2p = pt2map_entry(va); 1296 pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); 1297 } 1298 1299 PMAP_INLINE void 1300 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1301 { 1302 1303 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); 1304 } 1305 1306 /* 1307 * Remove a page from the kernel pagetables. 1308 * Note: not SMP coherent. 1309 */ 1310 PMAP_INLINE void 1311 pmap_kremove(vm_offset_t va) 1312 { 1313 pt1_entry_t *pte1p; 1314 pt2_entry_t *pte2p; 1315 1316 pte1p = kern_pte1(va); 1317 if (pte1_is_section(pte1_load(pte1p))) { 1318 pte1_clear(pte1p); 1319 } else { 1320 pte2p = pt2map_entry(va); 1321 pte2_clear(pte2p); 1322 } 1323 } 1324 1325 /* 1326 * Share new kernel PT2PG with all pmaps. 1327 * The caller is responsible for maintaining TLB consistency. 1328 */ 1329 static void 1330 pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) 1331 { 1332 pmap_t pmap; 1333 pt2_entry_t *pte2p; 1334 1335 mtx_lock_spin(&allpmaps_lock); 1336 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1337 pte2p = pmap_pt2tab_entry(pmap, va); 1338 pt2tab_store(pte2p, npte2); 1339 } 1340 mtx_unlock_spin(&allpmaps_lock); 1341 } 1342 1343 /* 1344 * Share new kernel PTE1 with all pmaps. 1345 * The caller is responsible for maintaining TLB consistency. 1346 */ 1347 static void 1348 pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) 1349 { 1350 pmap_t pmap; 1351 pt1_entry_t *pte1p; 1352 1353 mtx_lock_spin(&allpmaps_lock); 1354 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1355 pte1p = pmap_pte1(pmap, va); 1356 pte1_store(pte1p, npte1); 1357 } 1358 mtx_unlock_spin(&allpmaps_lock); 1359 } 1360 1361 /* 1362 * Used to map a range of physical addresses into kernel 1363 * virtual address space. 1364 * 1365 * The value passed in '*virt' is a suggested virtual address for 1366 * the mapping. Architectures which can support a direct-mapped 1367 * physical to virtual region can return the appropriate address 1368 * within that region, leaving '*virt' unchanged. Other 1369 * architectures should map the pages starting at '*virt' and 1370 * update '*virt' with the first usable address after the mapped 1371 * region. 1372 * 1373 * NOTE: Read the comments above pmap_kenter_prot_attr() as 1374 * the function is used herein! 1375 */ 1376 vm_offset_t 1377 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1378 { 1379 vm_offset_t va, sva; 1380 vm_paddr_t pte1_offset; 1381 pt1_entry_t npte1; 1382 uint32_t l1prot, l2prot; 1383 uint32_t l1attr, l2attr; 1384 1385 PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," 1386 " prot = %d\n", __func__, *virt, start, end, end - start, prot)); 1387 1388 l2prot = (prot & VM_PROT_WRITE) ? 
PTE2_AP_KRW : PTE2_AP_KR; 1389 l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1390 l1prot = ATTR_TO_L1(l2prot); 1391 1392 l2attr = PTE2_ATTR_DEFAULT; 1393 l1attr = ATTR_TO_L1(l2attr); 1394 1395 va = *virt; 1396 /* 1397 * Does the physical address range's size and alignment permit at 1398 * least one section mapping to be created? 1399 */ 1400 pte1_offset = start & PTE1_OFFSET; 1401 if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= 1402 PTE1_SIZE) { 1403 /* 1404 * Increase the starting virtual address so that its alignment 1405 * does not preclude the use of section mappings. 1406 */ 1407 if ((va & PTE1_OFFSET) < pte1_offset) 1408 va = pte1_trunc(va) + pte1_offset; 1409 else if ((va & PTE1_OFFSET) > pte1_offset) 1410 va = pte1_roundup(va) + pte1_offset; 1411 } 1412 sva = va; 1413 while (start < end) { 1414 if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { 1415 KASSERT((va & PTE1_OFFSET) == 0, 1416 ("%s: misaligned va %#x", __func__, va)); 1417 npte1 = PTE1_KERN(start, l1prot, l1attr); 1418 pmap_kenter_pte1(va, npte1); 1419 va += PTE1_SIZE; 1420 start += PTE1_SIZE; 1421 } else { 1422 pmap_kenter_prot_attr(va, start, l2prot, l2attr); 1423 va += PAGE_SIZE; 1424 start += PAGE_SIZE; 1425 } 1426 } 1427 tlb_flush_range(sva, va - sva); 1428 *virt = va; 1429 return (sva); 1430 } 1431 1432 /* 1433 * Make a temporary mapping for a physical address. 1434 * This is only intended to be used for panic dumps. 1435 */ 1436 void * 1437 pmap_kenter_temporary(vm_paddr_t pa, int i) 1438 { 1439 vm_offset_t va; 1440 1441 /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ 1442 1443 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 1444 pmap_kenter(va, pa); 1445 tlb_flush_local(va); 1446 return ((void *)crashdumpmap); 1447 } 1448 1449 1450 /************************************* 1451 * 1452 * TLB & cache maintenance routines. 1453 * 1454 *************************************/ 1455 1456 /* 1457 * We inline these within pmap.c for speed. 1458 */ 1459 PMAP_INLINE void 1460 pmap_tlb_flush(pmap_t pmap, vm_offset_t va) 1461 { 1462 1463 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1464 tlb_flush(va); 1465 } 1466 1467 PMAP_INLINE void 1468 pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) 1469 { 1470 1471 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1472 tlb_flush_range(sva, size); 1473 } 1474 1475 /* 1476 * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. 1477 * Requirements: 1478 * - Must deal with pages in order to ensure that none of the PTE2_* bits 1479 * are ever set, PTE2_V in particular. 1480 * - Assumes we can write to pte2s without pte2_store() atomic ops. 1481 * - Assumes nothing will ever test these addresses for 0 to indicate 1482 * no mapping instead of correctly checking PTE2_V. 1483 * - Assumes a vm_offset_t will fit in a pte2 (true for arm). 1484 * Because PTE2_V is never set, there can be no mappings to invalidate. 
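 *
 * Illustrative picture of the resulting freelist (each free kva's pte2 slot
 * holds the next free kva, never a valid mapping):
 *
 *	*head -> kvaA,  pte2 slot of kvaA -> kvaB,  pte2 slot of kvaB -> 0
 *
 * pmap_pte2list_alloc() pops from the head and pmap_pte2list_free() pushes
 * onto it; pmap_pte2list_init() builds the initial list from a KVA block.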
1485 */ 1486 static vm_offset_t 1487 pmap_pte2list_alloc(vm_offset_t *head) 1488 { 1489 pt2_entry_t *pte2p; 1490 vm_offset_t va; 1491 1492 va = *head; 1493 if (va == 0) 1494 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 1495 pte2p = pt2map_entry(va); 1496 *head = *pte2p; 1497 if (*head & PTE2_V) 1498 panic("%s: va with PTE2_V set!", __func__); 1499 *pte2p = 0; 1500 return (va); 1501 } 1502 1503 static void 1504 pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) 1505 { 1506 pt2_entry_t *pte2p; 1507 1508 if (va & PTE2_V) 1509 panic("%s: freeing va with PTE2_V set!", __func__); 1510 pte2p = pt2map_entry(va); 1511 *pte2p = *head; /* virtual! PTE2_V is 0 though */ 1512 *head = va; 1513 } 1514 1515 static void 1516 pmap_pte2list_init(vm_offset_t *head, void *base, int npages) 1517 { 1518 int i; 1519 vm_offset_t va; 1520 1521 *head = 0; 1522 for (i = npages - 1; i >= 0; i--) { 1523 va = (vm_offset_t)base + i * PAGE_SIZE; 1524 pmap_pte2list_free(head, va); 1525 } 1526 } 1527 1528 /***************************************************************************** 1529 * 1530 * PMAP third and final stage initialization. 1531 * 1532 * After pmap_init() is called, PMAP subsystem is fully initialized. 1533 * 1534 *****************************************************************************/ 1535 1536 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 1537 1538 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 1539 "Max number of PV entries"); 1540 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 1541 "Page share factor per proc"); 1542 1543 static u_long nkpt2pg = NKPT2PG; 1544 SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, 1545 &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); 1546 1547 static int sp_enabled = 1; 1548 SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 1549 &sp_enabled, 0, "Are large page mappings enabled?"); 1550 1551 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD, 0, 1552 "1MB page mapping counters"); 1553 1554 static u_long pmap_pte1_demotions; 1555 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, 1556 &pmap_pte1_demotions, 0, "1MB page demotions"); 1557 1558 static u_long pmap_pte1_mappings; 1559 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, 1560 &pmap_pte1_mappings, 0, "1MB page mappings"); 1561 1562 static u_long pmap_pte1_p_failures; 1563 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, 1564 &pmap_pte1_p_failures, 0, "1MB page promotion failures"); 1565 1566 static u_long pmap_pte1_promotions; 1567 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, 1568 &pmap_pte1_promotions, 0, "1MB page promotions"); 1569 1570 static u_long pmap_pte1_kern_demotions; 1571 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, 1572 &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); 1573 1574 static u_long pmap_pte1_kern_promotions; 1575 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, 1576 &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); 1577 1578 static __inline ttb_entry_t 1579 pmap_ttb_get(pmap_t pmap) 1580 { 1581 1582 return (vtophys(pmap->pm_pt1) | ttb_flags); 1583 } 1584 1585 /* 1586 * Initialize a vm_page's machine-dependent fields. 1587 * 1588 * Variations: 1589 * 1. Pages for L2 page tables are always not managed. So, pv_list and 1590 * pt2_wirecount can share same physical space. 
However, proper 1591 * initialization on a page alloc for page tables and reinitialization 1592 * on the page free must be ensured. 1593 */ 1594 void 1595 pmap_page_init(vm_page_t m) 1596 { 1597 1598 TAILQ_INIT(&m->md.pv_list); 1599 pt2_wirecount_init(m); 1600 m->md.pat_mode = VM_MEMATTR_DEFAULT; 1601 } 1602 1603 /* 1604 * Virtualization for faster way how to zero whole page. 1605 */ 1606 static __inline void 1607 pagezero(void *page) 1608 { 1609 1610 bzero(page, PAGE_SIZE); 1611 } 1612 1613 /* 1614 * Zero L2 page table page. 1615 * Use same KVA as in pmap_zero_page(). 1616 */ 1617 static __inline vm_paddr_t 1618 pmap_pt2pg_zero(vm_page_t m) 1619 { 1620 pt2_entry_t *cmap2_pte2p; 1621 vm_paddr_t pa; 1622 struct pcpu *pc; 1623 1624 pa = VM_PAGE_TO_PHYS(m); 1625 1626 /* 1627 * XXX: For now, we map whole page even if it's already zero, 1628 * to sync it even if the sync is only DSB. 1629 */ 1630 sched_pin(); 1631 pc = get_pcpu(); 1632 cmap2_pte2p = pc->pc_cmap2_pte2p; 1633 mtx_lock(&pc->pc_cmap_lock); 1634 if (pte2_load(cmap2_pte2p) != 0) 1635 panic("%s: CMAP2 busy", __func__); 1636 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 1637 vm_page_pte2_attr(m))); 1638 /* Even VM_ALLOC_ZERO request is only advisory. */ 1639 if ((m->flags & PG_ZERO) == 0) 1640 pagezero(pc->pc_cmap2_addr); 1641 pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); 1642 pte2_clear(cmap2_pte2p); 1643 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 1644 1645 /* 1646 * Unpin the thread before releasing the lock. Otherwise the thread 1647 * could be rescheduled while still bound to the current CPU, only 1648 * to unpin itself immediately upon resuming execution. 1649 */ 1650 sched_unpin(); 1651 mtx_unlock(&pc->pc_cmap_lock); 1652 1653 return (pa); 1654 } 1655 1656 /* 1657 * Init just allocated page as L2 page table(s) holder 1658 * and return its physical address. 1659 */ 1660 static __inline vm_paddr_t 1661 pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) 1662 { 1663 vm_paddr_t pa; 1664 pt2_entry_t *pte2p; 1665 1666 /* Check page attributes. */ 1667 if (m->md.pat_mode != pt_memattr) 1668 pmap_page_set_memattr(m, pt_memattr); 1669 1670 /* Zero page and init wire counts. */ 1671 pa = pmap_pt2pg_zero(m); 1672 pt2_wirecount_init(m); 1673 1674 /* 1675 * Map page to PT2MAP address space for given pmap. 1676 * Note that PT2MAP space is shared with all pmaps. 1677 */ 1678 if (pmap == kernel_pmap) 1679 pmap_kenter_pt2tab(va, PTE2_KPT(pa)); 1680 else { 1681 pte2p = pmap_pt2tab_entry(pmap, va); 1682 pt2tab_store(pte2p, PTE2_KPT_NG(pa)); 1683 } 1684 1685 return (pa); 1686 } 1687 1688 /* 1689 * Initialize the pmap module. 1690 * Called by vm_init, to initialize any structures that the pmap 1691 * system needs to map virtual memory. 1692 */ 1693 void 1694 pmap_init(void) 1695 { 1696 vm_size_t s; 1697 pt2_entry_t *pte2p, pte2; 1698 u_int i, pte1_idx, pv_npg; 1699 1700 PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); 1701 1702 /* 1703 * Initialize the vm page array entries for kernel pmap's 1704 * L2 page table pages allocated in advance. 
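	 * The walk starts at KERNBASE - PT2MAP_SIZE (the PT2MAP region just
	 * below KERNBASE), so it covers the PT2TAB page(s) as well as the
	 * NKPT2PG pages preallocated for KVA.  Each page holds NPT2_IN_PG
	 * PT2s, hence 'pindex' advances by NPT2_IN_PG per page.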
1705 */ 1706 pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); 1707 pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); 1708 for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { 1709 vm_paddr_t pa; 1710 vm_page_t m; 1711 1712 pte2 = pte2_load(pte2p); 1713 KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); 1714 1715 pa = pte2_pa(pte2); 1716 m = PHYS_TO_VM_PAGE(pa); 1717 KASSERT(m >= vm_page_array && 1718 m < &vm_page_array[vm_page_array_size], 1719 ("%s: L2 page table page is out of range", __func__)); 1720 1721 m->pindex = pte1_idx; 1722 m->phys_addr = pa; 1723 pte1_idx += NPT2_IN_PG; 1724 } 1725 1726 /* 1727 * Initialize the address space (zone) for the pv entries. Set a 1728 * high water mark so that the system can recover from excessive 1729 * numbers of pv entries. 1730 */ 1731 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1732 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 1733 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1734 pv_entry_max = roundup(pv_entry_max, _NPCPV); 1735 pv_entry_high_water = 9 * (pv_entry_max / 10); 1736 1737 /* 1738 * Are large page mappings enabled? 1739 */ 1740 TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); 1741 if (sp_enabled) { 1742 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1743 ("%s: can't assign to pagesizes[1]", __func__)); 1744 pagesizes[1] = PTE1_SIZE; 1745 } 1746 1747 /* 1748 * Calculate the size of the pv head table for sections. 1749 * Handle the possibility that "vm_phys_segs[...].end" is zero. 1750 * Note that the table is only for sections which could be promoted. 1751 */ 1752 first_managed_pa = pte1_trunc(vm_phys_segs[0].start); 1753 pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) 1754 - first_managed_pa) / PTE1_SIZE + 1; 1755 1756 /* 1757 * Allocate memory for the pv head table for sections. 1758 */ 1759 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1760 s = round_page(s); 1761 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, 1762 M_WAITOK | M_ZERO); 1763 for (i = 0; i < pv_npg; i++) 1764 TAILQ_INIT(&pv_table[i].pv_list); 1765 1766 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 1767 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 1768 if (pv_chunkbase == NULL) 1769 panic("%s: not enough kvm for pv chunks", __func__); 1770 pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 1771 } 1772 1773 /* 1774 * Add a list of wired pages to the kva 1775 * this routine is only used for temporary 1776 * kernel mappings that do not need to have 1777 * page modification or references recorded. 1778 * Note that old mappings are simply written 1779 * over. The page *must* be wired. 1780 * Note: SMP coherent. Uses a ranged shootdown IPI. 1781 */ 1782 void 1783 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1784 { 1785 u_int anychanged; 1786 pt2_entry_t *epte2p, *pte2p, pte2; 1787 vm_page_t m; 1788 vm_paddr_t pa; 1789 1790 anychanged = 0; 1791 pte2p = pt2map_entry(sva); 1792 epte2p = pte2p + count; 1793 while (pte2p < epte2p) { 1794 m = *ma++; 1795 pa = VM_PAGE_TO_PHYS(m); 1796 pte2 = pte2_load(pte2p); 1797 if ((pte2_pa(pte2) != pa) || 1798 (pte2_attr(pte2) != vm_page_pte2_attr(m))) { 1799 anychanged++; 1800 pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, 1801 vm_page_pte2_attr(m))); 1802 } 1803 pte2p++; 1804 } 1805 if (__predict_false(anychanged)) 1806 tlb_flush_range(sva, count * PAGE_SIZE); 1807 } 1808 1809 /* 1810 * This routine tears out page mappings from the 1811 * kernel -- it is meant only for temporary mappings. 
1812 * Note: SMP coherent. Uses a ranged shootdown IPI. 1813 */ 1814 void 1815 pmap_qremove(vm_offset_t sva, int count) 1816 { 1817 vm_offset_t va; 1818 1819 va = sva; 1820 while (count-- > 0) { 1821 pmap_kremove(va); 1822 va += PAGE_SIZE; 1823 } 1824 tlb_flush_range(sva, va - sva); 1825 } 1826 1827 /* 1828 * Are we current address space or kernel? 1829 */ 1830 static __inline int 1831 pmap_is_current(pmap_t pmap) 1832 { 1833 1834 return (pmap == kernel_pmap || 1835 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); 1836 } 1837 1838 /* 1839 * If the given pmap is not the current or kernel pmap, the returned 1840 * pte2 must be released by passing it to pmap_pte2_release(). 1841 */ 1842 static pt2_entry_t * 1843 pmap_pte2(pmap_t pmap, vm_offset_t va) 1844 { 1845 pt1_entry_t pte1; 1846 vm_paddr_t pt2pg_pa; 1847 1848 pte1 = pte1_load(pmap_pte1(pmap, va)); 1849 if (pte1_is_section(pte1)) 1850 panic("%s: attempt to map PTE1", __func__); 1851 if (pte1_is_link(pte1)) { 1852 /* Are we current address space or kernel? */ 1853 if (pmap_is_current(pmap)) 1854 return (pt2map_entry(va)); 1855 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1856 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1857 mtx_lock(&PMAP2mutex); 1858 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 1859 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 1860 tlb_flush((vm_offset_t)PADDR2); 1861 } 1862 return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1863 } 1864 return (NULL); 1865 } 1866 1867 /* 1868 * Releases a pte2 that was obtained from pmap_pte2(). 1869 * Be prepared for the pte2p being NULL. 1870 */ 1871 static __inline void 1872 pmap_pte2_release(pt2_entry_t *pte2p) 1873 { 1874 1875 if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { 1876 mtx_unlock(&PMAP2mutex); 1877 } 1878 } 1879 1880 /* 1881 * Super fast pmap_pte2 routine best used when scanning 1882 * the pv lists. This eliminates many coarse-grained 1883 * invltlb calls. Note that many of the pv list 1884 * scans are across different pmaps. It is very wasteful 1885 * to do an entire tlb flush for checking a single mapping. 1886 * 1887 * If the given pmap is not the current pmap, pvh_global_lock 1888 * must be held and curthread pinned to a CPU. 1889 */ 1890 static pt2_entry_t * 1891 pmap_pte2_quick(pmap_t pmap, vm_offset_t va) 1892 { 1893 pt1_entry_t pte1; 1894 vm_paddr_t pt2pg_pa; 1895 1896 pte1 = pte1_load(pmap_pte1(pmap, va)); 1897 if (pte1_is_section(pte1)) 1898 panic("%s: attempt to map PTE1", __func__); 1899 if (pte1_is_link(pte1)) { 1900 /* Are we current address space or kernel? */ 1901 if (pmap_is_current(pmap)) 1902 return (pt2map_entry(va)); 1903 rw_assert(&pvh_global_lock, RA_WLOCKED); 1904 KASSERT(curthread->td_pinned > 0, 1905 ("%s: curthread not pinned", __func__)); 1906 /* Note that L2 page table size is not equal to PAGE_SIZE. 
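* (A PT2 page holds NPT2_IN_PG L2 page tables, hence trunc_page() below to
* get the page which holds the table linked from pte1.)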
*/ 1907 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1908 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 1909 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 1910 #ifdef SMP 1911 PMAP1cpu = PCPU_GET(cpuid); 1912 #endif 1913 tlb_flush_local((vm_offset_t)PADDR1); 1914 PMAP1changed++; 1915 } else 1916 #ifdef SMP 1917 if (PMAP1cpu != PCPU_GET(cpuid)) { 1918 PMAP1cpu = PCPU_GET(cpuid); 1919 tlb_flush_local((vm_offset_t)PADDR1); 1920 PMAP1changedcpu++; 1921 } else 1922 #endif 1923 PMAP1unchanged++; 1924 return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1925 } 1926 return (NULL); 1927 } 1928 1929 /* 1930 * Routine: pmap_extract 1931 * Function: 1932 * Extract the physical page address associated 1933 * with the given map/virtual_address pair. 1934 */ 1935 vm_paddr_t 1936 pmap_extract(pmap_t pmap, vm_offset_t va) 1937 { 1938 vm_paddr_t pa; 1939 pt1_entry_t pte1; 1940 pt2_entry_t *pte2p; 1941 1942 PMAP_LOCK(pmap); 1943 pte1 = pte1_load(pmap_pte1(pmap, va)); 1944 if (pte1_is_section(pte1)) 1945 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1946 else if (pte1_is_link(pte1)) { 1947 pte2p = pmap_pte2(pmap, va); 1948 pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); 1949 pmap_pte2_release(pte2p); 1950 } else 1951 pa = 0; 1952 PMAP_UNLOCK(pmap); 1953 return (pa); 1954 } 1955 1956 /* 1957 * Routine: pmap_extract_and_hold 1958 * Function: 1959 * Atomically extract and hold the physical page 1960 * with the given pmap and virtual address pair 1961 * if that mapping permits the given protection. 1962 */ 1963 vm_page_t 1964 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1965 { 1966 vm_paddr_t pa, lockpa; 1967 pt1_entry_t pte1; 1968 pt2_entry_t pte2, *pte2p; 1969 vm_page_t m; 1970 1971 lockpa = 0; 1972 m = NULL; 1973 PMAP_LOCK(pmap); 1974 retry: 1975 pte1 = pte1_load(pmap_pte1(pmap, va)); 1976 if (pte1_is_section(pte1)) { 1977 if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { 1978 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1979 if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) 1980 goto retry; 1981 m = PHYS_TO_VM_PAGE(pa); 1982 vm_page_hold(m); 1983 } 1984 } else if (pte1_is_link(pte1)) { 1985 pte2p = pmap_pte2(pmap, va); 1986 pte2 = pte2_load(pte2p); 1987 pmap_pte2_release(pte2p); 1988 if (pte2_is_valid(pte2) && 1989 (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { 1990 pa = pte2_pa(pte2); 1991 if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) 1992 goto retry; 1993 m = PHYS_TO_VM_PAGE(pa); 1994 vm_page_hold(m); 1995 } 1996 } 1997 PA_UNLOCK_COND(lockpa); 1998 PMAP_UNLOCK(pmap); 1999 return (m); 2000 } 2001 2002 /* 2003 * Grow the number of kernel L2 page table entries, if needed. 2004 */ 2005 void 2006 pmap_growkernel(vm_offset_t addr) 2007 { 2008 vm_page_t m; 2009 vm_paddr_t pt2pg_pa, pt2_pa; 2010 pt1_entry_t pte1; 2011 pt2_entry_t pte2; 2012 2013 PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); 2014 /* 2015 * All the time kernel_vm_end is first KVA for which underlying 2016 * L2 page table is either not allocated or linked from L1 page table 2017 * (not considering sections). Except for two possible cases: 2018 * 2019 * (1) in the very beginning as long as pmap_growkernel() was 2020 * not called, it could be first unused KVA (which is not 2021 * rounded up to PTE1_SIZE), 2022 * 2023 * (2) when all KVA space is mapped and kernel_map->max_offset 2024 * address is not rounded up to PTE1_SIZE. (For example, 2025 * it could be 0xFFFFFFFF.) 
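* In any case, the kernel map is grown below in whole PTE1_SIZE steps,
* linking one new L2 page table for each step.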
2026 */ 2027 kernel_vm_end = pte1_roundup(kernel_vm_end); 2028 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2029 addr = roundup2(addr, PTE1_SIZE); 2030 if (addr - 1 >= kernel_map->max_offset) 2031 addr = kernel_map->max_offset; 2032 while (kernel_vm_end < addr) { 2033 pte1 = pte1_load(kern_pte1(kernel_vm_end)); 2034 if (pte1_is_valid(pte1)) { 2035 kernel_vm_end += PTE1_SIZE; 2036 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2037 kernel_vm_end = kernel_map->max_offset; 2038 break; 2039 } 2040 continue; 2041 } 2042 2043 /* 2044 * kernel_vm_end_new is used in pmap_pinit() when kernel 2045 * mappings are entered to new pmap all at once to avoid race 2046 * between pmap_kenter_pte1() and kernel_vm_end increase. 2047 * The same aplies to pmap_kenter_pt2tab(). 2048 */ 2049 kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; 2050 2051 pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); 2052 if (!pte2_is_valid(pte2)) { 2053 /* 2054 * Install new PT2s page into kernel PT2TAB. 2055 */ 2056 m = vm_page_alloc(NULL, 2057 pte1_index(kernel_vm_end) & ~PT2PG_MASK, 2058 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2059 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2060 if (m == NULL) 2061 panic("%s: no memory to grow kernel", __func__); 2062 /* 2063 * QQQ: To link all new L2 page tables from L1 page 2064 * table now and so pmap_kenter_pte1() them 2065 * at once together with pmap_kenter_pt2tab() 2066 * could be nice speed up. However, 2067 * pmap_growkernel() does not happen so often... 2068 * QQQ: The other TTBR is another option. 2069 */ 2070 pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, 2071 m); 2072 } else 2073 pt2pg_pa = pte2_pa(pte2); 2074 2075 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); 2076 pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); 2077 2078 kernel_vm_end = kernel_vm_end_new; 2079 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2080 kernel_vm_end = kernel_map->max_offset; 2081 break; 2082 } 2083 } 2084 } 2085 2086 static int 2087 kvm_size(SYSCTL_HANDLER_ARGS) 2088 { 2089 unsigned long ksize = vm_max_kernel_address - KERNBASE; 2090 2091 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2092 } 2093 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2094 0, 0, kvm_size, "IU", "Size of KVM"); 2095 2096 static int 2097 kvm_free(SYSCTL_HANDLER_ARGS) 2098 { 2099 unsigned long kfree = vm_max_kernel_address - kernel_vm_end; 2100 2101 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2102 } 2103 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2104 0, 0, kvm_free, "IU", "Amount of KVM free"); 2105 2106 /*********************************************** 2107 * 2108 * Pmap allocation/deallocation routines. 2109 * 2110 ***********************************************/ 2111 2112 /* 2113 * Initialize the pmap for the swapper process. 2114 */ 2115 void 2116 pmap_pinit0(pmap_t pmap) 2117 { 2118 PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); 2119 2120 PMAP_LOCK_INIT(pmap); 2121 2122 /* 2123 * Kernel page table directory and pmap stuff around is already 2124 * initialized, we are using it right now and here. So, finish 2125 * only PMAP structures initialization for process0 ... 2126 * 2127 * Since the L1 page table and PT2TAB is shared with the kernel pmap, 2128 * which is already included in the list "allpmaps", this pmap does 2129 * not need to be inserted into that list. 
2130 */ 2131 pmap->pm_pt1 = kern_pt1; 2132 pmap->pm_pt2tab = kern_pt2tab; 2133 CPU_ZERO(&pmap->pm_active); 2134 PCPU_SET(curpmap, pmap); 2135 TAILQ_INIT(&pmap->pm_pvchunk); 2136 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2137 CPU_SET(0, &pmap->pm_active); 2138 } 2139 2140 static __inline void 2141 pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, 2142 vm_offset_t eva) 2143 { 2144 u_int idx, count; 2145 2146 idx = pte1_index(sva); 2147 count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); 2148 bcopy(spte1p + idx, dpte1p + idx, count); 2149 } 2150 2151 static __inline void 2152 pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, 2153 vm_offset_t eva) 2154 { 2155 u_int idx, count; 2156 2157 idx = pt2tab_index(sva); 2158 count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); 2159 bcopy(spte2p + idx, dpte2p + idx, count); 2160 } 2161 2162 /* 2163 * Initialize a preallocated and zeroed pmap structure, 2164 * such as one in a vmspace structure. 2165 */ 2166 int 2167 pmap_pinit(pmap_t pmap) 2168 { 2169 pt1_entry_t *pte1p; 2170 pt2_entry_t *pte2p; 2171 vm_paddr_t pa, pt2tab_pa; 2172 u_int i; 2173 2174 PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, 2175 pmap->pm_pt1)); 2176 2177 /* 2178 * No need to allocate L2 page table space yet but we do need 2179 * a valid L1 page table and PT2TAB table. 2180 * 2181 * Install shared kernel mappings to these tables. It's a little 2182 * tricky as some parts of KVA are reserved for vectors, devices, 2183 * and whatever else. These parts are supposed to be above 2184 * vm_max_kernel_address. Thus two regions should be installed: 2185 * 2186 * (1) <KERNBASE, kernel_vm_end), 2187 * (2) <vm_max_kernel_address, 0xFFFFFFFF>. 2188 * 2189 * QQQ: The second region should be stable enough to be installed 2190 * only once in time when the tables are allocated. 2191 * QQQ: Maybe copy of both regions at once could be faster ... 2192 * QQQ: Maybe the other TTBR is an option. 2193 * 2194 * Finally, install own PT2TAB table to these tables. 2195 */ 2196 2197 if (pmap->pm_pt1 == NULL) { 2198 pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(kernel_arena, 2199 NB_IN_PT1, M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, 2200 pt_memattr); 2201 if (pmap->pm_pt1 == NULL) 2202 return (0); 2203 } 2204 if (pmap->pm_pt2tab == NULL) { 2205 /* 2206 * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page 2207 * only, what should be the only size for 32 bit systems, 2208 * then we could allocate it with vm_page_alloc() and all 2209 * the stuff needed as other L2 page table pages. 2210 * (2) Note that a process PT2TAB is special L2 page table 2211 * page. Its mapping in kernel_arena is permanent and can 2212 * be used no matter which process is current. Its mapping 2213 * in PT2MAP can be used only for current process. 2214 */ 2215 pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(kernel_arena, 2216 NB_IN_PT2TAB, M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); 2217 if (pmap->pm_pt2tab == NULL) { 2218 /* 2219 * QQQ: As struct pmap is allocated from UMA with 2220 * UMA_ZONE_NOFREE flag, it's important to leave 2221 * no allocation in pmap if initialization failed. 2222 */ 2223 kmem_free(kernel_arena, (vm_offset_t)pmap->pm_pt1, 2224 NB_IN_PT1); 2225 pmap->pm_pt1 = NULL; 2226 return (0); 2227 } 2228 /* 2229 * QQQ: Each L2 page table page vm_page_t has pindex set to 2230 * pte1 index of virtual address mapped by this page. 2231 * It's not valid for non kernel PT2TABs themselves. 
2232 * The pindex of these pages can not be altered because 2233 * of the way how they are allocated now. However, it 2234 * should not be a problem. 2235 */ 2236 } 2237 2238 mtx_lock_spin(&allpmaps_lock); 2239 /* 2240 * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), 2241 * kernel_vm_end_new is used here instead of kernel_vm_end. 2242 */ 2243 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, 2244 kernel_vm_end_new - 1); 2245 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 2246 0xFFFFFFFF); 2247 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, 2248 kernel_vm_end_new - 1); 2249 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 2250 0xFFFFFFFF); 2251 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 2252 mtx_unlock_spin(&allpmaps_lock); 2253 2254 /* 2255 * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. 2256 * I.e. self reference mapping. The PT2TAB is private, however mapped 2257 * into shared PT2MAP space, so the mapping should be not global. 2258 */ 2259 pt2tab_pa = vtophys(pmap->pm_pt2tab); 2260 pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); 2261 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 2262 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 2263 } 2264 2265 /* Insert PT2MAP PT2s into pmap PT1. */ 2266 pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); 2267 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 2268 pte1_store(pte1p++, PTE1_LINK(pa)); 2269 } 2270 2271 /* 2272 * Now synchronize new mapping which was made above. 2273 */ 2274 pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); 2275 pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); 2276 2277 CPU_ZERO(&pmap->pm_active); 2278 TAILQ_INIT(&pmap->pm_pvchunk); 2279 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2280 2281 return (1); 2282 } 2283 2284 #ifdef INVARIANTS 2285 static boolean_t 2286 pt2tab_user_is_empty(pt2_entry_t *tab) 2287 { 2288 u_int i, end; 2289 2290 end = pt2tab_index(VM_MAXUSER_ADDRESS); 2291 for (i = 0; i < end; i++) 2292 if (tab[i] != 0) return (FALSE); 2293 return (TRUE); 2294 } 2295 #endif 2296 /* 2297 * Release any resources held by the given physical map. 2298 * Called when a pmap initialized by pmap_pinit is being released. 2299 * Should only be called if the map contains no valid mappings. 2300 */ 2301 void 2302 pmap_release(pmap_t pmap) 2303 { 2304 #ifdef INVARIANTS 2305 vm_offset_t start, end; 2306 #endif 2307 KASSERT(pmap->pm_stats.resident_count == 0, 2308 ("%s: pmap resident count %ld != 0", __func__, 2309 pmap->pm_stats.resident_count)); 2310 KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), 2311 ("%s: has allocated user PT2(s)", __func__)); 2312 KASSERT(CPU_EMPTY(&pmap->pm_active), 2313 ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); 2314 2315 mtx_lock_spin(&allpmaps_lock); 2316 LIST_REMOVE(pmap, pm_list); 2317 mtx_unlock_spin(&allpmaps_lock); 2318 2319 #ifdef INVARIANTS 2320 start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); 2321 end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); 2322 bzero((char *)pmap->pm_pt1 + start, end - start); 2323 2324 start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); 2325 end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); 2326 bzero((char *)pmap->pm_pt2tab + start, end - start); 2327 #endif 2328 /* 2329 * We are leaving PT1 and PT2TAB allocated on released pmap, 2330 * so hopefully UMA vmspace_zone will always be inited with 2331 * UMA_ZONE_NOFREE flag. 
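* That way, the pm_pt1 and pm_pt2tab allocations can simply be reused when
* the structure is recycled and pmap_pinit() is called on it again.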
2332 */ 2333 } 2334 2335 /********************************************************* 2336 * 2337 * L2 table pages and their pages management routines. 2338 * 2339 *********************************************************/ 2340 2341 /* 2342 * Virtual interface for L2 page table wire counting. 2343 * 2344 * Each L2 page table in a page has own counter which counts a number of 2345 * valid mappings in a table. Global page counter counts mappings in all 2346 * tables in a page plus a single itself mapping in PT2TAB. 2347 * 2348 * During a promotion we leave the associated L2 page table counter 2349 * untouched, so the table (strictly speaking a page which holds it) 2350 * is never freed if promoted. 2351 * 2352 * If a page m->wire_count == 1 then no valid mappings exist in any L2 page 2353 * table in the page and the page itself is only mapped in PT2TAB. 2354 */ 2355 2356 static __inline void 2357 pt2_wirecount_init(vm_page_t m) 2358 { 2359 u_int i; 2360 2361 /* 2362 * Note: A page m is allocated with VM_ALLOC_WIRED flag and 2363 * m->wire_count should be already set correctly. 2364 * So, there is no need to set it again herein. 2365 */ 2366 for (i = 0; i < NPT2_IN_PG; i++) 2367 m->md.pt2_wirecount[i] = 0; 2368 } 2369 2370 static __inline void 2371 pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) 2372 { 2373 2374 /* 2375 * Note: A just modificated pte2 (i.e. already allocated) 2376 * is acquiring one extra reference which must be 2377 * explicitly cleared. It influences the KASSERTs herein. 2378 * All L2 page tables in a page always belong to the same 2379 * pmap, so we allow only one extra reference for the page. 2380 */ 2381 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), 2382 ("%s: PT2 is overflowing ...", __func__)); 2383 KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), 2384 ("%s: PT2PG is overflowing ...", __func__)); 2385 2386 m->wire_count++; 2387 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; 2388 } 2389 2390 static __inline void 2391 pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) 2392 { 2393 2394 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, 2395 ("%s: PT2 is underflowing ...", __func__)); 2396 KASSERT(m->wire_count > 1, 2397 ("%s: PT2PG is underflowing ...", __func__)); 2398 2399 m->wire_count--; 2400 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; 2401 } 2402 2403 static __inline void 2404 pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) 2405 { 2406 2407 KASSERT(count <= NPTE2_IN_PT2, 2408 ("%s: invalid count %u", __func__, count)); 2409 KASSERT(m->wire_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], 2410 ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->wire_count, 2411 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); 2412 2413 m->wire_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; 2414 m->wire_count += count; 2415 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; 2416 2417 KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), 2418 ("%s: PT2PG is overflowed (%u) ...", __func__, m->wire_count)); 2419 } 2420 2421 static __inline uint32_t 2422 pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) 2423 { 2424 2425 return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); 2426 } 2427 2428 static __inline boolean_t 2429 pt2_is_empty(vm_page_t m, vm_offset_t va) 2430 { 2431 2432 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); 2433 } 2434 2435 static __inline boolean_t 2436 pt2_is_full(vm_page_t m, vm_offset_t va) 2437 { 2438 2439 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 2440 NPTE2_IN_PT2); 2441 } 2442 2443 static 
__inline boolean_t 2444 pt2pg_is_empty(vm_page_t m) 2445 { 2446 2447 return (m->wire_count == 1); 2448 } 2449 2450 /* 2451 * This routine is called if the L2 page table 2452 * is not mapped correctly. 2453 */ 2454 static vm_page_t 2455 _pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2456 { 2457 uint32_t pte1_idx; 2458 pt1_entry_t *pte1p; 2459 pt2_entry_t pte2; 2460 vm_page_t m; 2461 vm_paddr_t pt2pg_pa, pt2_pa; 2462 2463 pte1_idx = pte1_index(va); 2464 pte1p = pmap->pm_pt1 + pte1_idx; 2465 2466 KASSERT(pte1_load(pte1p) == 0, 2467 ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, 2468 pte1_load(pte1p))); 2469 2470 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); 2471 if (!pte2_is_valid(pte2)) { 2472 /* 2473 * Install new PT2s page into pmap PT2TAB. 2474 */ 2475 m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK, 2476 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2477 if (m == NULL) { 2478 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 2479 PMAP_UNLOCK(pmap); 2480 rw_wunlock(&pvh_global_lock); 2481 vm_wait(NULL); 2482 rw_wlock(&pvh_global_lock); 2483 PMAP_LOCK(pmap); 2484 } 2485 2486 /* 2487 * Indicate the need to retry. While waiting, 2488 * the L2 page table page may have been allocated. 2489 */ 2490 return (NULL); 2491 } 2492 pmap->pm_stats.resident_count++; 2493 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 2494 } else { 2495 pt2pg_pa = pte2_pa(pte2); 2496 m = PHYS_TO_VM_PAGE(pt2pg_pa); 2497 } 2498 2499 pt2_wirecount_inc(m, pte1_idx); 2500 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 2501 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 2502 2503 return (m); 2504 } 2505 2506 static vm_page_t 2507 pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2508 { 2509 u_int pte1_idx; 2510 pt1_entry_t *pte1p, pte1; 2511 vm_page_t m; 2512 2513 pte1_idx = pte1_index(va); 2514 retry: 2515 pte1p = pmap->pm_pt1 + pte1_idx; 2516 pte1 = pte1_load(pte1p); 2517 2518 /* 2519 * This supports switching from a 1MB page to a 2520 * normal 4K page. 2521 */ 2522 if (pte1_is_section(pte1)) { 2523 (void)pmap_demote_pte1(pmap, pte1p, va); 2524 /* 2525 * Reload pte1 after demotion. 2526 * 2527 * Note: Demotion can even fail as either PT2 is not find for 2528 * the virtual address or PT2PG can not be allocated. 2529 */ 2530 pte1 = pte1_load(pte1p); 2531 } 2532 2533 /* 2534 * If the L2 page table page is mapped, we just increment the 2535 * hold count, and activate it. 2536 */ 2537 if (pte1_is_link(pte1)) { 2538 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2539 pt2_wirecount_inc(m, pte1_idx); 2540 } else { 2541 /* 2542 * Here if the PT2 isn't mapped, or if it has 2543 * been deallocated. 2544 */ 2545 m = _pmap_allocpte2(pmap, va, flags); 2546 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2547 goto retry; 2548 } 2549 2550 return (m); 2551 } 2552 2553 /* 2554 * Schedule the specified unused L2 page table page to be freed. Specifically, 2555 * add the page to the specified list of pages that will be released to the 2556 * physical memory manager after the TLB has been updated. 2557 */ 2558 static __inline void 2559 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) 2560 { 2561 2562 /* 2563 * Put page on a list so that it is released after 2564 * *ALL* TLB shootdown is done 2565 */ 2566 #ifdef PMAP_DEBUG 2567 pmap_zero_page_check(m); 2568 #endif 2569 m->flags |= PG_ZERO; 2570 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2571 } 2572 2573 /* 2574 * Unwire L2 page tables page. 
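* I.e. unmap all its L2 page tables from the L1 page table, remove its own
* mapping from PT2TAB, and drop the wiring of the page.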
2575 */ 2576 static void 2577 pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) 2578 { 2579 pt1_entry_t *pte1p, opte1 __unused; 2580 pt2_entry_t *pte2p; 2581 uint32_t i; 2582 2583 KASSERT(pt2pg_is_empty(m), 2584 ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); 2585 2586 /* 2587 * Unmap all L2 page tables in the page from L1 page table. 2588 * 2589 * QQQ: Individual L2 page tables (except the last one) can be unmapped 2590 * earlier. However, we are doing that this way. 2591 */ 2592 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 2593 ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); 2594 pte1p = pmap->pm_pt1 + m->pindex; 2595 for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { 2596 KASSERT(m->md.pt2_wirecount[i] == 0, 2597 ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); 2598 opte1 = pte1_load(pte1p); 2599 if (pte1_is_link(opte1)) { 2600 pte1_clear(pte1p); 2601 /* 2602 * Flush intermediate TLB cache. 2603 */ 2604 pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); 2605 } 2606 #ifdef INVARIANTS 2607 else 2608 KASSERT((opte1 == 0) || pte1_is_section(opte1), 2609 ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, 2610 pmap, va, opte1, i)); 2611 #endif 2612 } 2613 2614 /* 2615 * Unmap the page from PT2TAB. 2616 */ 2617 pte2p = pmap_pt2tab_entry(pmap, va); 2618 (void)pt2tab_load_clear(pte2p); 2619 pmap_tlb_flush(pmap, pt2map_pt2pg(va)); 2620 2621 m->wire_count = 0; 2622 pmap->pm_stats.resident_count--; 2623 2624 /* 2625 * This barrier is so that the ordinary store unmapping 2626 * the L2 page table page is globally performed before TLB shoot- 2627 * down is begun. 2628 */ 2629 wmb(); 2630 vm_wire_sub(1); 2631 } 2632 2633 /* 2634 * Decrements a L2 page table page's wire count, which is used to record the 2635 * number of valid page table entries within the page. If the wire count 2636 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2637 * page table page was unmapped and FALSE otherwise. 2638 */ 2639 static __inline boolean_t 2640 pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2641 { 2642 pt2_wirecount_dec(m, pte1_index(va)); 2643 if (pt2pg_is_empty(m)) { 2644 /* 2645 * QQQ: Wire count is zero, so whole page should be zero and 2646 * we can set PG_ZERO flag to it. 2647 * Note that when promotion is enabled, it takes some 2648 * more efforts. See pmap_unwire_pt2_all() below. 2649 */ 2650 pmap_unwire_pt2pg(pmap, va, m); 2651 pmap_add_delayed_free_list(m, free); 2652 return (TRUE); 2653 } else 2654 return (FALSE); 2655 } 2656 2657 /* 2658 * Drop a L2 page table page's wire count at once, which is used to record 2659 * the number of valid L2 page table entries within the page. If the wire 2660 * count drops to zero, then the L2 page table page is unmapped. 2661 */ 2662 static __inline void 2663 pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, 2664 struct spglist *free) 2665 { 2666 u_int pte1_idx = pte1_index(va); 2667 2668 KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), 2669 ("%s: PT2 page's pindex is wrong", __func__)); 2670 KASSERT(m->wire_count > pt2_wirecount_get(m, pte1_idx), 2671 ("%s: bad pt2 wire count %u > %u", __func__, m->wire_count, 2672 pt2_wirecount_get(m, pte1_idx))); 2673 2674 /* 2675 * It's possible that the L2 page table was never used. 2676 * It happened in case that a section was created without promotion. 
2677 */ 2678 if (pt2_is_full(m, va)) { 2679 pt2_wirecount_set(m, pte1_idx, 0); 2680 2681 /* 2682 * QQQ: We clear L2 page table now, so when L2 page table page 2683 * is going to be freed, we can set it PG_ZERO flag ... 2684 * This function is called only on section mappings, so 2685 * hopefully it's not to big overload. 2686 * 2687 * XXX: If pmap is current, existing PT2MAP mapping could be 2688 * used for zeroing. 2689 */ 2690 pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); 2691 } 2692 #ifdef INVARIANTS 2693 else 2694 KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", 2695 __func__, pt2_wirecount_get(m, pte1_idx))); 2696 #endif 2697 if (pt2pg_is_empty(m)) { 2698 pmap_unwire_pt2pg(pmap, va, m); 2699 pmap_add_delayed_free_list(m, free); 2700 } 2701 } 2702 2703 /* 2704 * After removing a L2 page table entry, this routine is used to 2705 * conditionally free the page, and manage the hold/wire counts. 2706 */ 2707 static boolean_t 2708 pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) 2709 { 2710 pt1_entry_t pte1; 2711 vm_page_t mpte; 2712 2713 if (va >= VM_MAXUSER_ADDRESS) 2714 return (FALSE); 2715 pte1 = pte1_load(pmap_pte1(pmap, va)); 2716 mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2717 return (pmap_unwire_pt2(pmap, va, mpte, free)); 2718 } 2719 2720 /************************************* 2721 * 2722 * Page management routines. 2723 * 2724 *************************************/ 2725 2726 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2727 CTASSERT(_NPCM == 11); 2728 CTASSERT(_NPCPV == 336); 2729 2730 static __inline struct pv_chunk * 2731 pv_to_chunk(pv_entry_t pv) 2732 { 2733 2734 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2735 } 2736 2737 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2738 2739 #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2740 #define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2741 2742 static const uint32_t pc_freemask[_NPCM] = { 2743 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2744 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2745 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2746 PC_FREE0_9, PC_FREE10 2747 }; 2748 2749 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2750 "Current number of pv entries"); 2751 2752 #ifdef PV_STATS 2753 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2754 2755 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2756 "Current number of pv entry chunks"); 2757 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2758 "Current number of pv entry chunks allocated"); 2759 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2760 "Current number of pv entry chunks frees"); 2761 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 2762 0, "Number of times tried to get a chunk page but failed."); 2763 2764 static long pv_entry_frees, pv_entry_allocs; 2765 static int pv_entry_spare; 2766 2767 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2768 "Current number of pv entry frees"); 2769 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 2770 0, "Current number of pv entry allocs"); 2771 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2772 "Current number of spare pv entries"); 2773 #endif 2774 2775 /* 2776 * Is given page managed? 
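* I.e. is there a vm_page for it which is not marked VPO_UNMANAGED, so that
* pv entries are maintained for its mappings.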
2777 */ 2778 static __inline bool 2779 is_managed(vm_paddr_t pa) 2780 { 2781 vm_page_t m; 2782 2783 m = PHYS_TO_VM_PAGE(pa); 2784 if (m == NULL) 2785 return (false); 2786 return ((m->oflags & VPO_UNMANAGED) == 0); 2787 } 2788 2789 static __inline bool 2790 pte1_is_managed(pt1_entry_t pte1) 2791 { 2792 2793 return (is_managed(pte1_pa(pte1))); 2794 } 2795 2796 static __inline bool 2797 pte2_is_managed(pt2_entry_t pte2) 2798 { 2799 2800 return (is_managed(pte2_pa(pte2))); 2801 } 2802 2803 /* 2804 * We are in a serious low memory condition. Resort to 2805 * drastic measures to free some pages so we can allocate 2806 * another pv entry chunk. 2807 */ 2808 static vm_page_t 2809 pmap_pv_reclaim(pmap_t locked_pmap) 2810 { 2811 struct pch newtail; 2812 struct pv_chunk *pc; 2813 struct md_page *pvh; 2814 pt1_entry_t *pte1p; 2815 pmap_t pmap; 2816 pt2_entry_t *pte2p, tpte2; 2817 pv_entry_t pv; 2818 vm_offset_t va; 2819 vm_page_t m, m_pc; 2820 struct spglist free; 2821 uint32_t inuse; 2822 int bit, field, freed; 2823 2824 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2825 pmap = NULL; 2826 m_pc = NULL; 2827 SLIST_INIT(&free); 2828 TAILQ_INIT(&newtail); 2829 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2830 SLIST_EMPTY(&free))) { 2831 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2832 if (pmap != pc->pc_pmap) { 2833 if (pmap != NULL) { 2834 if (pmap != locked_pmap) 2835 PMAP_UNLOCK(pmap); 2836 } 2837 pmap = pc->pc_pmap; 2838 /* Avoid deadlock and lock recursion. */ 2839 if (pmap > locked_pmap) 2840 PMAP_LOCK(pmap); 2841 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2842 pmap = NULL; 2843 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2844 continue; 2845 } 2846 } 2847 2848 /* 2849 * Destroy every non-wired, 4 KB page mapping in the chunk. 2850 */ 2851 freed = 0; 2852 for (field = 0; field < _NPCM; field++) { 2853 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2854 inuse != 0; inuse &= ~(1UL << bit)) { 2855 bit = ffs(inuse) - 1; 2856 pv = &pc->pc_pventry[field * 32 + bit]; 2857 va = pv->pv_va; 2858 pte1p = pmap_pte1(pmap, va); 2859 if (pte1_is_section(pte1_load(pte1p))) 2860 continue; 2861 pte2p = pmap_pte2(pmap, va); 2862 tpte2 = pte2_load(pte2p); 2863 if ((tpte2 & PTE2_W) == 0) 2864 tpte2 = pte2_load_clear(pte2p); 2865 pmap_pte2_release(pte2p); 2866 if ((tpte2 & PTE2_W) != 0) 2867 continue; 2868 KASSERT(tpte2 != 0, 2869 ("pmap_pv_reclaim: pmap %p va %#x zero pte", 2870 pmap, va)); 2871 pmap_tlb_flush(pmap, va); 2872 m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); 2873 if (pte2_is_dirty(tpte2)) 2874 vm_page_dirty(m); 2875 if ((tpte2 & PTE2_A) != 0) 2876 vm_page_aflag_set(m, PGA_REFERENCED); 2877 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2878 if (TAILQ_EMPTY(&m->md.pv_list) && 2879 (m->flags & PG_FICTITIOUS) == 0) { 2880 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2881 if (TAILQ_EMPTY(&pvh->pv_list)) { 2882 vm_page_aflag_clear(m, 2883 PGA_WRITEABLE); 2884 } 2885 } 2886 pc->pc_map[field] |= 1UL << bit; 2887 pmap_unuse_pt2(pmap, va, &free); 2888 freed++; 2889 } 2890 } 2891 if (freed == 0) { 2892 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2893 continue; 2894 } 2895 /* Every freed mapping is for a 4 KB page. 
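* so the pmap resident count below is simply reduced by the number of
* freed pv entries.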
*/ 2896 pmap->pm_stats.resident_count -= freed; 2897 PV_STAT(pv_entry_frees += freed); 2898 PV_STAT(pv_entry_spare += freed); 2899 pv_entry_count -= freed; 2900 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2901 for (field = 0; field < _NPCM; field++) 2902 if (pc->pc_map[field] != pc_freemask[field]) { 2903 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2904 pc_list); 2905 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2906 2907 /* 2908 * One freed pv entry in locked_pmap is 2909 * sufficient. 2910 */ 2911 if (pmap == locked_pmap) 2912 goto out; 2913 break; 2914 } 2915 if (field == _NPCM) { 2916 PV_STAT(pv_entry_spare -= _NPCPV); 2917 PV_STAT(pc_chunk_count--); 2918 PV_STAT(pc_chunk_frees++); 2919 /* Entire chunk is free; return it. */ 2920 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2921 pmap_qremove((vm_offset_t)pc, 1); 2922 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2923 break; 2924 } 2925 } 2926 out: 2927 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2928 if (pmap != NULL) { 2929 if (pmap != locked_pmap) 2930 PMAP_UNLOCK(pmap); 2931 } 2932 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2933 m_pc = SLIST_FIRST(&free); 2934 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2935 /* Recycle a freed page table page. */ 2936 m_pc->wire_count = 1; 2937 vm_wire_add(1); 2938 } 2939 vm_page_free_pages_toq(&free, false); 2940 return (m_pc); 2941 } 2942 2943 static void 2944 free_pv_chunk(struct pv_chunk *pc) 2945 { 2946 vm_page_t m; 2947 2948 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2949 PV_STAT(pv_entry_spare -= _NPCPV); 2950 PV_STAT(pc_chunk_count--); 2951 PV_STAT(pc_chunk_frees++); 2952 /* entire chunk is free, return it */ 2953 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2954 pmap_qremove((vm_offset_t)pc, 1); 2955 vm_page_unwire(m, PQ_NONE); 2956 vm_page_free(m); 2957 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2958 } 2959 2960 /* 2961 * Free the pv_entry back to the free list. 2962 */ 2963 static void 2964 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2965 { 2966 struct pv_chunk *pc; 2967 int idx, field, bit; 2968 2969 rw_assert(&pvh_global_lock, RA_WLOCKED); 2970 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2971 PV_STAT(pv_entry_frees++); 2972 PV_STAT(pv_entry_spare++); 2973 pv_entry_count--; 2974 pc = pv_to_chunk(pv); 2975 idx = pv - &pc->pc_pventry[0]; 2976 field = idx / 32; 2977 bit = idx % 32; 2978 pc->pc_map[field] |= 1ul << bit; 2979 for (idx = 0; idx < _NPCM; idx++) 2980 if (pc->pc_map[idx] != pc_freemask[idx]) { 2981 /* 2982 * 98% of the time, pc is already at the head of the 2983 * list. If it isn't already, move it to the head. 2984 */ 2985 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2986 pc)) { 2987 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2988 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2989 pc_list); 2990 } 2991 return; 2992 } 2993 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2994 free_pv_chunk(pc); 2995 } 2996 2997 /* 2998 * Get a new pv_entry, allocating a block from the system 2999 * when needed. 
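* Entries are carved out of page-sized pv chunks: each chunk provides
* _NPCPV (336) entries whose availability is tracked by the _NPCM (11)
* 32-bit words of pc_map, a set bit meaning that pc_pventry[field * 32 + bit]
* is free.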
3000 */ 3001 static pv_entry_t 3002 get_pv_entry(pmap_t pmap, boolean_t try) 3003 { 3004 static const struct timeval printinterval = { 60, 0 }; 3005 static struct timeval lastprint; 3006 int bit, field; 3007 pv_entry_t pv; 3008 struct pv_chunk *pc; 3009 vm_page_t m; 3010 3011 rw_assert(&pvh_global_lock, RA_WLOCKED); 3012 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3013 PV_STAT(pv_entry_allocs++); 3014 pv_entry_count++; 3015 if (pv_entry_count > pv_entry_high_water) 3016 if (ratecheck(&lastprint, &printinterval)) 3017 printf("Approaching the limit on PV entries, consider " 3018 "increasing either the vm.pmap.shpgperproc or the " 3019 "vm.pmap.pv_entry_max tunable.\n"); 3020 retry: 3021 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3022 if (pc != NULL) { 3023 for (field = 0; field < _NPCM; field++) { 3024 if (pc->pc_map[field]) { 3025 bit = ffs(pc->pc_map[field]) - 1; 3026 break; 3027 } 3028 } 3029 if (field < _NPCM) { 3030 pv = &pc->pc_pventry[field * 32 + bit]; 3031 pc->pc_map[field] &= ~(1ul << bit); 3032 /* If this was the last item, move it to tail */ 3033 for (field = 0; field < _NPCM; field++) 3034 if (pc->pc_map[field] != 0) { 3035 PV_STAT(pv_entry_spare--); 3036 return (pv); /* not full, return */ 3037 } 3038 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3039 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3040 PV_STAT(pv_entry_spare--); 3041 return (pv); 3042 } 3043 } 3044 /* 3045 * Access to the pte2list "pv_vafree" is synchronized by the pvh 3046 * global lock. If "pv_vafree" is currently non-empty, it will 3047 * remain non-empty until pmap_pte2list_alloc() completes. 3048 */ 3049 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 3050 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3051 if (try) { 3052 pv_entry_count--; 3053 PV_STAT(pc_chunk_tryfail++); 3054 return (NULL); 3055 } 3056 m = pmap_pv_reclaim(pmap); 3057 if (m == NULL) 3058 goto retry; 3059 } 3060 PV_STAT(pc_chunk_count++); 3061 PV_STAT(pc_chunk_allocs++); 3062 pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); 3063 pmap_qenter((vm_offset_t)pc, &m, 1); 3064 pc->pc_pmap = pmap; 3065 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 3066 for (field = 1; field < _NPCM; field++) 3067 pc->pc_map[field] = pc_freemask[field]; 3068 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3069 pv = &pc->pc_pventry[0]; 3070 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3071 PV_STAT(pv_entry_spare += _NPCPV - 1); 3072 return (pv); 3073 } 3074 3075 /* 3076 * Create a pv entry for page at pa for 3077 * (pmap, va). 
3078 */ 3079 static void 3080 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3081 { 3082 pv_entry_t pv; 3083 3084 rw_assert(&pvh_global_lock, RA_WLOCKED); 3085 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3086 pv = get_pv_entry(pmap, FALSE); 3087 pv->pv_va = va; 3088 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3089 } 3090 3091 static __inline pv_entry_t 3092 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3093 { 3094 pv_entry_t pv; 3095 3096 rw_assert(&pvh_global_lock, RA_WLOCKED); 3097 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3098 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3099 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3100 break; 3101 } 3102 } 3103 return (pv); 3104 } 3105 3106 static void 3107 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3108 { 3109 pv_entry_t pv; 3110 3111 pv = pmap_pvh_remove(pvh, pmap, va); 3112 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3113 free_pv_entry(pmap, pv); 3114 } 3115 3116 static void 3117 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 3118 { 3119 struct md_page *pvh; 3120 3121 rw_assert(&pvh_global_lock, RA_WLOCKED); 3122 pmap_pvh_free(&m->md, pmap, va); 3123 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 3124 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3125 if (TAILQ_EMPTY(&pvh->pv_list)) 3126 vm_page_aflag_clear(m, PGA_WRITEABLE); 3127 } 3128 } 3129 3130 static void 3131 pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3132 { 3133 struct md_page *pvh; 3134 pv_entry_t pv; 3135 vm_offset_t va_last; 3136 vm_page_t m; 3137 3138 rw_assert(&pvh_global_lock, RA_WLOCKED); 3139 KASSERT((pa & PTE1_OFFSET) == 0, 3140 ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); 3141 3142 /* 3143 * Transfer the 1mpage's pv entry for this mapping to the first 3144 * page's pv list. 3145 */ 3146 pvh = pa_to_pvh(pa); 3147 va = pte1_trunc(va); 3148 pv = pmap_pvh_remove(pvh, pmap, va); 3149 KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); 3150 m = PHYS_TO_VM_PAGE(pa); 3151 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3152 /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ 3153 va_last = va + PTE1_SIZE - PAGE_SIZE; 3154 do { 3155 m++; 3156 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3157 ("pmap_pv_demote_pte1: page %p is not managed", m)); 3158 va += PAGE_SIZE; 3159 pmap_insert_entry(pmap, va, m); 3160 } while (va < va_last); 3161 } 3162 3163 #if VM_NRESERVLEVEL > 0 3164 static void 3165 pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3166 { 3167 struct md_page *pvh; 3168 pv_entry_t pv; 3169 vm_offset_t va_last; 3170 vm_page_t m; 3171 3172 rw_assert(&pvh_global_lock, RA_WLOCKED); 3173 KASSERT((pa & PTE1_OFFSET) == 0, 3174 ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); 3175 3176 /* 3177 * Transfer the first page's pv entry for this mapping to the 3178 * 1mpage's pv list. Aside from avoiding the cost of a call 3179 * to get_pv_entry(), a transfer avoids the possibility that 3180 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() 3181 * removes one of the mappings that is being promoted. 3182 */ 3183 m = PHYS_TO_VM_PAGE(pa); 3184 va = pte1_trunc(va); 3185 pv = pmap_pvh_remove(&m->md, pmap, va); 3186 KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); 3187 pvh = pa_to_pvh(pa); 3188 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3189 /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. 
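* (one for each small page within the promoted 1 MB section except the first
* one, whose entry was transferred to the 1mpage's list above).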
*/ 3190 va_last = va + PTE1_SIZE - PAGE_SIZE; 3191 do { 3192 m++; 3193 va += PAGE_SIZE; 3194 pmap_pvh_free(&m->md, pmap, va); 3195 } while (va < va_last); 3196 } 3197 #endif 3198 3199 /* 3200 * Conditionally create a pv entry. 3201 */ 3202 static boolean_t 3203 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3204 { 3205 pv_entry_t pv; 3206 3207 rw_assert(&pvh_global_lock, RA_WLOCKED); 3208 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3209 if (pv_entry_count < pv_entry_high_water && 3210 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 3211 pv->pv_va = va; 3212 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3213 return (TRUE); 3214 } else 3215 return (FALSE); 3216 } 3217 3218 /* 3219 * Create the pv entries for each of the pages within a section. 3220 */ 3221 static boolean_t 3222 pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3223 { 3224 struct md_page *pvh; 3225 pv_entry_t pv; 3226 3227 rw_assert(&pvh_global_lock, RA_WLOCKED); 3228 if (pv_entry_count < pv_entry_high_water && 3229 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 3230 pv->pv_va = va; 3231 pvh = pa_to_pvh(pa); 3232 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3233 return (TRUE); 3234 } else 3235 return (FALSE); 3236 } 3237 3238 static inline void 3239 pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) 3240 { 3241 3242 /* Kill all the small mappings or the big one only. */ 3243 if (pte1_is_section(npte1)) 3244 pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); 3245 else 3246 pmap_tlb_flush(pmap, pte1_trunc(va)); 3247 } 3248 3249 /* 3250 * Update kernel pte1 on all pmaps. 3251 * 3252 * The following function is called only on one cpu with disabled interrupts. 3253 * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way 3254 * nobody can invoke explicit hardware table walk during the update of pte1. 3255 * Unsolicited hardware table walk can still happen, invoked by speculative 3256 * data or instruction prefetch or even by speculative hardware table walk. 3257 * 3258 * The break-before-make approach should be implemented here. However, it's 3259 * not so easy to do that for kernel mappings as it would be unhappy to unmap 3260 * itself unexpectedly but voluntarily. 3261 */ 3262 static void 3263 pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) 3264 { 3265 pmap_t pmap; 3266 pt1_entry_t *pte1p; 3267 3268 /* 3269 * Get current pmap. Interrupts should be disabled here 3270 * so PCPU_GET() is done atomically. 3271 */ 3272 pmap = PCPU_GET(curpmap); 3273 if (pmap == NULL) 3274 pmap = kernel_pmap; 3275 3276 /* 3277 * (1) Change pte1 on current pmap. 3278 * (2) Flush all obsolete TLB entries on current CPU. 3279 * (3) Change pte1 on all pmaps. 3280 * (4) Flush all obsolete TLB entries on all CPUs in SMP case. 3281 */ 3282 3283 pte1p = pmap_pte1(pmap, va); 3284 pte1_store(pte1p, npte1); 3285 3286 /* Kill all the small mappings or the big one only. */ 3287 if (pte1_is_section(npte1)) { 3288 pmap_pte1_kern_promotions++; 3289 tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); 3290 } else { 3291 pmap_pte1_kern_demotions++; 3292 tlb_flush_local(pte1_trunc(va)); 3293 } 3294 3295 /* 3296 * In SMP case, this function is called when all cpus are at smp 3297 * rendezvous, so there is no need to use 'allpmaps_lock' lock here. 3298 * In UP case, the function is called with this lock locked. 
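* Either way, the allpmaps list walked below cannot change while the kernel
* pte1 is being rewritten in every pmap.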
3299 */ 3300 LIST_FOREACH(pmap, &allpmaps, pm_list) { 3301 pte1p = pmap_pte1(pmap, va); 3302 pte1_store(pte1p, npte1); 3303 } 3304 3305 #ifdef SMP 3306 /* Kill all the small mappings or the big one only. */ 3307 if (pte1_is_section(npte1)) 3308 tlb_flush_range(pte1_trunc(va), PTE1_SIZE); 3309 else 3310 tlb_flush(pte1_trunc(va)); 3311 #endif 3312 } 3313 3314 #ifdef SMP 3315 struct pte1_action { 3316 vm_offset_t va; 3317 pt1_entry_t npte1; 3318 u_int update; /* CPU that updates the PTE1 */ 3319 }; 3320 3321 static void 3322 pmap_update_pte1_action(void *arg) 3323 { 3324 struct pte1_action *act = arg; 3325 3326 if (act->update == PCPU_GET(cpuid)) 3327 pmap_update_pte1_kernel(act->va, act->npte1); 3328 } 3329 3330 /* 3331 * Change pte1 on current pmap. 3332 * Note that kernel pte1 must be changed on all pmaps. 3333 * 3334 * According to the architecture reference manual published by ARM, 3335 * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. 3336 * According to this manual, UNPREDICTABLE behaviours must never happen in 3337 * a viable system. In contrast, on x86 processors, it is not specified which 3338 * TLB entry mapping the virtual address will be used, but the MMU doesn't 3339 * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone 3340 * Black). 3341 * 3342 * It's a problem when either promotion or demotion is being done. The pte1 3343 * update and appropriate TLB flush must be done atomically in general. 3344 */ 3345 static void 3346 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3347 pt1_entry_t npte1) 3348 { 3349 3350 if (pmap == kernel_pmap) { 3351 struct pte1_action act; 3352 3353 sched_pin(); 3354 act.va = va; 3355 act.npte1 = npte1; 3356 act.update = PCPU_GET(cpuid); 3357 smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier, 3358 pmap_update_pte1_action, NULL, &act); 3359 sched_unpin(); 3360 } else { 3361 register_t cspr; 3362 3363 /* 3364 * Use break-before-make approach for changing userland 3365 * mappings. It can cause L1 translation aborts on other 3366 * cores in SMP case. So, special treatment is implemented 3367 * in pmap_fault(). To reduce the likelihood that another core 3368 * will be affected by the broken mapping, disable interrupts 3369 * until the mapping change is completed. 3370 */ 3371 cspr = disable_interrupts(PSR_I | PSR_F); 3372 pte1_clear(pte1p); 3373 pmap_tlb_flush_pte1(pmap, va, npte1); 3374 pte1_store(pte1p, npte1); 3375 restore_interrupts(cspr); 3376 } 3377 } 3378 #else 3379 static void 3380 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3381 pt1_entry_t npte1) 3382 { 3383 3384 if (pmap == kernel_pmap) { 3385 mtx_lock_spin(&allpmaps_lock); 3386 pmap_update_pte1_kernel(va, npte1); 3387 mtx_unlock_spin(&allpmaps_lock); 3388 } else { 3389 register_t cspr; 3390 3391 /* 3392 * Use break-before-make approach for changing userland 3393 * mappings. It's absolutely safe in UP case when interrupts 3394 * are disabled. 3395 */ 3396 cspr = disable_interrupts(PSR_I | PSR_F); 3397 pte1_clear(pte1p); 3398 pmap_tlb_flush_pte1(pmap, va, npte1); 3399 pte1_store(pte1p, npte1); 3400 restore_interrupts(cspr); 3401 } 3402 } 3403 #endif 3404 3405 #if VM_NRESERVLEVEL > 0 3406 /* 3407 * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are 3408 * within a single page table page (PT2) to a single 1MB page mapping. 
3409 * For promotion to occur, two conditions must be met: (1) the 4KB page 3410 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3411 * mappings must have identical characteristics. 3412 * 3413 * Managed (PG_MANAGED) mappings within the kernel address space are not 3414 * promoted. The reason is that kernel PTE1s are replicated in each pmap but 3415 * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only 3416 * read the PTE1 from the kernel pmap. 3417 */ 3418 static void 3419 pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3420 { 3421 pt1_entry_t npte1; 3422 pt2_entry_t *fpte2p, fpte2, fpte2_fav; 3423 pt2_entry_t *pte2p, pte2; 3424 vm_offset_t pteva __unused; 3425 vm_page_t m __unused; 3426 3427 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3428 pmap, va, pte1_load(pte1p), pte1p)); 3429 3430 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3431 3432 /* 3433 * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is 3434 * either invalid, unused, or does not map the first 4KB physical page 3435 * within a 1MB page. 3436 */ 3437 fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); 3438 fpte2 = pte2_load(fpte2p); 3439 if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != 3440 (PTE2_A | PTE2_V)) { 3441 pmap_pte1_p_failures++; 3442 CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", 3443 __func__, va, pmap); 3444 return; 3445 } 3446 if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { 3447 pmap_pte1_p_failures++; 3448 CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", 3449 __func__, va, pmap); 3450 return; 3451 } 3452 if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3453 /* 3454 * When page is not modified, PTE2_RO can be set without 3455 * a TLB invalidation. 3456 */ 3457 fpte2 |= PTE2_RO; 3458 pte2_store(fpte2p, fpte2); 3459 } 3460 3461 /* 3462 * Examine each of the other PTE2s in the specified PT2. Abort if this 3463 * PTE2 maps an unexpected 4KB physical page or does not have identical 3464 * characteristics to the first PTE2. 3465 */ 3466 fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); 3467 fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ 3468 for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { 3469 pte2 = pte2_load(pte2p); 3470 if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { 3471 pmap_pte1_p_failures++; 3472 CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", 3473 __func__, va, pmap); 3474 return; 3475 } 3476 if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3477 /* 3478 * When page is not modified, PTE2_RO can be set 3479 * without a TLB invalidation. See note above. 3480 */ 3481 pte2 |= PTE2_RO; 3482 pte2_store(pte2p, pte2); 3483 pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & 3484 PTE2_FRAME); 3485 CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", 3486 __func__, pteva, pmap); 3487 } 3488 if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { 3489 pmap_pte1_p_failures++; 3490 CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", 3491 __func__, va, pmap); 3492 return; 3493 } 3494 3495 fpte2_fav -= PTE2_SIZE; 3496 } 3497 /* 3498 * The page table page in its current state will stay in PT2TAB 3499 * until the PTE1 mapping the section is demoted by pmap_demote_pte1() 3500 * or destroyed by pmap_remove_pte1(). 3501 * 3502 * Note that L2 page table size is not equal to PAGE_SIZE. 
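* hence trunc_page() below, which yields the PT2 page holding the linked
* L2 page table, not the table itself.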
3503 */ 3504 m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); 3505 KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], 3506 ("%s: PT2 page is out of range", __func__)); 3507 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 3508 ("%s: PT2 page's pindex is wrong", __func__)); 3509 3510 /* 3511 * Get pte1 from pte2 format. 3512 */ 3513 npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; 3514 3515 /* 3516 * Promote the pv entries. 3517 */ 3518 if (pte2_is_managed(fpte2)) 3519 pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); 3520 3521 /* 3522 * Promote the mappings. 3523 */ 3524 pmap_change_pte1(pmap, pte1p, va, npte1); 3525 3526 pmap_pte1_promotions++; 3527 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3528 __func__, va, pmap); 3529 3530 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3531 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3532 } 3533 #endif /* VM_NRESERVLEVEL > 0 */ 3534 3535 /* 3536 * Zero L2 page table page. 3537 */ 3538 static __inline void 3539 pmap_clear_pt2(pt2_entry_t *fpte2p) 3540 { 3541 pt2_entry_t *pte2p; 3542 3543 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) 3544 pte2_clear(pte2p); 3545 3546 } 3547 3548 /* 3549 * Removes a 1MB page mapping from the kernel pmap. 3550 */ 3551 static void 3552 pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3553 { 3554 vm_page_t m; 3555 uint32_t pte1_idx; 3556 pt2_entry_t *fpte2p; 3557 vm_paddr_t pt2_pa; 3558 3559 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3560 m = pmap_pt2_page(pmap, va); 3561 if (m == NULL) 3562 /* 3563 * QQQ: Is this function called only on promoted pte1? 3564 * We certainly do section mappings directly 3565 * (without promotion) in kernel !!! 3566 */ 3567 panic("%s: missing pt2 page", __func__); 3568 3569 pte1_idx = pte1_index(va); 3570 3571 /* 3572 * Initialize the L2 page table. 3573 */ 3574 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3575 pmap_clear_pt2(fpte2p); 3576 3577 /* 3578 * Remove the mapping. 3579 */ 3580 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); 3581 pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); 3582 3583 /* 3584 * QQQ: We do not need to invalidate PT2MAP mapping 3585 * as we did not change it. I.e. the L2 page table page 3586 * was and still is mapped the same way. 3587 */ 3588 } 3589 3590 /* 3591 * Do the things to unmap a section in a process 3592 */ 3593 static void 3594 pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 3595 struct spglist *free) 3596 { 3597 pt1_entry_t opte1; 3598 struct md_page *pvh; 3599 vm_offset_t eva, va; 3600 vm_page_t m; 3601 3602 PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, 3603 pte1_load(pte1p), pte1p)); 3604 3605 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3606 KASSERT((sva & PTE1_OFFSET) == 0, 3607 ("%s: sva is not 1mpage aligned", __func__)); 3608 3609 /* 3610 * Clear and invalidate the mapping. It should occupy one and only TLB 3611 * entry. So, pmap_tlb_flush() called with aligned address should be 3612 * sufficient. 
3613 */ 3614 opte1 = pte1_load_clear(pte1p); 3615 pmap_tlb_flush(pmap, sva); 3616 3617 if (pte1_is_wired(opte1)) 3618 pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; 3619 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 3620 if (pte1_is_managed(opte1)) { 3621 pvh = pa_to_pvh(pte1_pa(opte1)); 3622 pmap_pvh_free(pvh, pmap, sva); 3623 eva = sva + PTE1_SIZE; 3624 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 3625 va < eva; va += PAGE_SIZE, m++) { 3626 if (pte1_is_dirty(opte1)) 3627 vm_page_dirty(m); 3628 if (opte1 & PTE1_A) 3629 vm_page_aflag_set(m, PGA_REFERENCED); 3630 if (TAILQ_EMPTY(&m->md.pv_list) && 3631 TAILQ_EMPTY(&pvh->pv_list)) 3632 vm_page_aflag_clear(m, PGA_WRITEABLE); 3633 } 3634 } 3635 if (pmap == kernel_pmap) { 3636 /* 3637 * L2 page table(s) can't be removed from kernel map as 3638 * kernel counts on it (stuff around pmap_growkernel()). 3639 */ 3640 pmap_remove_kernel_pte1(pmap, pte1p, sva); 3641 } else { 3642 /* 3643 * Get associated L2 page table page. 3644 * It's possible that the page was never allocated. 3645 */ 3646 m = pmap_pt2_page(pmap, sva); 3647 if (m != NULL) 3648 pmap_unwire_pt2_all(pmap, sva, m, free); 3649 } 3650 } 3651 3652 /* 3653 * Fills L2 page table page with mappings to consecutive physical pages. 3654 */ 3655 static __inline void 3656 pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) 3657 { 3658 pt2_entry_t *pte2p; 3659 3660 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { 3661 pte2_store(pte2p, npte2); 3662 npte2 += PTE2_SIZE; 3663 } 3664 } 3665 3666 /* 3667 * Tries to demote a 1MB page mapping. If demotion fails, the 3668 * 1MB page mapping is invalidated. 3669 */ 3670 static boolean_t 3671 pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3672 { 3673 pt1_entry_t opte1, npte1; 3674 pt2_entry_t *fpte2p, npte2; 3675 vm_paddr_t pt2pg_pa, pt2_pa; 3676 vm_page_t m; 3677 struct spglist free; 3678 uint32_t pte1_idx, isnew = 0; 3679 3680 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3681 pmap, va, pte1_load(pte1p), pte1p)); 3682 3683 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3684 3685 opte1 = pte1_load(pte1p); 3686 KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); 3687 3688 if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { 3689 KASSERT(!pte1_is_wired(opte1), 3690 ("%s: PT2 page for a wired mapping is missing", __func__)); 3691 3692 /* 3693 * Invalidate the 1MB page mapping and return 3694 * "failure" if the mapping was never accessed or the 3695 * allocation of the new page table page fails. 3696 */ 3697 if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, 3698 pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | 3699 VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { 3700 SLIST_INIT(&free); 3701 pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); 3702 vm_page_free_pages_toq(&free, false); 3703 CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", 3704 __func__, va, pmap); 3705 return (FALSE); 3706 } 3707 if (va < VM_MAXUSER_ADDRESS) 3708 pmap->pm_stats.resident_count++; 3709 3710 isnew = 1; 3711 3712 /* 3713 * We init all L2 page tables in the page even if 3714 * we are going to change everything for one L2 page 3715 * table in a while. 3716 */ 3717 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 3718 } else { 3719 if (va < VM_MAXUSER_ADDRESS) { 3720 if (pt2_is_empty(m, va)) 3721 isnew = 1; /* Demoting section w/o promotion. 
*/ 3722 #ifdef INVARIANTS 3723 else 3724 KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" 3725 " count %u", __func__, 3726 pt2_wirecount_get(m, pte1_index(va)))); 3727 #endif 3728 } 3729 } 3730 3731 pt2pg_pa = VM_PAGE_TO_PHYS(m); 3732 pte1_idx = pte1_index(va); 3733 /* 3734 * If the pmap is current, then the PT2MAP can provide access to 3735 * the page table page (promoted L2 page tables are not unmapped). 3736 * Otherwise, temporarily map the L2 page table page (m) into 3737 * the kernel's address space at either PADDR1 or PADDR2. 3738 * 3739 * Note that L2 page table size is not equal to PAGE_SIZE. 3740 */ 3741 if (pmap_is_current(pmap)) 3742 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3743 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 3744 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 3745 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 3746 #ifdef SMP 3747 PMAP1cpu = PCPU_GET(cpuid); 3748 #endif 3749 tlb_flush_local((vm_offset_t)PADDR1); 3750 PMAP1changed++; 3751 } else 3752 #ifdef SMP 3753 if (PMAP1cpu != PCPU_GET(cpuid)) { 3754 PMAP1cpu = PCPU_GET(cpuid); 3755 tlb_flush_local((vm_offset_t)PADDR1); 3756 PMAP1changedcpu++; 3757 } else 3758 #endif 3759 PMAP1unchanged++; 3760 fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); 3761 } else { 3762 mtx_lock(&PMAP2mutex); 3763 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 3764 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 3765 tlb_flush((vm_offset_t)PADDR2); 3766 } 3767 fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); 3768 } 3769 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 3770 npte1 = PTE1_LINK(pt2_pa); 3771 3772 KASSERT((opte1 & PTE1_A) != 0, 3773 ("%s: opte1 is missing PTE1_A", __func__)); 3774 KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, 3775 ("%s: opte1 has PTE1_NM", __func__)); 3776 3777 /* 3778 * Get pte2 from pte1 format. 3779 */ 3780 npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; 3781 3782 /* 3783 * If the L2 page table page is new, initialize it. If the mapping 3784 * has changed attributes, update the page table entries. 3785 */ 3786 if (isnew != 0) { 3787 pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); 3788 pmap_fill_pt2(fpte2p, npte2); 3789 } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != 3790 (npte2 & PTE2_PROMOTE)) 3791 pmap_fill_pt2(fpte2p, npte2); 3792 3793 KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), 3794 ("%s: fpte2p and npte2 map different physical addresses", 3795 __func__)); 3796 3797 if (fpte2p == PADDR2) 3798 mtx_unlock(&PMAP2mutex); 3799 3800 /* 3801 * Demote the mapping. This pmap is locked. The old PTE1 has 3802 * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also 3803 * has not PTE1_NM set. Thus, there is no danger of a race with 3804 * another processor changing the setting of PTE1_A and/or PTE1_NM 3805 * between the read above and the store below. 3806 */ 3807 pmap_change_pte1(pmap, pte1p, va, npte1); 3808 3809 /* 3810 * Demote the pv entry. This depends on the earlier demotion 3811 * of the mapping. Specifically, the (re)creation of a per- 3812 * page pv entry might trigger the execution of pmap_pv_reclaim(), 3813 * which might reclaim a newly (re)created per-page pv entry 3814 * and destroy the associated mapping. In order to destroy 3815 * the mapping, the PTE1 must have already changed from mapping 3816 * the 1mpage to referencing the page table page. 
3817 */ 3818 if (pte1_is_managed(opte1)) 3819 pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); 3820 3821 pmap_pte1_demotions++; 3822 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3823 __func__, va, pmap); 3824 3825 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3826 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3827 return (TRUE); 3828 } 3829 3830 /* 3831 * Insert the given physical page (p) at 3832 * the specified virtual address (v) in the 3833 * target physical map with the protection requested. 3834 * 3835 * If specified, the page will be wired down, meaning 3836 * that the related pte can not be reclaimed. 3837 * 3838 * NB: This is the only routine which MAY NOT lazy-evaluate 3839 * or lose information. That is, this routine must actually 3840 * insert this page into the given map NOW. 3841 */ 3842 int 3843 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3844 u_int flags, int8_t psind) 3845 { 3846 pt1_entry_t *pte1p; 3847 pt2_entry_t *pte2p; 3848 pt2_entry_t npte2, opte2; 3849 pv_entry_t pv; 3850 vm_paddr_t opa, pa; 3851 vm_page_t mpte2, om; 3852 boolean_t wired; 3853 3854 va = trunc_page(va); 3855 mpte2 = NULL; 3856 wired = (flags & PMAP_ENTER_WIRED) != 0; 3857 3858 KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); 3859 KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, 3860 ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, 3861 va)); 3862 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 3863 VM_OBJECT_ASSERT_LOCKED(m->object); 3864 3865 rw_wlock(&pvh_global_lock); 3866 PMAP_LOCK(pmap); 3867 sched_pin(); 3868 3869 /* 3870 * In the case that a page table page is not 3871 * resident, we are creating it here. 3872 */ 3873 if (va < VM_MAXUSER_ADDRESS) { 3874 mpte2 = pmap_allocpte2(pmap, va, flags); 3875 if (mpte2 == NULL) { 3876 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3877 ("pmap_allocpte2 failed with sleep allowed")); 3878 sched_unpin(); 3879 rw_wunlock(&pvh_global_lock); 3880 PMAP_UNLOCK(pmap); 3881 return (KERN_RESOURCE_SHORTAGE); 3882 } 3883 } 3884 pte1p = pmap_pte1(pmap, va); 3885 if (pte1_is_section(pte1_load(pte1p))) 3886 panic("%s: attempted on 1MB page", __func__); 3887 pte2p = pmap_pte2_quick(pmap, va); 3888 if (pte2p == NULL) 3889 panic("%s: invalid L1 page table entry va=%#x", __func__, va); 3890 3891 om = NULL; 3892 pa = VM_PAGE_TO_PHYS(m); 3893 opte2 = pte2_load(pte2p); 3894 opa = pte2_pa(opte2); 3895 /* 3896 * Mapping has not changed, must be protection or wiring change. 3897 */ 3898 if (pte2_is_valid(opte2) && (opa == pa)) { 3899 /* 3900 * Wiring change, just update stats. We don't worry about 3901 * wiring PT2 pages as they remain resident as long as there 3902 * are valid mappings in them. Hence, if a user page is wired, 3903 * the PT2 page will be also. 3904 */ 3905 if (wired && !pte2_is_wired(opte2)) 3906 pmap->pm_stats.wired_count++; 3907 else if (!wired && pte2_is_wired(opte2)) 3908 pmap->pm_stats.wired_count--; 3909 3910 /* 3911 * Remove extra pte2 reference 3912 */ 3913 if (mpte2) 3914 pt2_wirecount_dec(mpte2, pte1_index(va)); 3915 if (pte2_is_managed(opte2)) 3916 om = m; 3917 goto validate; 3918 } 3919 3920 /* 3921 * QQQ: We think that changing physical address on writeable mapping 3922 * is not safe. Well, maybe on kernel address space with correct 3923 * locking, it can make a sense. However, we have no idea why 3924 * anyone should do that on user address space. Are we wrong? 
3925 */ 3926 KASSERT((opa == 0) || (opa == pa) || 3927 !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), 3928 ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", 3929 __func__, pmap, va, opte2, opa, pa, flags, prot)); 3930 3931 pv = NULL; 3932 3933 /* 3934 * Mapping has changed, invalidate old range and fall through to 3935 * handle validating new mapping. 3936 */ 3937 if (opa) { 3938 if (pte2_is_wired(opte2)) 3939 pmap->pm_stats.wired_count--; 3940 if (pte2_is_managed(opte2)) { 3941 om = PHYS_TO_VM_PAGE(opa); 3942 pv = pmap_pvh_remove(&om->md, pmap, va); 3943 } 3944 /* 3945 * Remove extra pte2 reference 3946 */ 3947 if (mpte2 != NULL) 3948 pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); 3949 } else 3950 pmap->pm_stats.resident_count++; 3951 3952 /* 3953 * Enter on the PV list if part of our managed memory. 3954 */ 3955 if ((m->oflags & VPO_UNMANAGED) == 0) { 3956 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3957 ("%s: managed mapping within the clean submap", __func__)); 3958 if (pv == NULL) 3959 pv = get_pv_entry(pmap, FALSE); 3960 pv->pv_va = va; 3961 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3962 } else if (pv != NULL) 3963 free_pv_entry(pmap, pv); 3964 3965 /* 3966 * Increment counters 3967 */ 3968 if (wired) 3969 pmap->pm_stats.wired_count++; 3970 3971 validate: 3972 /* 3973 * Now validate mapping with desired protection/wiring. 3974 */ 3975 npte2 = PTE2(pa, PTE2_NM, vm_page_pte2_attr(m)); 3976 if (prot & VM_PROT_WRITE) { 3977 if (pte2_is_managed(npte2)) 3978 vm_page_aflag_set(m, PGA_WRITEABLE); 3979 } 3980 else 3981 npte2 |= PTE2_RO; 3982 if ((prot & VM_PROT_EXECUTE) == 0) 3983 npte2 |= PTE2_NX; 3984 if (wired) 3985 npte2 |= PTE2_W; 3986 if (va < VM_MAXUSER_ADDRESS) 3987 npte2 |= PTE2_U; 3988 if (pmap != kernel_pmap) 3989 npte2 |= PTE2_NG; 3990 3991 /* 3992 * If the mapping or permission bits are different, we need 3993 * to update the pte2. 3994 * 3995 * QQQ: Think again and again what to do 3996 * if the mapping is going to be changed! 3997 */ 3998 if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { 3999 /* 4000 * Sync the icache if exec permission and attribute VM_MEMATTR_WB_WA 4001 * are set. Do it now, before the mapping is stored and made 4002 * valid for hardware table walk. If done later, there is a race 4003 * with other threads of the current process in the lazy loading case. 4004 * Don't do it for kernel memory, which is mapped with exec 4005 * permission even if the memory isn't going to hold executable 4006 * code. The only time an icache sync is needed is after a 4007 * kernel module is loaded and the relocation info is processed. 4008 * And it's done in elf_cpu_load_file(). 4009 * 4010 * QQQ: (1) Is there a better way or place 4011 * to sync the icache? 4012 * (2) Now, we do it on a page basis. 4013 */ 4014 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4015 m->md.pat_mode == VM_MEMATTR_WB_WA && 4016 (opa != pa || (opte2 & PTE2_NX))) 4017 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4018 4019 npte2 |= PTE2_A; 4020 if (flags & VM_PROT_WRITE) 4021 npte2 &= ~PTE2_NM; 4022 if (opte2 & PTE2_V) { 4023 /* Change mapping with break-before-make approach.
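The old PTE2 is cleared and its TLB entry flushed before the new PTE2 is stored, so no CPU can observe two different valid translations for this virtual address at the same time.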
*/ 4024 opte2 = pte2_load_clear(pte2p); 4025 pmap_tlb_flush(pmap, va); 4026 pte2_store(pte2p, npte2); 4027 if (opte2 & PTE2_A) { 4028 if (pte2_is_managed(opte2)) 4029 vm_page_aflag_set(om, PGA_REFERENCED); 4030 } 4031 if (pte2_is_dirty(opte2)) { 4032 if (pte2_is_managed(opte2)) 4033 vm_page_dirty(om); 4034 } 4035 if (pte2_is_managed(opte2) && 4036 TAILQ_EMPTY(&om->md.pv_list) && 4037 ((om->flags & PG_FICTITIOUS) != 0 || 4038 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4039 vm_page_aflag_clear(om, PGA_WRITEABLE); 4040 } else 4041 pte2_store(pte2p, npte2); 4042 } 4043 #if 0 4044 else { 4045 /* 4046 * QQQ: As long as both access and not-modified bits are 4047 * emulated by software, this should not happen. Some 4048 * analysis is needed if this really happens. A missing 4049 * TLB flush somewhere could be the reason. 4050 */ 4051 panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, 4052 va, opte2, npte2); 4053 } 4054 #endif 4055 4056 #if VM_NRESERVLEVEL > 0 4057 /* 4058 * If both the L2 page table page and the reservation are fully 4059 * populated, then attempt promotion. 4060 */ 4061 if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && 4062 sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && 4063 vm_reserv_level_iffullpop(m) == 0) 4064 pmap_promote_pte1(pmap, pte1p, va); 4065 #endif 4066 sched_unpin(); 4067 rw_wunlock(&pvh_global_lock); 4068 PMAP_UNLOCK(pmap); 4069 return (KERN_SUCCESS); 4070 } 4071 4072 /* 4073 * Do the things to unmap a page in a process. 4074 */ 4075 static int 4076 pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, 4077 struct spglist *free) 4078 { 4079 pt2_entry_t opte2; 4080 vm_page_t m; 4081 4082 rw_assert(&pvh_global_lock, RA_WLOCKED); 4083 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4084 4085 /* Clear and invalidate the mapping. */ 4086 opte2 = pte2_load_clear(pte2p); 4087 pmap_tlb_flush(pmap, va); 4088 4089 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", 4090 __func__, pmap, va, opte2)); 4091 4092 if (opte2 & PTE2_W) 4093 pmap->pm_stats.wired_count -= 1; 4094 pmap->pm_stats.resident_count -= 1; 4095 if (pte2_is_managed(opte2)) { 4096 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 4097 if (pte2_is_dirty(opte2)) 4098 vm_page_dirty(m); 4099 if (opte2 & PTE2_A) 4100 vm_page_aflag_set(m, PGA_REFERENCED); 4101 pmap_remove_entry(pmap, m, va); 4102 } 4103 return (pmap_unuse_pt2(pmap, va, free)); 4104 } 4105 4106 /* 4107 * Remove a single page from a process address space. 4108 */ 4109 static void 4110 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 4111 { 4112 pt2_entry_t *pte2p; 4113 4114 rw_assert(&pvh_global_lock, RA_WLOCKED); 4115 KASSERT(curthread->td_pinned > 0, 4116 ("%s: curthread not pinned", __func__)); 4117 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4118 if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || 4119 !pte2_is_valid(pte2_load(pte2p))) 4120 return; 4121 pmap_remove_pte2(pmap, pte2p, va, free); 4122 } 4123 4124 /* 4125 * Remove the given range of addresses from the specified map. 4126 * 4127 * It is assumed that the start and end are properly 4128 * rounded to the page size. 4129 */ 4130 void 4131 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4132 { 4133 vm_offset_t nextva; 4134 pt1_entry_t *pte1p, pte1; 4135 pt2_entry_t *pte2p, pte2; 4136 struct spglist free; 4137 4138 /* 4139 * Perform an unsynchronized read. This is, however, safe.
4140 */ 4141 if (pmap->pm_stats.resident_count == 0) 4142 return; 4143 4144 SLIST_INIT(&free); 4145 4146 rw_wlock(&pvh_global_lock); 4147 sched_pin(); 4148 PMAP_LOCK(pmap); 4149 4150 /* 4151 * Special handling of removing one page. A very common 4152 * operation and easy to short circuit some code. 4153 */ 4154 if (sva + PAGE_SIZE == eva) { 4155 pte1 = pte1_load(pmap_pte1(pmap, sva)); 4156 if (pte1_is_link(pte1)) { 4157 pmap_remove_page(pmap, sva, &free); 4158 goto out; 4159 } 4160 } 4161 4162 for (; sva < eva; sva = nextva) { 4163 /* 4164 * Calculate address for next L2 page table. 4165 */ 4166 nextva = pte1_trunc(sva + PTE1_SIZE); 4167 if (nextva < sva) 4168 nextva = eva; 4169 if (pmap->pm_stats.resident_count == 0) 4170 break; 4171 4172 pte1p = pmap_pte1(pmap, sva); 4173 pte1 = pte1_load(pte1p); 4174 4175 /* 4176 * Weed out invalid mappings. Note: we assume that the L1 page 4177 * table is always allocated, and in kernel virtual. 4178 */ 4179 if (pte1 == 0) 4180 continue; 4181 4182 if (pte1_is_section(pte1)) { 4183 /* 4184 * Are we removing the entire large page? If not, 4185 * demote the mapping and fall through. 4186 */ 4187 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4188 pmap_remove_pte1(pmap, pte1p, sva, &free); 4189 continue; 4190 } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4191 /* The large page mapping was destroyed. */ 4192 continue; 4193 } 4194 #ifdef INVARIANTS 4195 else { 4196 /* Update pte1 after demotion. */ 4197 pte1 = pte1_load(pte1p); 4198 } 4199 #endif 4200 } 4201 4202 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4203 " is not link", __func__, pmap, sva, pte1, pte1p)); 4204 4205 /* 4206 * Limit our scan to either the end of the va represented 4207 * by the current L2 page table page, or to the end of the 4208 * range being removed. 4209 */ 4210 if (nextva > eva) 4211 nextva = eva; 4212 4213 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; 4214 pte2p++, sva += PAGE_SIZE) { 4215 pte2 = pte2_load(pte2p); 4216 if (!pte2_is_valid(pte2)) 4217 continue; 4218 if (pmap_remove_pte2(pmap, pte2p, sva, &free)) 4219 break; 4220 } 4221 } 4222 out: 4223 sched_unpin(); 4224 rw_wunlock(&pvh_global_lock); 4225 PMAP_UNLOCK(pmap); 4226 vm_page_free_pages_toq(&free, false); 4227 } 4228 4229 /* 4230 * Routine: pmap_remove_all 4231 * Function: 4232 * Removes this physical page from 4233 * all physical maps in which it resides. 4234 * Reflects back modify bits to the pager. 4235 * 4236 * Notes: 4237 * Original versions of this routine were very 4238 * inefficient because they iteratively called 4239 * pmap_remove (slow...) 
4240 */ 4241 4242 void 4243 pmap_remove_all(vm_page_t m) 4244 { 4245 struct md_page *pvh; 4246 pv_entry_t pv; 4247 pmap_t pmap; 4248 pt2_entry_t *pte2p, opte2; 4249 pt1_entry_t *pte1p; 4250 vm_offset_t va; 4251 struct spglist free; 4252 4253 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4254 ("%s: page %p is not managed", __func__, m)); 4255 SLIST_INIT(&free); 4256 rw_wlock(&pvh_global_lock); 4257 sched_pin(); 4258 if ((m->flags & PG_FICTITIOUS) != 0) 4259 goto small_mappings; 4260 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4261 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4262 va = pv->pv_va; 4263 pmap = PV_PMAP(pv); 4264 PMAP_LOCK(pmap); 4265 pte1p = pmap_pte1(pmap, va); 4266 (void)pmap_demote_pte1(pmap, pte1p, va); 4267 PMAP_UNLOCK(pmap); 4268 } 4269 small_mappings: 4270 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4271 pmap = PV_PMAP(pv); 4272 PMAP_LOCK(pmap); 4273 pmap->pm_stats.resident_count--; 4274 pte1p = pmap_pte1(pmap, pv->pv_va); 4275 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " 4276 "a 1mpage in page %p's pv list", __func__, m)); 4277 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 4278 opte2 = pte2_load_clear(pte2p); 4279 pmap_tlb_flush(pmap, pv->pv_va); 4280 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", 4281 __func__, pmap, pv->pv_va)); 4282 if (pte2_is_wired(opte2)) 4283 pmap->pm_stats.wired_count--; 4284 if (opte2 & PTE2_A) 4285 vm_page_aflag_set(m, PGA_REFERENCED); 4286 4287 /* 4288 * Update the vm_page_t clean and reference bits. 4289 */ 4290 if (pte2_is_dirty(opte2)) 4291 vm_page_dirty(m); 4292 pmap_unuse_pt2(pmap, pv->pv_va, &free); 4293 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4294 free_pv_entry(pmap, pv); 4295 PMAP_UNLOCK(pmap); 4296 } 4297 vm_page_aflag_clear(m, PGA_WRITEABLE); 4298 sched_unpin(); 4299 rw_wunlock(&pvh_global_lock); 4300 vm_page_free_pages_toq(&free, false); 4301 } 4302 4303 /* 4304 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4305 * good coding style, a.k.a. 80 character line width limit hell. 4306 */ 4307 static __inline void 4308 pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, 4309 struct spglist *free) 4310 { 4311 vm_paddr_t pa; 4312 vm_page_t m, mt, mpt2pg; 4313 struct md_page *pvh; 4314 4315 pa = pte1_pa(pte1); 4316 m = PHYS_TO_VM_PAGE(pa); 4317 4318 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4319 __func__, m, m->phys_addr, pa)); 4320 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4321 m < &vm_page_array[vm_page_array_size], 4322 ("%s: bad pte1 %#x", __func__, pte1)); 4323 4324 if (pte1_is_dirty(pte1)) { 4325 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4326 vm_page_dirty(mt); 4327 } 4328 4329 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 4330 pvh = pa_to_pvh(pa); 4331 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4332 if (TAILQ_EMPTY(&pvh->pv_list)) { 4333 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4334 if (TAILQ_EMPTY(&mt->md.pv_list)) 4335 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4336 } 4337 mpt2pg = pmap_pt2_page(pmap, pv->pv_va); 4338 if (mpt2pg != NULL) 4339 pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); 4340 } 4341 4342 /* 4343 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4344 * good coding style, a.k.a. 80 character line width limit hell. 
4345 */ 4346 static __inline void 4347 pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, 4348 struct spglist *free) 4349 { 4350 vm_paddr_t pa; 4351 vm_page_t m; 4352 struct md_page *pvh; 4353 4354 pa = pte2_pa(pte2); 4355 m = PHYS_TO_VM_PAGE(pa); 4356 4357 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4358 __func__, m, m->phys_addr, pa)); 4359 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4360 m < &vm_page_array[vm_page_array_size], 4361 ("%s: bad pte2 %#x", __func__, pte2)); 4362 4363 if (pte2_is_dirty(pte2)) 4364 vm_page_dirty(m); 4365 4366 pmap->pm_stats.resident_count--; 4367 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4368 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 4369 pvh = pa_to_pvh(pa); 4370 if (TAILQ_EMPTY(&pvh->pv_list)) 4371 vm_page_aflag_clear(m, PGA_WRITEABLE); 4372 } 4373 pmap_unuse_pt2(pmap, pv->pv_va, free); 4374 } 4375 4376 /* 4377 * Remove all pages from specified address space this aids process 4378 * exit speeds. Also, this code is special cased for current process 4379 * only, but can have the more generic (and slightly slower) mode enabled. 4380 * This is much faster than pmap_remove in the case of running down 4381 * an entire address space. 4382 */ 4383 void 4384 pmap_remove_pages(pmap_t pmap) 4385 { 4386 pt1_entry_t *pte1p, pte1; 4387 pt2_entry_t *pte2p, pte2; 4388 pv_entry_t pv; 4389 struct pv_chunk *pc, *npc; 4390 struct spglist free; 4391 int field, idx; 4392 int32_t bit; 4393 uint32_t inuse, bitmask; 4394 boolean_t allfree; 4395 4396 /* 4397 * Assert that the given pmap is only active on the current 4398 * CPU. Unfortunately, we cannot block another CPU from 4399 * activating the pmap while this function is executing. 4400 */ 4401 KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), 4402 ("%s: non-current pmap %p", __func__, pmap)); 4403 #if defined(SMP) && defined(INVARIANTS) 4404 { 4405 cpuset_t other_cpus; 4406 4407 sched_pin(); 4408 other_cpus = pmap->pm_active; 4409 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 4410 sched_unpin(); 4411 KASSERT(CPU_EMPTY(&other_cpus), 4412 ("%s: pmap %p active on other cpus", __func__, pmap)); 4413 } 4414 #endif 4415 SLIST_INIT(&free); 4416 rw_wlock(&pvh_global_lock); 4417 PMAP_LOCK(pmap); 4418 sched_pin(); 4419 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4420 KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", 4421 __func__, pmap, pc->pc_pmap)); 4422 allfree = TRUE; 4423 for (field = 0; field < _NPCM; field++) { 4424 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 4425 while (inuse != 0) { 4426 bit = ffs(inuse) - 1; 4427 bitmask = 1UL << bit; 4428 idx = field * 32 + bit; 4429 pv = &pc->pc_pventry[idx]; 4430 inuse &= ~bitmask; 4431 4432 /* 4433 * Note that we cannot remove wired pages 4434 * from a process' mapping at this time 4435 */ 4436 pte1p = pmap_pte1(pmap, pv->pv_va); 4437 pte1 = pte1_load(pte1p); 4438 if (pte1_is_section(pte1)) { 4439 if (pte1_is_wired(pte1)) { 4440 allfree = FALSE; 4441 continue; 4442 } 4443 pte1_clear(pte1p); 4444 pmap_remove_pte1_quick(pmap, pte1, pv, 4445 &free); 4446 } 4447 else if (pte1_is_link(pte1)) { 4448 pte2p = pt2map_entry(pv->pv_va); 4449 pte2 = pte2_load(pte2p); 4450 4451 if (!pte2_is_valid(pte2)) { 4452 printf("%s: pmap %p va %#x " 4453 "pte2 %#x\n", __func__, 4454 pmap, pv->pv_va, pte2); 4455 panic("bad pte2"); 4456 } 4457 4458 if (pte2_is_wired(pte2)) { 4459 allfree = FALSE; 4460 continue; 4461 } 4462 pte2_clear(pte2p); 4463 pmap_remove_pte2_quick(pmap, pte2, pv, 4464 &free); 4465 } else { 
4466 printf("%s: pmap %p va %#x pte1 %#x\n", 4467 __func__, pmap, pv->pv_va, pte1); 4468 panic("bad pte1"); 4469 } 4470 4471 /* Mark free */ 4472 PV_STAT(pv_entry_frees++); 4473 PV_STAT(pv_entry_spare++); 4474 pv_entry_count--; 4475 pc->pc_map[field] |= bitmask; 4476 } 4477 } 4478 if (allfree) { 4479 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4480 free_pv_chunk(pc); 4481 } 4482 } 4483 tlb_flush_all_ng_local(); 4484 sched_unpin(); 4485 rw_wunlock(&pvh_global_lock); 4486 PMAP_UNLOCK(pmap); 4487 vm_page_free_pages_toq(&free, false); 4488 } 4489 4490 /* 4491 * This code makes some *MAJOR* assumptions: 4492 * 1. Current pmap & pmap exists. 4493 * 2. Not wired. 4494 * 3. Read access. 4495 * 4. No L2 page table pages. 4496 * but is *MUCH* faster than pmap_enter... 4497 */ 4498 static vm_page_t 4499 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4500 vm_prot_t prot, vm_page_t mpt2pg) 4501 { 4502 pt2_entry_t *pte2p, pte2; 4503 vm_paddr_t pa; 4504 struct spglist free; 4505 uint32_t l2prot; 4506 4507 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 4508 (m->oflags & VPO_UNMANAGED) != 0, 4509 ("%s: managed mapping within the clean submap", __func__)); 4510 rw_assert(&pvh_global_lock, RA_WLOCKED); 4511 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4512 4513 /* 4514 * In the case that an L2 page table page is not 4515 * resident, we are creating it here. 4516 */ 4517 if (va < VM_MAXUSER_ADDRESS) { 4518 u_int pte1_idx; 4519 pt1_entry_t pte1, *pte1p; 4520 vm_paddr_t pt2_pa; 4521 4522 /* 4523 * Get L1 page table things. 4524 */ 4525 pte1_idx = pte1_index(va); 4526 pte1p = pmap_pte1(pmap, va); 4527 pte1 = pte1_load(pte1p); 4528 4529 if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { 4530 /* 4531 * Each of NPT2_IN_PG L2 page tables on the page can 4532 * come here. Make sure that the associated L1 page table 4533 * link is established. 4534 * 4535 * QQQ: It turns out that we don't establish all links to 4536 * the L2 page tables of a newly allocated L2 page 4537 * table page. 4538 */ 4539 KASSERT(!pte1_is_section(pte1), 4540 ("%s: pte1 %#x is section", __func__, pte1)); 4541 if (!pte1_is_link(pte1)) { 4542 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), 4543 pte1_idx); 4544 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 4545 } 4546 pt2_wirecount_inc(mpt2pg, pte1_idx); 4547 } else { 4548 /* 4549 * If the L2 page table page is mapped, we just 4550 * increment the hold count, and activate it. 4551 */ 4552 if (pte1_is_section(pte1)) { 4553 return (NULL); 4554 } else if (pte1_is_link(pte1)) { 4555 mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 4556 pt2_wirecount_inc(mpt2pg, pte1_idx); 4557 } else { 4558 mpt2pg = _pmap_allocpte2(pmap, va, 4559 PMAP_ENTER_NOSLEEP); 4560 if (mpt2pg == NULL) 4561 return (NULL); 4562 } 4563 } 4564 } else { 4565 mpt2pg = NULL; 4566 } 4567 4568 /* 4569 * This call to pt2map_entry() makes the assumption that we are 4570 * entering the page into the current pmap. In order to support 4571 * quick entry into any pmap, one would likely use pmap_pte2_quick(). 4572 * But that isn't as quick as pt2map_entry(). 4573 */ 4574 pte2p = pt2map_entry(va); 4575 pte2 = pte2_load(pte2p); 4576 if (pte2_is_valid(pte2)) { 4577 if (mpt2pg != NULL) { 4578 /* 4579 * Remove extra pte2 reference 4580 */ 4581 pt2_wirecount_dec(mpt2pg, pte1_index(va)); 4582 mpt2pg = NULL; 4583 } 4584 return (NULL); 4585 } 4586 4587 /* 4588 * Enter on the PV list if part of our managed memory.
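* If a pv entry cannot be allocated without reclaiming another one, give up: drop the reference taken on the L2 page table page above and do not enter the mapping.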
4589 */ 4590 if ((m->oflags & VPO_UNMANAGED) == 0 && 4591 !pmap_try_insert_pv_entry(pmap, va, m)) { 4592 if (mpt2pg != NULL) { 4593 SLIST_INIT(&free); 4594 if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { 4595 pmap_tlb_flush(pmap, va); 4596 vm_page_free_pages_toq(&free, false); 4597 } 4598 4599 mpt2pg = NULL; 4600 } 4601 return (NULL); 4602 } 4603 4604 /* 4605 * Increment counters 4606 */ 4607 pmap->pm_stats.resident_count++; 4608 4609 /* 4610 * Now validate mapping with RO protection 4611 */ 4612 pa = VM_PAGE_TO_PHYS(m); 4613 l2prot = PTE2_RO | PTE2_NM; 4614 if (va < VM_MAXUSER_ADDRESS) 4615 l2prot |= PTE2_U | PTE2_NG; 4616 if ((prot & VM_PROT_EXECUTE) == 0) 4617 l2prot |= PTE2_NX; 4618 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { 4619 /* 4620 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4621 * is set. QQQ: For more info, see comments in pmap_enter(). 4622 */ 4623 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4624 } 4625 pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); 4626 4627 return (mpt2pg); 4628 } 4629 4630 void 4631 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4632 { 4633 4634 rw_wlock(&pvh_global_lock); 4635 PMAP_LOCK(pmap); 4636 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 4637 rw_wunlock(&pvh_global_lock); 4638 PMAP_UNLOCK(pmap); 4639 } 4640 4641 /* 4642 * Tries to create 1MB page mapping. Returns TRUE if successful and 4643 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 4644 * blocking, (2) a mapping already exists at the specified virtual address, or 4645 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 4646 */ 4647 static boolean_t 4648 pmap_enter_pte1(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4649 { 4650 pt1_entry_t *pte1p; 4651 vm_paddr_t pa; 4652 uint32_t l1prot; 4653 4654 rw_assert(&pvh_global_lock, RA_WLOCKED); 4655 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4656 pte1p = pmap_pte1(pmap, va); 4657 if (pte1_is_valid(pte1_load(pte1p))) { 4658 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, 4659 va, pmap); 4660 return (FALSE); 4661 } 4662 if ((m->oflags & VPO_UNMANAGED) == 0) { 4663 /* 4664 * Abort this mapping if its PV entry could not be created. 4665 */ 4666 if (!pmap_pv_insert_pte1(pmap, va, VM_PAGE_TO_PHYS(m))) { 4667 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4668 __func__, va, pmap); 4669 return (FALSE); 4670 } 4671 } 4672 /* 4673 * Increment counters. 4674 */ 4675 pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; 4676 4677 /* 4678 * Map the section. 4679 * 4680 * QQQ: Why VM_PROT_WRITE is not evaluated and the mapping is 4681 * made readonly? 4682 */ 4683 pa = VM_PAGE_TO_PHYS(m); 4684 l1prot = PTE1_RO | PTE1_NM; 4685 if (va < VM_MAXUSER_ADDRESS) 4686 l1prot |= PTE1_U | PTE1_NG; 4687 if ((prot & VM_PROT_EXECUTE) == 0) 4688 l1prot |= PTE1_NX; 4689 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { 4690 /* 4691 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4692 * is set. QQQ: For more info, see comments in pmap_enter(). 4693 */ 4694 cache_icache_sync_fresh(va, pa, PTE1_SIZE); 4695 } 4696 pte1_store(pte1p, PTE1(pa, l1prot, ATTR_TO_L1(vm_page_pte2_attr(m)))); 4697 4698 pmap_pte1_mappings++; 4699 CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, 4700 pmap); 4701 return (TRUE); 4702 } 4703 4704 /* 4705 * Maps a sequence of resident pages belonging to the same object. 4706 * The sequence begins with the given page m_start. 
This page is 4707 * mapped at the given virtual address start. Each subsequent page is 4708 * mapped at a virtual address that is offset from start by the same 4709 * amount as the page is offset from m_start within the object. The 4710 * last page in the sequence is the page with the largest offset from 4711 * m_start that can be mapped at a virtual address less than the given 4712 * virtual address end. Not every virtual page between start and end 4713 * is mapped; only those for which a resident page exists with the 4714 * corresponding offset from m_start are mapped. 4715 */ 4716 void 4717 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4718 vm_page_t m_start, vm_prot_t prot) 4719 { 4720 vm_offset_t va; 4721 vm_page_t m, mpt2pg; 4722 vm_pindex_t diff, psize; 4723 4724 PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", 4725 __func__, pmap, start, end, m_start, prot)); 4726 4727 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4728 psize = atop(end - start); 4729 mpt2pg = NULL; 4730 m = m_start; 4731 rw_wlock(&pvh_global_lock); 4732 PMAP_LOCK(pmap); 4733 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4734 va = start + ptoa(diff); 4735 if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && 4736 m->psind == 1 && sp_enabled && 4737 pmap_enter_pte1(pmap, va, m, prot)) 4738 m = &m[PTE1_SIZE / PAGE_SIZE - 1]; 4739 else 4740 mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, 4741 mpt2pg); 4742 m = TAILQ_NEXT(m, listq); 4743 } 4744 rw_wunlock(&pvh_global_lock); 4745 PMAP_UNLOCK(pmap); 4746 } 4747 4748 /* 4749 * This code maps large physical mmap regions into the 4750 * processor address space. Note that some shortcuts 4751 * are taken, but the code works. 4752 */ 4753 void 4754 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4755 vm_pindex_t pindex, vm_size_t size) 4756 { 4757 pt1_entry_t *pte1p; 4758 vm_paddr_t pa, pte2_pa; 4759 vm_page_t p; 4760 vm_memattr_t pat_mode; 4761 u_int l1attr, l1prot; 4762 4763 VM_OBJECT_ASSERT_WLOCKED(object); 4764 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4765 ("%s: non-device object", __func__)); 4766 if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { 4767 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4768 return; 4769 p = vm_page_lookup(object, pindex); 4770 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4771 ("%s: invalid page %p", __func__, p)); 4772 pat_mode = p->md.pat_mode; 4773 4774 /* 4775 * Abort the mapping if the first page is not physically 4776 * aligned to a 1MB page boundary. 4777 */ 4778 pte2_pa = VM_PAGE_TO_PHYS(p); 4779 if (pte2_pa & PTE1_OFFSET) 4780 return; 4781 4782 /* 4783 * Skip the first page. Abort the mapping if the rest of 4784 * the pages are not physically contiguous or have differing 4785 * memory attributes. 4786 */ 4787 p = TAILQ_NEXT(p, listq); 4788 for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; 4789 pa += PAGE_SIZE) { 4790 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4791 ("%s: invalid page %p", __func__, p)); 4792 if (pa != VM_PAGE_TO_PHYS(p) || 4793 pat_mode != p->md.pat_mode) 4794 return; 4795 p = TAILQ_NEXT(p, listq); 4796 } 4797 4798 /* 4799 * Map using 1MB pages. 4800 * 4801 * QQQ: Well, we are mapping a section, so same condition must 4802 * be hold like during promotion. It looks that only RW mapping 4803 * is done here, so readonly mapping must be done elsewhere. 
4804 */ 4805 l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; 4806 l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); 4807 PMAP_LOCK(pmap); 4808 for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { 4809 pte1p = pmap_pte1(pmap, addr); 4810 if (!pte1_is_valid(pte1_load(pte1p))) { 4811 pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); 4812 pmap->pm_stats.resident_count += PTE1_SIZE / 4813 PAGE_SIZE; 4814 pmap_pte1_mappings++; 4815 } 4816 /* Else continue on if the PTE1 is already valid. */ 4817 addr += PTE1_SIZE; 4818 } 4819 PMAP_UNLOCK(pmap); 4820 } 4821 } 4822 4823 /* 4824 * Do the things to protect a 1mpage in a process. 4825 */ 4826 static void 4827 pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 4828 vm_prot_t prot) 4829 { 4830 pt1_entry_t npte1, opte1; 4831 vm_offset_t eva, va; 4832 vm_page_t m; 4833 4834 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4835 KASSERT((sva & PTE1_OFFSET) == 0, 4836 ("%s: sva is not 1mpage aligned", __func__)); 4837 4838 opte1 = npte1 = pte1_load(pte1p); 4839 if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) { 4840 eva = sva + PTE1_SIZE; 4841 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 4842 va < eva; va += PAGE_SIZE, m++) 4843 vm_page_dirty(m); 4844 } 4845 if ((prot & VM_PROT_WRITE) == 0) 4846 npte1 |= PTE1_RO | PTE1_NM; 4847 if ((prot & VM_PROT_EXECUTE) == 0) 4848 npte1 |= PTE1_NX; 4849 4850 /* 4851 * QQQ: Herein, execute permission is never set. 4852 * It can only be cleared. So, no icache 4853 * syncing is needed. 4854 */ 4855 4856 if (npte1 != opte1) { 4857 pte1_store(pte1p, npte1); 4858 pmap_tlb_flush(pmap, sva); 4859 } 4860 } 4861 4862 /* 4863 * Set the physical protection on the 4864 * specified range of this map as requested. 4865 */ 4866 void 4867 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4868 { 4869 boolean_t pv_lists_locked; 4870 vm_offset_t nextva; 4871 pt1_entry_t *pte1p, pte1; 4872 pt2_entry_t *pte2p, opte2, npte2; 4873 4874 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4875 if (prot == VM_PROT_NONE) { 4876 pmap_remove(pmap, sva, eva); 4877 return; 4878 } 4879 4880 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 4881 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 4882 return; 4883 4884 if (pmap_is_current(pmap)) 4885 pv_lists_locked = FALSE; 4886 else { 4887 pv_lists_locked = TRUE; 4888 resume: 4889 rw_wlock(&pvh_global_lock); 4890 sched_pin(); 4891 } 4892 4893 PMAP_LOCK(pmap); 4894 for (; sva < eva; sva = nextva) { 4895 /* 4896 * Calculate address for next L2 page table. 4897 */ 4898 nextva = pte1_trunc(sva + PTE1_SIZE); 4899 if (nextva < sva) 4900 nextva = eva; 4901 4902 pte1p = pmap_pte1(pmap, sva); 4903 pte1 = pte1_load(pte1p); 4904 4905 /* 4906 * Weed out invalid mappings. Note: we assume that the L1 4907 * page table is always allocated, and in kernel virtual. 4908 */ 4909 if (pte1 == 0) 4910 continue; 4911 4912 if (pte1_is_section(pte1)) { 4913 /* 4914 * Are we protecting the entire large page? If not, 4915 * demote the mapping and fall through. 4916 */ 4917 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4918 pmap_protect_pte1(pmap, pte1p, sva, prot); 4919 continue; 4920 } else { 4921 if (!pv_lists_locked) { 4922 pv_lists_locked = TRUE; 4923 if (!rw_try_wlock(&pvh_global_lock)) { 4924 PMAP_UNLOCK(pmap); 4925 goto resume; 4926 } 4927 sched_pin(); 4928 } 4929 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4930 /* 4931 * The large page mapping 4932 * was destroyed.
4933 */ 4934 continue; 4935 } 4936 #ifdef INVARIANTS 4937 else { 4938 /* Update pte1 after demotion */ 4939 pte1 = pte1_load(pte1p); 4940 } 4941 #endif 4942 } 4943 } 4944 4945 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4946 " is not link", __func__, pmap, sva, pte1, pte1p)); 4947 4948 /* 4949 * Limit our scan to either the end of the va represented 4950 * by the current L2 page table page, or to the end of the 4951 * range being protected. 4952 */ 4953 if (nextva > eva) 4954 nextva = eva; 4955 4956 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 4957 sva += PAGE_SIZE) { 4958 vm_page_t m; 4959 4960 opte2 = npte2 = pte2_load(pte2p); 4961 if (!pte2_is_valid(opte2)) 4962 continue; 4963 4964 if ((prot & VM_PROT_WRITE) == 0) { 4965 if (pte2_is_managed(opte2) && 4966 pte2_is_dirty(opte2)) { 4967 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 4968 vm_page_dirty(m); 4969 } 4970 npte2 |= PTE2_RO | PTE2_NM; 4971 } 4972 4973 if ((prot & VM_PROT_EXECUTE) == 0) 4974 npte2 |= PTE2_NX; 4975 4976 /* 4977 * QQQ: Herein, execute permission is never set. 4978 * It only can be cleared. So, no icache 4979 * syncing is needed. 4980 */ 4981 4982 if (npte2 != opte2) { 4983 pte2_store(pte2p, npte2); 4984 pmap_tlb_flush(pmap, sva); 4985 } 4986 } 4987 } 4988 if (pv_lists_locked) { 4989 sched_unpin(); 4990 rw_wunlock(&pvh_global_lock); 4991 } 4992 PMAP_UNLOCK(pmap); 4993 } 4994 4995 /* 4996 * pmap_pvh_wired_mappings: 4997 * 4998 * Return the updated number "count" of managed mappings that are wired. 4999 */ 5000 static int 5001 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 5002 { 5003 pmap_t pmap; 5004 pt1_entry_t pte1; 5005 pt2_entry_t pte2; 5006 pv_entry_t pv; 5007 5008 rw_assert(&pvh_global_lock, RA_WLOCKED); 5009 sched_pin(); 5010 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5011 pmap = PV_PMAP(pv); 5012 PMAP_LOCK(pmap); 5013 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5014 if (pte1_is_section(pte1)) { 5015 if (pte1_is_wired(pte1)) 5016 count++; 5017 } else { 5018 KASSERT(pte1_is_link(pte1), 5019 ("%s: pte1 %#x is not link", __func__, pte1)); 5020 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5021 if (pte2_is_wired(pte2)) 5022 count++; 5023 } 5024 PMAP_UNLOCK(pmap); 5025 } 5026 sched_unpin(); 5027 return (count); 5028 } 5029 5030 /* 5031 * pmap_page_wired_mappings: 5032 * 5033 * Return the number of managed mappings to the given physical page 5034 * that are wired. 5035 */ 5036 int 5037 pmap_page_wired_mappings(vm_page_t m) 5038 { 5039 int count; 5040 5041 count = 0; 5042 if ((m->oflags & VPO_UNMANAGED) != 0) 5043 return (count); 5044 rw_wlock(&pvh_global_lock); 5045 count = pmap_pvh_wired_mappings(&m->md, count); 5046 if ((m->flags & PG_FICTITIOUS) == 0) { 5047 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 5048 count); 5049 } 5050 rw_wunlock(&pvh_global_lock); 5051 return (count); 5052 } 5053 5054 /* 5055 * Returns TRUE if any of the given mappings were used to modify 5056 * physical memory. Otherwise, returns FALSE. Both page and 1mpage 5057 * mappings are supported. 
5058 */ 5059 static boolean_t 5060 pmap_is_modified_pvh(struct md_page *pvh) 5061 { 5062 pv_entry_t pv; 5063 pt1_entry_t pte1; 5064 pt2_entry_t pte2; 5065 pmap_t pmap; 5066 boolean_t rv; 5067 5068 rw_assert(&pvh_global_lock, RA_WLOCKED); 5069 rv = FALSE; 5070 sched_pin(); 5071 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5072 pmap = PV_PMAP(pv); 5073 PMAP_LOCK(pmap); 5074 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5075 if (pte1_is_section(pte1)) { 5076 rv = pte1_is_dirty(pte1); 5077 } else { 5078 KASSERT(pte1_is_link(pte1), 5079 ("%s: pte1 %#x is not link", __func__, pte1)); 5080 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5081 rv = pte2_is_dirty(pte2); 5082 } 5083 PMAP_UNLOCK(pmap); 5084 if (rv) 5085 break; 5086 } 5087 sched_unpin(); 5088 return (rv); 5089 } 5090 5091 /* 5092 * pmap_is_modified: 5093 * 5094 * Return whether or not the specified physical page was modified 5095 * in any physical maps. 5096 */ 5097 boolean_t 5098 pmap_is_modified(vm_page_t m) 5099 { 5100 boolean_t rv; 5101 5102 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5103 ("%s: page %p is not managed", __func__, m)); 5104 5105 /* 5106 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5107 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 5108 * is clear, no PTE2s can have PG_M set. 5109 */ 5110 VM_OBJECT_ASSERT_WLOCKED(m->object); 5111 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5112 return (FALSE); 5113 rw_wlock(&pvh_global_lock); 5114 rv = pmap_is_modified_pvh(&m->md) || 5115 ((m->flags & PG_FICTITIOUS) == 0 && 5116 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5117 rw_wunlock(&pvh_global_lock); 5118 return (rv); 5119 } 5120 5121 /* 5122 * pmap_is_prefaultable: 5123 * 5124 * Return whether or not the specified virtual address is eligible 5125 * for prefault. 5126 */ 5127 boolean_t 5128 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5129 { 5130 pt1_entry_t pte1; 5131 pt2_entry_t pte2; 5132 boolean_t rv; 5133 5134 rv = FALSE; 5135 PMAP_LOCK(pmap); 5136 pte1 = pte1_load(pmap_pte1(pmap, addr)); 5137 if (pte1_is_link(pte1)) { 5138 pte2 = pte2_load(pt2map_entry(addr)); 5139 rv = !pte2_is_valid(pte2) ; 5140 } 5141 PMAP_UNLOCK(pmap); 5142 return (rv); 5143 } 5144 5145 /* 5146 * Returns TRUE if any of the given mappings were referenced and FALSE 5147 * otherwise. Both page and 1mpage mappings are supported. 5148 */ 5149 static boolean_t 5150 pmap_is_referenced_pvh(struct md_page *pvh) 5151 { 5152 5153 pv_entry_t pv; 5154 pt1_entry_t pte1; 5155 pt2_entry_t pte2; 5156 pmap_t pmap; 5157 boolean_t rv; 5158 5159 rw_assert(&pvh_global_lock, RA_WLOCKED); 5160 rv = FALSE; 5161 sched_pin(); 5162 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5163 pmap = PV_PMAP(pv); 5164 PMAP_LOCK(pmap); 5165 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5166 if (pte1_is_section(pte1)) { 5167 rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); 5168 } else { 5169 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5170 rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); 5171 } 5172 PMAP_UNLOCK(pmap); 5173 if (rv) 5174 break; 5175 } 5176 sched_unpin(); 5177 return (rv); 5178 } 5179 5180 /* 5181 * pmap_is_referenced: 5182 * 5183 * Return whether or not the specified physical page was referenced 5184 * in any physical maps. 
5185 */ 5186 boolean_t 5187 pmap_is_referenced(vm_page_t m) 5188 { 5189 boolean_t rv; 5190 5191 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5192 ("%s: page %p is not managed", __func__, m)); 5193 rw_wlock(&pvh_global_lock); 5194 rv = pmap_is_referenced_pvh(&m->md) || 5195 ((m->flags & PG_FICTITIOUS) == 0 && 5196 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5197 rw_wunlock(&pvh_global_lock); 5198 return (rv); 5199 } 5200 5201 /* 5202 * pmap_ts_referenced: 5203 * 5204 * Return a count of reference bits for a page, clearing those bits. 5205 * It is not necessary for every reference bit to be cleared, but it 5206 * is necessary that 0 only be returned when there are truly no 5207 * reference bits set. 5208 * 5209 * As an optimization, update the page's dirty field if a modified bit is 5210 * found while counting reference bits. This opportunistic update can be 5211 * performed at low cost and can eliminate the need for some future calls 5212 * to pmap_is_modified(). However, since this function stops after 5213 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5214 * dirty pages. Those dirty pages will only be detected by a future call 5215 * to pmap_is_modified(). 5216 */ 5217 int 5218 pmap_ts_referenced(vm_page_t m) 5219 { 5220 struct md_page *pvh; 5221 pv_entry_t pv, pvf; 5222 pmap_t pmap; 5223 pt1_entry_t *pte1p, opte1; 5224 pt2_entry_t *pte2p, opte2; 5225 vm_paddr_t pa; 5226 int rtval = 0; 5227 5228 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5229 ("%s: page %p is not managed", __func__, m)); 5230 pa = VM_PAGE_TO_PHYS(m); 5231 pvh = pa_to_pvh(pa); 5232 rw_wlock(&pvh_global_lock); 5233 sched_pin(); 5234 if ((m->flags & PG_FICTITIOUS) != 0 || 5235 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5236 goto small_mappings; 5237 pv = pvf; 5238 do { 5239 pmap = PV_PMAP(pv); 5240 PMAP_LOCK(pmap); 5241 pte1p = pmap_pte1(pmap, pv->pv_va); 5242 opte1 = pte1_load(pte1p); 5243 if (pte1_is_dirty(opte1)) { 5244 /* 5245 * Although "opte1" is mapping a 1MB page, because 5246 * this function is called at a 4KB page granularity, 5247 * we only update the 4KB page under test. 5248 */ 5249 vm_page_dirty(m); 5250 } 5251 if ((opte1 & PTE1_A) != 0) { 5252 /* 5253 * Since this reference bit is shared by 256 4KB pages, 5254 * it should not be cleared every time it is tested. 5255 * Apply a simple "hash" function on the physical page 5256 * number, the virtual section number, and the pmap 5257 * address to select one 4KB page out of the 256 5258 * on which testing the reference bit will result 5259 * in clearing that bit. This function is designed 5260 * to avoid the selection of the same 4KB page 5261 * for every 1MB page mapping. 5262 * 5263 * On demotion, a mapping that hasn't been referenced 5264 * is simply destroyed. To avoid the possibility of a 5265 * subsequent page fault on a demoted wired mapping, 5266 * always leave its reference bit set. Moreover, 5267 * since the section is wired, the current state of 5268 * its reference bit won't affect page replacement. 5269 */ 5270 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ 5271 (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && 5272 !pte1_is_wired(opte1)) { 5273 pte1_clear_bit(pte1p, PTE1_A); 5274 pmap_tlb_flush(pmap, pv->pv_va); 5275 } 5276 rtval++; 5277 } 5278 PMAP_UNLOCK(pmap); 5279 /* Rotate the PV list if it has more than one entry. 
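This spreads the clearing of PTE1_A over all mappings of the page, so successive calls do not always sample the same mapping first when the scan stops early at PMAP_TS_REFERENCED_MAX.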
*/ 5280 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5281 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5282 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5283 } 5284 if (rtval >= PMAP_TS_REFERENCED_MAX) 5285 goto out; 5286 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5287 small_mappings: 5288 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5289 goto out; 5290 pv = pvf; 5291 do { 5292 pmap = PV_PMAP(pv); 5293 PMAP_LOCK(pmap); 5294 pte1p = pmap_pte1(pmap, pv->pv_va); 5295 KASSERT(pte1_is_link(pte1_load(pte1p)), 5296 ("%s: not found a link in page %p's pv list", __func__, m)); 5297 5298 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5299 opte2 = pte2_load(pte2p); 5300 if (pte2_is_dirty(opte2)) 5301 vm_page_dirty(m); 5302 if ((opte2 & PTE2_A) != 0) { 5303 pte2_clear_bit(pte2p, PTE2_A); 5304 pmap_tlb_flush(pmap, pv->pv_va); 5305 rtval++; 5306 } 5307 PMAP_UNLOCK(pmap); 5308 /* Rotate the PV list if it has more than one entry. */ 5309 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5310 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5311 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5312 } 5313 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5314 PMAP_TS_REFERENCED_MAX); 5315 out: 5316 sched_unpin(); 5317 rw_wunlock(&pvh_global_lock); 5318 return (rtval); 5319 } 5320 5321 /* 5322 * Clear the wired attribute from the mappings for the specified range of 5323 * addresses in the given pmap. Every valid mapping within that range 5324 * must have the wired attribute set. In contrast, invalid mappings 5325 * cannot have the wired attribute set, so they are ignored. 5326 * 5327 * The wired attribute of the page table entry is not a hardware feature, 5328 * so there is no need to invalidate any TLB entries. 5329 */ 5330 void 5331 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5332 { 5333 vm_offset_t nextva; 5334 pt1_entry_t *pte1p, pte1; 5335 pt2_entry_t *pte2p, pte2; 5336 boolean_t pv_lists_locked; 5337 5338 if (pmap_is_current(pmap)) 5339 pv_lists_locked = FALSE; 5340 else { 5341 pv_lists_locked = TRUE; 5342 resume: 5343 rw_wlock(&pvh_global_lock); 5344 sched_pin(); 5345 } 5346 PMAP_LOCK(pmap); 5347 for (; sva < eva; sva = nextva) { 5348 nextva = pte1_trunc(sva + PTE1_SIZE); 5349 if (nextva < sva) 5350 nextva = eva; 5351 5352 pte1p = pmap_pte1(pmap, sva); 5353 pte1 = pte1_load(pte1p); 5354 5355 /* 5356 * Weed out invalid mappings. Note: we assume that the L1 5357 * page table is always allocated, and in kernel virtual. 5358 */ 5359 if (pte1 == 0) 5360 continue; 5361 5362 if (pte1_is_section(pte1)) { 5363 if (!pte1_is_wired(pte1)) 5364 panic("%s: pte1 %#x not wired", __func__, pte1); 5365 5366 /* 5367 * Are we unwiring the entire large page? If not, 5368 * demote the mapping and fall through. 5369 */ 5370 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 5371 pte1_clear_bit(pte1p, PTE1_W); 5372 pmap->pm_stats.wired_count -= PTE1_SIZE / 5373 PAGE_SIZE; 5374 continue; 5375 } else { 5376 if (!pv_lists_locked) { 5377 pv_lists_locked = TRUE; 5378 if (!rw_try_wlock(&pvh_global_lock)) { 5379 PMAP_UNLOCK(pmap); 5380 /* Repeat sva.
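The pv list lock could not be taken without dropping the pmap lock, so restart at resume where the locks are acquired in the blocking order; sva has not been advanced, so this section is processed again.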
*/ 5381 goto resume; 5382 } 5383 sched_pin(); 5384 } 5385 if (!pmap_demote_pte1(pmap, pte1p, sva)) 5386 panic("%s: demotion failed", __func__); 5387 #ifdef INVARIANTS 5388 else { 5389 /* Update pte1 after demotion */ 5390 pte1 = pte1_load(pte1p); 5391 } 5392 #endif 5393 } 5394 } 5395 5396 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 5397 " is not link", __func__, pmap, sva, pte1, pte1p)); 5398 5399 /* 5400 * Limit our scan to either the end of the va represented 5401 * by the current L2 page table page, or to the end of the 5402 * range being protected. 5403 */ 5404 if (nextva > eva) 5405 nextva = eva; 5406 5407 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5408 sva += PAGE_SIZE) { 5409 pte2 = pte2_load(pte2p); 5410 if (!pte2_is_valid(pte2)) 5411 continue; 5412 if (!pte2_is_wired(pte2)) 5413 panic("%s: pte2 %#x is missing PTE2_W", 5414 __func__, pte2); 5415 5416 /* 5417 * PTE2_W must be cleared atomically. Although the pmap 5418 * lock synchronizes access to PTE2_W, another processor 5419 * could be changing PTE2_NM and/or PTE2_A concurrently. 5420 */ 5421 pte2_clear_bit(pte2p, PTE2_W); 5422 pmap->pm_stats.wired_count--; 5423 } 5424 } 5425 if (pv_lists_locked) { 5426 sched_unpin(); 5427 rw_wunlock(&pvh_global_lock); 5428 } 5429 PMAP_UNLOCK(pmap); 5430 } 5431 5432 /* 5433 * Clear the write and modified bits in each of the given page's mappings. 5434 */ 5435 void 5436 pmap_remove_write(vm_page_t m) 5437 { 5438 struct md_page *pvh; 5439 pv_entry_t next_pv, pv; 5440 pmap_t pmap; 5441 pt1_entry_t *pte1p; 5442 pt2_entry_t *pte2p, opte2; 5443 vm_offset_t va; 5444 5445 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5446 ("%s: page %p is not managed", __func__, m)); 5447 5448 /* 5449 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5450 * set by another thread while the object is locked. Thus, 5451 * if PGA_WRITEABLE is clear, no page table entries need updating. 5452 */ 5453 VM_OBJECT_ASSERT_WLOCKED(m->object); 5454 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5455 return; 5456 rw_wlock(&pvh_global_lock); 5457 sched_pin(); 5458 if ((m->flags & PG_FICTITIOUS) != 0) 5459 goto small_mappings; 5460 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5461 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5462 va = pv->pv_va; 5463 pmap = PV_PMAP(pv); 5464 PMAP_LOCK(pmap); 5465 pte1p = pmap_pte1(pmap, va); 5466 if (!(pte1_load(pte1p) & PTE1_RO)) 5467 (void)pmap_demote_pte1(pmap, pte1p, va); 5468 PMAP_UNLOCK(pmap); 5469 } 5470 small_mappings: 5471 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5472 pmap = PV_PMAP(pv); 5473 PMAP_LOCK(pmap); 5474 pte1p = pmap_pte1(pmap, pv->pv_va); 5475 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5476 " a section in page %p's pv list", __func__, m)); 5477 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5478 opte2 = pte2_load(pte2p); 5479 if (!(opte2 & PTE2_RO)) { 5480 pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); 5481 if (pte2_is_dirty(opte2)) 5482 vm_page_dirty(m); 5483 pmap_tlb_flush(pmap, pv->pv_va); 5484 } 5485 PMAP_UNLOCK(pmap); 5486 } 5487 vm_page_aflag_clear(m, PGA_WRITEABLE); 5488 sched_unpin(); 5489 rw_wunlock(&pvh_global_lock); 5490 } 5491 5492 /* 5493 * Apply the given advice to the specified range of addresses within the 5494 * given pmap. Depending on the advice, clear the referenced and/or 5495 * modified flags in each mapping and set the mapped page's dirty field. 
5496 */ 5497 void 5498 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5499 { 5500 pt1_entry_t *pte1p, opte1; 5501 pt2_entry_t *pte2p, pte2; 5502 vm_offset_t pdnxt; 5503 vm_page_t m; 5504 boolean_t pv_lists_locked; 5505 5506 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5507 return; 5508 if (pmap_is_current(pmap)) 5509 pv_lists_locked = FALSE; 5510 else { 5511 pv_lists_locked = TRUE; 5512 resume: 5513 rw_wlock(&pvh_global_lock); 5514 sched_pin(); 5515 } 5516 PMAP_LOCK(pmap); 5517 for (; sva < eva; sva = pdnxt) { 5518 pdnxt = pte1_trunc(sva + PTE1_SIZE); 5519 if (pdnxt < sva) 5520 pdnxt = eva; 5521 pte1p = pmap_pte1(pmap, sva); 5522 opte1 = pte1_load(pte1p); 5523 if (!pte1_is_valid(opte1)) /* XXX */ 5524 continue; 5525 else if (pte1_is_section(opte1)) { 5526 if (!pte1_is_managed(opte1)) 5527 continue; 5528 if (!pv_lists_locked) { 5529 pv_lists_locked = TRUE; 5530 if (!rw_try_wlock(&pvh_global_lock)) { 5531 PMAP_UNLOCK(pmap); 5532 goto resume; 5533 } 5534 sched_pin(); 5535 } 5536 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5537 /* 5538 * The large page mapping was destroyed. 5539 */ 5540 continue; 5541 } 5542 5543 /* 5544 * Unless the page mappings are wired, remove the 5545 * mapping to a single page so that a subsequent 5546 * access may repromote. Since the underlying L2 page 5547 * table is fully populated, this removal never 5548 * frees a L2 page table page. 5549 */ 5550 if (!pte1_is_wired(opte1)) { 5551 pte2p = pmap_pte2_quick(pmap, sva); 5552 KASSERT(pte2_is_valid(pte2_load(pte2p)), 5553 ("%s: invalid PTE2", __func__)); 5554 pmap_remove_pte2(pmap, pte2p, sva, NULL); 5555 } 5556 } 5557 if (pdnxt > eva) 5558 pdnxt = eva; 5559 for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, 5560 sva += PAGE_SIZE) { 5561 pte2 = pte2_load(pte2p); 5562 if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) 5563 continue; 5564 else if (pte2_is_dirty(pte2)) { 5565 if (advice == MADV_DONTNEED) { 5566 /* 5567 * Future calls to pmap_is_modified() 5568 * can be avoided by making the page 5569 * dirty now. 5570 */ 5571 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 5572 vm_page_dirty(m); 5573 } 5574 pte2_set_bit(pte2p, PTE2_NM); 5575 pte2_clear_bit(pte2p, PTE2_A); 5576 } else if ((pte2 & PTE2_A) != 0) 5577 pte2_clear_bit(pte2p, PTE2_A); 5578 else 5579 continue; 5580 pmap_tlb_flush(pmap, sva); 5581 } 5582 } 5583 if (pv_lists_locked) { 5584 sched_unpin(); 5585 rw_wunlock(&pvh_global_lock); 5586 } 5587 PMAP_UNLOCK(pmap); 5588 } 5589 5590 /* 5591 * Clear the modify bits on the specified physical page. 5592 */ 5593 void 5594 pmap_clear_modify(vm_page_t m) 5595 { 5596 struct md_page *pvh; 5597 pv_entry_t next_pv, pv; 5598 pmap_t pmap; 5599 pt1_entry_t *pte1p, opte1; 5600 pt2_entry_t *pte2p, opte2; 5601 vm_offset_t va; 5602 5603 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5604 ("%s: page %p is not managed", __func__, m)); 5605 VM_OBJECT_ASSERT_WLOCKED(m->object); 5606 KASSERT(!vm_page_xbusied(m), 5607 ("%s: page %p is exclusive busy", __func__, m)); 5608 5609 /* 5610 * If the page is not PGA_WRITEABLE, then no PTE2s can have PTE2_NM 5611 * cleared. If the object containing the page is locked and the page 5612 * is not exclusive busied, then PGA_WRITEABLE cannot be concurrently 5613 * set. 
5614 */ 5615 if ((m->aflags & PGA_WRITEABLE) == 0) 5616 return; 5617 rw_wlock(&pvh_global_lock); 5618 sched_pin(); 5619 if ((m->flags & PG_FICTITIOUS) != 0) 5620 goto small_mappings; 5621 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5622 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5623 va = pv->pv_va; 5624 pmap = PV_PMAP(pv); 5625 PMAP_LOCK(pmap); 5626 pte1p = pmap_pte1(pmap, va); 5627 opte1 = pte1_load(pte1p); 5628 if (!(opte1 & PTE1_RO)) { 5629 if (pmap_demote_pte1(pmap, pte1p, va) && 5630 !pte1_is_wired(opte1)) { 5631 /* 5632 * Write protect the mapping to a 5633 * single page so that a subsequent 5634 * write access may repromote. 5635 */ 5636 va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); 5637 pte2p = pmap_pte2_quick(pmap, va); 5638 opte2 = pte2_load(pte2p); 5639 if ((opte2 & PTE2_V)) { 5640 pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); 5641 vm_page_dirty(m); 5642 pmap_tlb_flush(pmap, va); 5643 } 5644 } 5645 } 5646 PMAP_UNLOCK(pmap); 5647 } 5648 small_mappings: 5649 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5650 pmap = PV_PMAP(pv); 5651 PMAP_LOCK(pmap); 5652 pte1p = pmap_pte1(pmap, pv->pv_va); 5653 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5654 " a section in page %p's pv list", __func__, m)); 5655 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5656 if (pte2_is_dirty(pte2_load(pte2p))) { 5657 pte2_set_bit(pte2p, PTE2_NM); 5658 pmap_tlb_flush(pmap, pv->pv_va); 5659 } 5660 PMAP_UNLOCK(pmap); 5661 } 5662 sched_unpin(); 5663 rw_wunlock(&pvh_global_lock); 5664 } 5665 5666 5667 /* 5668 * Sets the memory attribute for the specified page. 5669 */ 5670 void 5671 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5672 { 5673 pt2_entry_t *cmap2_pte2p; 5674 vm_memattr_t oma; 5675 vm_paddr_t pa; 5676 struct pcpu *pc; 5677 5678 oma = m->md.pat_mode; 5679 m->md.pat_mode = ma; 5680 5681 CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, 5682 VM_PAGE_TO_PHYS(m), oma, ma); 5683 if ((m->flags & PG_FICTITIOUS) != 0) 5684 return; 5685 #if 0 5686 /* 5687 * If "m" is a normal page, flush it from the cache. 5688 * 5689 * First, try to find an existing mapping of the page by sf 5690 * buffer. sf_buf_invalidate_cache() modifies mapping and 5691 * flushes the cache. 5692 */ 5693 if (sf_buf_invalidate_cache(m, oma)) 5694 return; 5695 #endif 5696 /* 5697 * If the page is not mapped by an sf buffer, map the page 5698 * transiently and do the invalidation. 5699 */ 5700 if (ma != oma) { 5701 pa = VM_PAGE_TO_PHYS(m); 5702 sched_pin(); 5703 pc = get_pcpu(); 5704 cmap2_pte2p = pc->pc_cmap2_pte2p; 5705 mtx_lock(&pc->pc_cmap_lock); 5706 if (pte2_load(cmap2_pte2p) != 0) 5707 panic("%s: CMAP2 busy", __func__); 5708 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 5709 vm_memattr_to_pte2(ma))); 5710 dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); 5711 pte2_clear(cmap2_pte2p); 5712 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5713 sched_unpin(); 5714 mtx_unlock(&pc->pc_cmap_lock); 5715 } 5716 } 5717 5718 /* 5719 * Miscellaneous support routines follow 5720 */ 5721 5722 /* 5723 * Returns TRUE if the given page is mapped individually or as part of 5724 * a 1mpage. Otherwise, returns FALSE.
5725 */ 5726 boolean_t 5727 pmap_page_is_mapped(vm_page_t m) 5728 { 5729 boolean_t rv; 5730 5731 if ((m->oflags & VPO_UNMANAGED) != 0) 5732 return (FALSE); 5733 rw_wlock(&pvh_global_lock); 5734 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5735 ((m->flags & PG_FICTITIOUS) == 0 && 5736 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5737 rw_wunlock(&pvh_global_lock); 5738 return (rv); 5739 } 5740 5741 /* 5742 * Returns true if the pmap's pv is one of the first 5743 * 16 pvs linked to from this page. This count may 5744 * be changed upwards or downwards in the future; it 5745 * is only necessary that true be returned for a small 5746 * subset of pmaps for proper page aging. 5747 */ 5748 boolean_t 5749 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5750 { 5751 struct md_page *pvh; 5752 pv_entry_t pv; 5753 int loops = 0; 5754 boolean_t rv; 5755 5756 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5757 ("%s: page %p is not managed", __func__, m)); 5758 rv = FALSE; 5759 rw_wlock(&pvh_global_lock); 5760 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5761 if (PV_PMAP(pv) == pmap) { 5762 rv = TRUE; 5763 break; 5764 } 5765 loops++; 5766 if (loops >= 16) 5767 break; 5768 } 5769 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5770 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5771 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5772 if (PV_PMAP(pv) == pmap) { 5773 rv = TRUE; 5774 break; 5775 } 5776 loops++; 5777 if (loops >= 16) 5778 break; 5779 } 5780 } 5781 rw_wunlock(&pvh_global_lock); 5782 return (rv); 5783 } 5784 5785 /* 5786 * pmap_zero_page zeros the specified hardware page by mapping 5787 * the page into KVM and using bzero to clear its contents. 5788 */ 5789 void 5790 pmap_zero_page(vm_page_t m) 5791 { 5792 pt2_entry_t *cmap2_pte2p; 5793 struct pcpu *pc; 5794 5795 sched_pin(); 5796 pc = get_pcpu(); 5797 cmap2_pte2p = pc->pc_cmap2_pte2p; 5798 mtx_lock(&pc->pc_cmap_lock); 5799 if (pte2_load(cmap2_pte2p) != 0) 5800 panic("%s: CMAP2 busy", __func__); 5801 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5802 vm_page_pte2_attr(m))); 5803 pagezero(pc->pc_cmap2_addr); 5804 pte2_clear(cmap2_pte2p); 5805 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5806 sched_unpin(); 5807 mtx_unlock(&pc->pc_cmap_lock); 5808 } 5809 5810 /* 5811 * pmap_zero_page_area zeros the specified hardware page by mapping 5812 * the page into KVM and using bzero to clear its contents. 5813 * 5814 * off and size may not cover an area beyond a single hardware page. 5815 */ 5816 void 5817 pmap_zero_page_area(vm_page_t m, int off, int size) 5818 { 5819 pt2_entry_t *cmap2_pte2p; 5820 struct pcpu *pc; 5821 5822 sched_pin(); 5823 pc = get_pcpu(); 5824 cmap2_pte2p = pc->pc_cmap2_pte2p; 5825 mtx_lock(&pc->pc_cmap_lock); 5826 if (pte2_load(cmap2_pte2p) != 0) 5827 panic("%s: CMAP2 busy", __func__); 5828 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5829 vm_page_pte2_attr(m))); 5830 if (off == 0 && size == PAGE_SIZE) 5831 pagezero(pc->pc_cmap2_addr); 5832 else 5833 bzero(pc->pc_cmap2_addr + off, size); 5834 pte2_clear(cmap2_pte2p); 5835 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5836 sched_unpin(); 5837 mtx_unlock(&pc->pc_cmap_lock); 5838 } 5839 5840 /* 5841 * pmap_copy_page copies the specified (machine independent) 5842 * page by mapping the page into virtual memory and using 5843 * bcopy to copy the page, one machine dependent page at a 5844 * time. 
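 * Both pages are mapped through the per-CPU CMAP1/CMAP2 slots, the source
 * read-only and the destination read/write, so no permanent KVA mapping
 * of either page is required.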
5845 */ 5846 void 5847 pmap_copy_page(vm_page_t src, vm_page_t dst) 5848 { 5849 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5850 struct pcpu *pc; 5851 5852 sched_pin(); 5853 pc = get_pcpu(); 5854 cmap1_pte2p = pc->pc_cmap1_pte2p; 5855 cmap2_pte2p = pc->pc_cmap2_pte2p; 5856 mtx_lock(&pc->pc_cmap_lock); 5857 if (pte2_load(cmap1_pte2p) != 0) 5858 panic("%s: CMAP1 busy", __func__); 5859 if (pte2_load(cmap2_pte2p) != 0) 5860 panic("%s: CMAP2 busy", __func__); 5861 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), 5862 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); 5863 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), 5864 PTE2_AP_KRW, vm_page_pte2_attr(dst))); 5865 bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); 5866 pte2_clear(cmap1_pte2p); 5867 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5868 pte2_clear(cmap2_pte2p); 5869 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5870 sched_unpin(); 5871 mtx_unlock(&pc->pc_cmap_lock); 5872 } 5873 5874 int unmapped_buf_allowed = 1; 5875 5876 void 5877 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5878 vm_offset_t b_offset, int xfersize) 5879 { 5880 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5881 vm_page_t a_pg, b_pg; 5882 char *a_cp, *b_cp; 5883 vm_offset_t a_pg_offset, b_pg_offset; 5884 struct pcpu *pc; 5885 int cnt; 5886 5887 sched_pin(); 5888 pc = get_pcpu(); 5889 cmap1_pte2p = pc->pc_cmap1_pte2p; 5890 cmap2_pte2p = pc->pc_cmap2_pte2p; 5891 mtx_lock(&pc->pc_cmap_lock); 5892 if (pte2_load(cmap1_pte2p) != 0) 5893 panic("pmap_copy_pages: CMAP1 busy"); 5894 if (pte2_load(cmap2_pte2p) != 0) 5895 panic("pmap_copy_pages: CMAP2 busy"); 5896 while (xfersize > 0) { 5897 a_pg = ma[a_offset >> PAGE_SHIFT]; 5898 a_pg_offset = a_offset & PAGE_MASK; 5899 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5900 b_pg = mb[b_offset >> PAGE_SHIFT]; 5901 b_pg_offset = b_offset & PAGE_MASK; 5902 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5903 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), 5904 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); 5905 tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); 5906 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), 5907 PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); 5908 tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); 5909 a_cp = pc->pc_cmap1_addr + a_pg_offset; 5910 b_cp = pc->pc_cmap2_addr + b_pg_offset; 5911 bcopy(a_cp, b_cp, cnt); 5912 a_offset += cnt; 5913 b_offset += cnt; 5914 xfersize -= cnt; 5915 } 5916 pte2_clear(cmap1_pte2p); 5917 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5918 pte2_clear(cmap2_pte2p); 5919 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5920 sched_unpin(); 5921 mtx_unlock(&pc->pc_cmap_lock); 5922 } 5923 5924 vm_offset_t 5925 pmap_quick_enter_page(vm_page_t m) 5926 { 5927 struct pcpu *pc; 5928 pt2_entry_t *pte2p; 5929 5930 critical_enter(); 5931 pc = get_pcpu(); 5932 pte2p = pc->pc_qmap_pte2p; 5933 5934 KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); 5935 5936 pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5937 vm_page_pte2_attr(m))); 5938 return (pc->pc_qmap_addr); 5939 } 5940 5941 void 5942 pmap_quick_remove_page(vm_offset_t addr) 5943 { 5944 struct pcpu *pc; 5945 pt2_entry_t *pte2p; 5946 5947 pc = get_pcpu(); 5948 pte2p = pc->pc_qmap_pte2p; 5949 5950 KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); 5951 KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); 5952 5953 pte2_clear(pte2p); 5954 tlb_flush(pc->pc_qmap_addr); 5955 critical_exit(); 5956 } 5957 5958 /* 5959 * Copy the range specified by 
src_addr/len 5960 * from the source map to the range dst_addr/len 5961 * in the destination map. 5962 * 5963 * This routine is only advisory and need not do anything. 5964 */ 5965 void 5966 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 5967 vm_offset_t src_addr) 5968 { 5969 struct spglist free; 5970 vm_offset_t addr; 5971 vm_offset_t end_addr = src_addr + len; 5972 vm_offset_t nextva; 5973 5974 if (dst_addr != src_addr) 5975 return; 5976 5977 if (!pmap_is_current(src_pmap)) 5978 return; 5979 5980 rw_wlock(&pvh_global_lock); 5981 if (dst_pmap < src_pmap) { 5982 PMAP_LOCK(dst_pmap); 5983 PMAP_LOCK(src_pmap); 5984 } else { 5985 PMAP_LOCK(src_pmap); 5986 PMAP_LOCK(dst_pmap); 5987 } 5988 sched_pin(); 5989 for (addr = src_addr; addr < end_addr; addr = nextva) { 5990 pt2_entry_t *src_pte2p, *dst_pte2p; 5991 vm_page_t dst_mpt2pg, src_mpt2pg; 5992 pt1_entry_t src_pte1; 5993 u_int pte1_idx; 5994 5995 KASSERT(addr < VM_MAXUSER_ADDRESS, 5996 ("%s: invalid to pmap_copy page tables", __func__)); 5997 5998 nextva = pte1_trunc(addr + PTE1_SIZE); 5999 if (nextva < addr) 6000 nextva = end_addr; 6001 6002 pte1_idx = pte1_index(addr); 6003 src_pte1 = src_pmap->pm_pt1[pte1_idx]; 6004 if (pte1_is_section(src_pte1)) { 6005 if ((addr & PTE1_OFFSET) != 0 || 6006 (addr + PTE1_SIZE) > end_addr) 6007 continue; 6008 if (dst_pmap->pm_pt1[pte1_idx] == 0 && 6009 (!pte1_is_managed(src_pte1) || 6010 pmap_pv_insert_pte1(dst_pmap, addr, 6011 pte1_pa(src_pte1)))) { 6012 dst_pmap->pm_pt1[pte1_idx] = src_pte1 & 6013 ~PTE1_W; 6014 dst_pmap->pm_stats.resident_count += 6015 PTE1_SIZE / PAGE_SIZE; 6016 pmap_pte1_mappings++; 6017 } 6018 continue; 6019 } else if (!pte1_is_link(src_pte1)) 6020 continue; 6021 6022 src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); 6023 6024 /* 6025 * We leave PT2s to be linked from PT1 even if they are not 6026 * referenced until all PT2s in a page are without reference. 6027 * 6028 * QQQ: It could be changed ... 6029 */ 6030 #if 0 /* single_pt2_link_is_cleared */ 6031 KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, 6032 ("%s: source page table page is unused", __func__)); 6033 #else 6034 if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) 6035 continue; 6036 #endif 6037 if (nextva > end_addr) 6038 nextva = end_addr; 6039 6040 src_pte2p = pt2map_entry(addr); 6041 while (addr < nextva) { 6042 pt2_entry_t temp_pte2; 6043 temp_pte2 = pte2_load(src_pte2p); 6044 /* 6045 * we only virtual copy managed pages 6046 */ 6047 if (pte2_is_managed(temp_pte2)) { 6048 dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, 6049 PMAP_ENTER_NOSLEEP); 6050 if (dst_mpt2pg == NULL) 6051 goto out; 6052 dst_pte2p = pmap_pte2_quick(dst_pmap, addr); 6053 if (!pte2_is_valid(pte2_load(dst_pte2p)) && 6054 pmap_try_insert_pv_entry(dst_pmap, addr, 6055 PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { 6056 /* 6057 * Clear the wired, modified, and 6058 * accessed (referenced) bits 6059 * during the copy. 
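 * (PTE2_A is the emulated referenced bit and PTE2_NM, when set, keeps the
 * mapping read-only in hardware so that the emulated modified bit stays
 * clear; the first access or write through the copied mapping is then
 * resolved in pmap_fault().)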
6060 */ 6061 temp_pte2 &= ~(PTE2_W | PTE2_A); 6062 temp_pte2 |= PTE2_NM; 6063 pte2_store(dst_pte2p, temp_pte2); 6064 dst_pmap->pm_stats.resident_count++; 6065 } else { 6066 SLIST_INIT(&free); 6067 if (pmap_unwire_pt2(dst_pmap, addr, 6068 dst_mpt2pg, &free)) { 6069 pmap_tlb_flush(dst_pmap, addr); 6070 vm_page_free_pages_toq(&free, 6071 false); 6072 } 6073 goto out; 6074 } 6075 if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= 6076 pt2_wirecount_get(src_mpt2pg, pte1_idx)) 6077 break; 6078 } 6079 addr += PAGE_SIZE; 6080 src_pte2p++; 6081 } 6082 } 6083 out: 6084 sched_unpin(); 6085 rw_wunlock(&pvh_global_lock); 6086 PMAP_UNLOCK(src_pmap); 6087 PMAP_UNLOCK(dst_pmap); 6088 } 6089 6090 /* 6091 * Increase the starting virtual address of the given mapping if a 6092 * different alignment might result in more section mappings. 6093 */ 6094 void 6095 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6096 vm_offset_t *addr, vm_size_t size) 6097 { 6098 vm_offset_t pte1_offset; 6099 6100 if (size < PTE1_SIZE) 6101 return; 6102 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6103 offset += ptoa(object->pg_color); 6104 pte1_offset = offset & PTE1_OFFSET; 6105 if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || 6106 (*addr & PTE1_OFFSET) == pte1_offset) 6107 return; 6108 if ((*addr & PTE1_OFFSET) < pte1_offset) 6109 *addr = pte1_trunc(*addr) + pte1_offset; 6110 else 6111 *addr = pte1_roundup(*addr) + pte1_offset; 6112 } 6113 6114 void 6115 pmap_activate(struct thread *td) 6116 { 6117 pmap_t pmap, oldpmap; 6118 u_int cpuid, ttb; 6119 6120 PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); 6121 6122 critical_enter(); 6123 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6124 oldpmap = PCPU_GET(curpmap); 6125 cpuid = PCPU_GET(cpuid); 6126 6127 #if defined(SMP) 6128 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6129 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6130 #else 6131 CPU_CLR(cpuid, &oldpmap->pm_active); 6132 CPU_SET(cpuid, &pmap->pm_active); 6133 #endif 6134 6135 ttb = pmap_ttb_get(pmap); 6136 6137 /* 6138 * pmap_activate is for the current thread on the current cpu 6139 */ 6140 td->td_pcb->pcb_pagedir = ttb; 6141 cp15_ttbr_set(ttb); 6142 PCPU_SET(curpmap, pmap); 6143 critical_exit(); 6144 } 6145 6146 /* 6147 * Perform the pmap work for mincore. 
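 * The return value is a mask of MINCORE_* flags describing the mapping at
 * "addr"; MINCORE_SUPER is added when the address is covered by a 1 MB
 * section mapping.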
6148 */ 6149 int 6150 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 6151 { 6152 pt1_entry_t *pte1p, pte1; 6153 pt2_entry_t *pte2p, pte2; 6154 vm_paddr_t pa; 6155 bool managed; 6156 int val; 6157 6158 PMAP_LOCK(pmap); 6159 retry: 6160 pte1p = pmap_pte1(pmap, addr); 6161 pte1 = pte1_load(pte1p); 6162 if (pte1_is_section(pte1)) { 6163 pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); 6164 managed = pte1_is_managed(pte1); 6165 val = MINCORE_SUPER | MINCORE_INCORE; 6166 if (pte1_is_dirty(pte1)) 6167 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6168 if (pte1 & PTE1_A) 6169 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6170 } else if (pte1_is_link(pte1)) { 6171 pte2p = pmap_pte2(pmap, addr); 6172 pte2 = pte2_load(pte2p); 6173 pmap_pte2_release(pte2p); 6174 pa = pte2_pa(pte2); 6175 managed = pte2_is_managed(pte2); 6176 val = MINCORE_INCORE; 6177 if (pte2_is_dirty(pte2)) 6178 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6179 if (pte2 & PTE2_A) 6180 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6181 } else { 6182 managed = false; 6183 val = 0; 6184 } 6185 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6186 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 6187 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 6188 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 6189 goto retry; 6190 } else 6191 PA_UNLOCK_COND(*locked_pa); 6192 PMAP_UNLOCK(pmap); 6193 return (val); 6194 } 6195 6196 void 6197 pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) 6198 { 6199 vm_offset_t sva; 6200 uint32_t l2attr; 6201 6202 KASSERT((size & PAGE_MASK) == 0, 6203 ("%s: device mapping not page-sized", __func__)); 6204 6205 sva = va; 6206 l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); 6207 while (size != 0) { 6208 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); 6209 va += PAGE_SIZE; 6210 pa += PAGE_SIZE; 6211 size -= PAGE_SIZE; 6212 } 6213 tlb_flush_range(sva, va - sva); 6214 } 6215 6216 void 6217 pmap_kremove_device(vm_offset_t va, vm_size_t size) 6218 { 6219 vm_offset_t sva; 6220 6221 KASSERT((size & PAGE_MASK) == 0, 6222 ("%s: device mapping not page-sized", __func__)); 6223 6224 sva = va; 6225 while (size != 0) { 6226 pmap_kremove(va); 6227 va += PAGE_SIZE; 6228 size -= PAGE_SIZE; 6229 } 6230 tlb_flush_range(sva, va - sva); 6231 } 6232 6233 void 6234 pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) 6235 { 6236 6237 pcb->pcb_pagedir = pmap_ttb_get(pmap); 6238 } 6239 6240 6241 /* 6242 * Clean L1 data cache range by physical address. 6243 * The range must be within a single page. 6244 */ 6245 static void 6246 pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) 6247 { 6248 pt2_entry_t *cmap2_pte2p; 6249 struct pcpu *pc; 6250 6251 KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, 6252 ("%s: not on single page", __func__)); 6253 6254 sched_pin(); 6255 pc = get_pcpu(); 6256 cmap2_pte2p = pc->pc_cmap2_pte2p; 6257 mtx_lock(&pc->pc_cmap_lock); 6258 if (pte2_load(cmap2_pte2p) != 0) 6259 panic("%s: CMAP2 busy", __func__); 6260 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); 6261 dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size); 6262 pte2_clear(cmap2_pte2p); 6263 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6264 sched_unpin(); 6265 mtx_unlock(&pc->pc_cmap_lock); 6266 } 6267 6268 /* 6269 * Sync instruction cache range which is not mapped yet. 
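 * The data is written back by physical address through a transient CMAP2
 * mapping (pmap_dcache_wb_pou()), one page at a time, before the whole
 * i-cache is invalidated.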
*/
6271 void
6272 cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size)
6273 {
6274 uint32_t len, offset;
6275 vm_page_t m;
6276
6277 /* Write back d-cache on given address range. */
6278 offset = pa & PAGE_MASK;
6279 for ( ; size != 0; size -= len, pa += len, offset = 0) {
6280 len = min(PAGE_SIZE - offset, size);
6281 m = PHYS_TO_VM_PAGE(pa);
6282 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x",
6283 __func__, pa));
6284 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m));
6285 }
6286 /*
6287 * The i-cache is VIPT. The only way to flush all virtual mappings
6288 * of a given physical address is to invalidate the whole i-cache.
6289 */
6290 icache_inv_all();
6291 }
6292
6293 void
6294 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size)
6295 {
6296
6297 /* Write back d-cache on given address range. */
6298 if (va >= VM_MIN_KERNEL_ADDRESS) {
6299 dcache_wb_pou(va, size);
6300 } else {
6301 uint32_t len, offset;
6302 vm_paddr_t pa;
6303 vm_page_t m;
6304
6305 offset = va & PAGE_MASK;
6306 for ( ; size != 0; size -= len, va += len, offset = 0) {
6307 pa = pmap_extract(pmap, va); /* offset is preserved */
6308 len = min(PAGE_SIZE - offset, size);
6309 m = PHYS_TO_VM_PAGE(pa);
6310 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x",
6311 __func__, pa));
6312 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m));
6313 }
6314 }
6315 /*
6316 * The i-cache is VIPT. The only way to flush all virtual mappings
6317 * of a given physical address is to invalidate the whole i-cache.
6318 */
6319 icache_inv_all();
6320 }
6321
6322 /*
6323 * The implementation of pmap_fault() uses the IN_RANGE2() macro, which
6324 * depends on the fact that the given range size is a power of 2.
6325 */
6326 CTASSERT(powerof2(NB_IN_PT1));
6327 CTASSERT(powerof2(PT2MAP_SIZE));
6328
6329 #define IN_RANGE2(addr, start, size) \
6330 ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1)))
6331
6332 /*
6333 * Handle access and R/W emulation faults.
6334 */
6335 int
6336 pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode)
6337 {
6338 pt1_entry_t *pte1p, pte1;
6339 pt2_entry_t *pte2p, pte2;
6340
6341 if (pmap == NULL)
6342 pmap = kernel_pmap;
6343
6344 /*
6345 * In the kernel, we should never get an abort with a FAR that lies in
6346 * the pmap->pm_pt1 or PT2MAP address ranges. If it happens, stop here,
6347 * print a useful abort message and drop into the debugger; otherwise
6348 * it likely ends in a never-ending loop of aborts.
6349 */
6350 if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) {
6351 /*
6352 * All L1 tables should always be mapped and present. However,
6353 * only the current one is checked here. For user mode, only a
6354 * permission abort caused by a malicious user is not fatal,
6355 * and neither is an alignment abort, which may have higher priority.
6356 */
6357 if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) {
6358 CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x",
6359 __func__, pmap, pmap->pm_pt1, far);
6360 panic("%s: pm_pt1 abort", __func__);
6361 }
6362 return (KERN_INVALID_ADDRESS);
6363 }
6364 if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) {
6365 /*
6366 * PT2MAP should always be mapped and present in the current
6367 * L1 table. However, only existing L2 tables are mapped in
6368 * PT2MAP. For user mode, only an L2 translation abort and a
6369 * permission abort from a malicious user are not fatal, and
6370 * neither is an alignment abort, which may have higher priority.
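 * (Assuming PT2MAP is aligned to PT2MAP_SIZE, IN_RANGE2(far, PT2MAP,
 * PT2MAP_SIZE) reduces to a single mask and compare:
 * (far & ~(PT2MAP_SIZE - 1)) == PT2MAP.)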
*/
6372 if (!usermode || (idx != FAULT_ALIGN &&
6373 idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) {
6374 CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x",
6375 __func__, pmap, PT2MAP, far);
6376 panic("%s: PT2MAP abort", __func__);
6377 }
6378 return (KERN_INVALID_ADDRESS);
6379 }
6380
6381 /*
6382 * The pmap lock is used below to handle access and R/W emulation
6383 * aborts. They used to be handled by atomic operations, so the new
6384 * situation needs some analysis to answer the following question:
6385 * is it safe to take the lock even for these aborts?
6386 *
6387 * In general, two cases may happen:
6388 *
6389 * (1) An abort while the pmap lock is already held - this should not
6390 * happen, as the pmap lock is not recursive. Moreover, only internal
6391 * kernel data should be accessed under the pmap lock, and such data
6392 * should be mapped with the A bit set and the NM bit cleared. If a
6393 * double abort does happen, the mapping of the data which caused it
6394 * must be fixed. Further, all new mappings are always made with the
6395 * A bit set, and the bit can be cleared only on managed mappings.
6396 *
6397 * (2) An abort while one or more other locks are held - this can
6398 * already happen. However, it makes no difference here whether it is
6399 * an access or R/W emulation abort, or some other abort.
6400 */
6401
6402 PMAP_LOCK(pmap);
6403 #ifdef INVARIANTS
6404 pte1 = pte1_load(pmap_pte1(pmap, far));
6405 if (pte1_is_link(pte1)) {
6406 /*
6407 * Check in advance that the associated L2 page table is mapped
6408 * into PT2MAP space. Note that a faulty access to an unmapped
6409 * L2 page table is caught by the more general check above,
6410 * which verifies that "far" does not lie in PT2MAP space. Note
6411 * also that the L1 page table and PT2TAB always exist and are mapped.
6412 */
6413 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far));
6414 if (!pte2_is_valid(pte2))
6415 panic("%s: missing L2 page table (%p, %#x)",
6416 __func__, pmap, far);
6417 }
6418 #endif
6419 #ifdef SMP
6420 /*
6421 * Special treatment is needed due to the break-before-make approach
6422 * used when a pte1 is updated for a userland mapping during section
6423 * promotion or demotion. If not caught here, pmap_enter() can find a
6424 * section mapping on the faulting address. That is not allowed.
6425 */
6426 if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) {
6427 PMAP_UNLOCK(pmap);
6428 return (KERN_SUCCESS);
6429 }
6430 #endif
6431 /*
6432 * Handle the access bits for page and section. Note that the entry
6433 * is not in the TLB yet, so a TLB flush is not necessary.
6434 *
6435 * QQQ: This is hardware emulation, we do not call userret()
6436 * for aborts from user mode.
6437 */
6438 if (idx == FAULT_ACCESS_L2) {
6439 pte1 = pte1_load(pmap_pte1(pmap, far));
6440 if (pte1_is_link(pte1)) {
6441 /* L2 page table should exist and be mapped. */
6442 pte2p = pt2map_entry(far);
6443 pte2 = pte2_load(pte2p);
6444 if (pte2_is_valid(pte2)) {
6445 pte2_store(pte2p, pte2 | PTE2_A);
6446 PMAP_UNLOCK(pmap);
6447 return (KERN_SUCCESS);
6448 }
6449 } else {
6450 /*
6451 * We got an L2 access fault, but PTE1 is not a link.
6452 * Probably some race happened; do nothing.
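 * Returning KERN_SUCCESS simply restarts the faulting instruction; if
 * the mapping is really gone, the retry will fault again and be handled
 * on its own merits.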
6453 */ 6454 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L2 - pmap %#x far %#x", 6455 __func__, pmap, far); 6456 PMAP_UNLOCK(pmap); 6457 return (KERN_SUCCESS); 6458 } 6459 } 6460 if (idx == FAULT_ACCESS_L1) { 6461 pte1p = pmap_pte1(pmap, far); 6462 pte1 = pte1_load(pte1p); 6463 if (pte1_is_section(pte1)) { 6464 pte1_store(pte1p, pte1 | PTE1_A); 6465 PMAP_UNLOCK(pmap); 6466 return (KERN_SUCCESS); 6467 } else { 6468 /* 6469 * We got L1 access fault but PTE1 is not section 6470 * mapping. Probably some race happened, do nothing. 6471 */ 6472 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L1 - pmap %#x far %#x", 6473 __func__, pmap, far); 6474 PMAP_UNLOCK(pmap); 6475 return (KERN_SUCCESS); 6476 } 6477 } 6478 6479 /* 6480 * Handle modify bits for page and section. Note that the modify 6481 * bit is emulated by software. So PTEx_RO is software read only 6482 * bit and PTEx_NM flag is real hardware read only bit. 6483 * 6484 * QQQ: This is hardware emulation, we do not call userret() 6485 * for aborts from user mode. 6486 */ 6487 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { 6488 pte1 = pte1_load(pmap_pte1(pmap, far)); 6489 if (pte1_is_link(pte1)) { 6490 /* L2 page table should exist and be mapped. */ 6491 pte2p = pt2map_entry(far); 6492 pte2 = pte2_load(pte2p); 6493 if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && 6494 (pte2 & PTE2_NM)) { 6495 pte2_store(pte2p, pte2 & ~PTE2_NM); 6496 tlb_flush(trunc_page(far)); 6497 PMAP_UNLOCK(pmap); 6498 return (KERN_SUCCESS); 6499 } 6500 } else { 6501 /* 6502 * We got L2 permission fault but PTE1 is not a link. 6503 * Probably some race happened, do nothing. 6504 */ 6505 CTR3(KTR_PMAP, "%s: FAULT_PERM_L2 - pmap %#x far %#x", 6506 __func__, pmap, far); 6507 PMAP_UNLOCK(pmap); 6508 return (KERN_SUCCESS); 6509 } 6510 } 6511 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { 6512 pte1p = pmap_pte1(pmap, far); 6513 pte1 = pte1_load(pte1p); 6514 if (pte1_is_section(pte1)) { 6515 if (!(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) { 6516 pte1_store(pte1p, pte1 & ~PTE1_NM); 6517 tlb_flush(pte1_trunc(far)); 6518 PMAP_UNLOCK(pmap); 6519 return (KERN_SUCCESS); 6520 } 6521 } else { 6522 /* 6523 * We got L1 permission fault but PTE1 is not section 6524 * mapping. Probably some race happened, do nothing. 6525 */ 6526 CTR3(KTR_PMAP, "%s: FAULT_PERM_L1 - pmap %#x far %#x", 6527 __func__, pmap, far); 6528 PMAP_UNLOCK(pmap); 6529 return (KERN_SUCCESS); 6530 } 6531 } 6532 6533 /* 6534 * QQQ: The previous code, mainly fast handling of access and 6535 * modify bits aborts, could be moved to ASM. Now we are 6536 * starting to deal with not fast aborts. 6537 */ 6538 PMAP_UNLOCK(pmap); 6539 return (KERN_FAILURE); 6540 } 6541 6542 #if defined(PMAP_DEBUG) 6543 /* 6544 * Reusing of KVA used in pmap_zero_page function !!! 
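 * The page is mapped through the same per-CPU CMAP2 slot that
 * pmap_zero_page() uses, and the function panics if any word of the page
 * is not zero.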
6545 */ 6546 static void 6547 pmap_zero_page_check(vm_page_t m) 6548 { 6549 pt2_entry_t *cmap2_pte2p; 6550 uint32_t *p, *end; 6551 struct pcpu *pc; 6552 6553 sched_pin(); 6554 pc = get_pcpu(); 6555 cmap2_pte2p = pc->pc_cmap2_pte2p; 6556 mtx_lock(&pc->pc_cmap_lock); 6557 if (pte2_load(cmap2_pte2p) != 0) 6558 panic("%s: CMAP2 busy", __func__); 6559 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6560 vm_page_pte2_attr(m))); 6561 end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); 6562 for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) 6563 if (*p != 0) 6564 panic("%s: page %p not zero, va: %p", __func__, m, 6565 pc->pc_cmap2_addr); 6566 pte2_clear(cmap2_pte2p); 6567 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6568 sched_unpin(); 6569 mtx_unlock(&pc->pc_cmap_lock); 6570 } 6571 6572 int 6573 pmap_pid_dump(int pid) 6574 { 6575 pmap_t pmap; 6576 struct proc *p; 6577 int npte2 = 0; 6578 int i, j, index; 6579 6580 sx_slock(&allproc_lock); 6581 FOREACH_PROC_IN_SYSTEM(p) { 6582 if (p->p_pid != pid || p->p_vmspace == NULL) 6583 continue; 6584 index = 0; 6585 pmap = vmspace_pmap(p->p_vmspace); 6586 for (i = 0; i < NPTE1_IN_PT1; i++) { 6587 pt1_entry_t pte1; 6588 pt2_entry_t *pte2p, pte2; 6589 vm_offset_t base, va; 6590 vm_paddr_t pa; 6591 vm_page_t m; 6592 6593 base = i << PTE1_SHIFT; 6594 pte1 = pte1_load(&pmap->pm_pt1[i]); 6595 6596 if (pte1_is_section(pte1)) { 6597 /* 6598 * QQQ: Do something here! 6599 */ 6600 } else if (pte1_is_link(pte1)) { 6601 for (j = 0; j < NPTE2_IN_PT2; j++) { 6602 va = base + (j << PAGE_SHIFT); 6603 if (va >= VM_MIN_KERNEL_ADDRESS) { 6604 if (index) { 6605 index = 0; 6606 printf("\n"); 6607 } 6608 sx_sunlock(&allproc_lock); 6609 return (npte2); 6610 } 6611 pte2p = pmap_pte2(pmap, va); 6612 pte2 = pte2_load(pte2p); 6613 pmap_pte2_release(pte2p); 6614 if (!pte2_is_valid(pte2)) 6615 continue; 6616 6617 pa = pte2_pa(pte2); 6618 m = PHYS_TO_VM_PAGE(pa); 6619 printf("va: 0x%x, pa: 0x%x, h: %d, w:" 6620 " %d, f: 0x%x", va, pa, 6621 m->hold_count, m->wire_count, 6622 m->flags); 6623 npte2++; 6624 index++; 6625 if (index >= 2) { 6626 index = 0; 6627 printf("\n"); 6628 } else { 6629 printf(" "); 6630 } 6631 } 6632 } 6633 } 6634 } 6635 sx_sunlock(&allproc_lock); 6636 return (npte2); 6637 } 6638 6639 #endif 6640 6641 #ifdef DDB 6642 static pt2_entry_t * 6643 pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) 6644 { 6645 pt1_entry_t pte1; 6646 vm_paddr_t pt2pg_pa; 6647 6648 pte1 = pte1_load(pmap_pte1(pmap, va)); 6649 if (!pte1_is_link(pte1)) 6650 return (NULL); 6651 6652 if (pmap_is_current(pmap)) 6653 return (pt2map_entry(va)); 6654 6655 /* Note that L2 page table size is not equal to PAGE_SIZE. 
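 * Several hardware L2 tables share a single page, so the whole page that
 * contains the needed table is mapped at PADDR3 and the entry is then
 * selected by the low bits of the virtual page number.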
*/ 6656 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 6657 if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { 6658 pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); 6659 #ifdef SMP 6660 PMAP3cpu = PCPU_GET(cpuid); 6661 #endif 6662 tlb_flush_local((vm_offset_t)PADDR3); 6663 } 6664 #ifdef SMP 6665 else if (PMAP3cpu != PCPU_GET(cpuid)) { 6666 PMAP3cpu = PCPU_GET(cpuid); 6667 tlb_flush_local((vm_offset_t)PADDR3); 6668 } 6669 #endif 6670 return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 6671 } 6672 6673 static void 6674 dump_pmap(pmap_t pmap) 6675 { 6676 6677 printf("pmap %p\n", pmap); 6678 printf(" pm_pt1: %p\n", pmap->pm_pt1); 6679 printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); 6680 printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); 6681 } 6682 6683 DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) 6684 { 6685 6686 pmap_t pmap; 6687 LIST_FOREACH(pmap, &allpmaps, pm_list) { 6688 dump_pmap(pmap); 6689 } 6690 } 6691 6692 static int 6693 pte2_class(pt2_entry_t pte2) 6694 { 6695 int cls; 6696 6697 cls = (pte2 >> 2) & 0x03; 6698 cls |= (pte2 >> 4) & 0x04; 6699 return (cls); 6700 } 6701 6702 static void 6703 dump_section(pmap_t pmap, uint32_t pte1_idx) 6704 { 6705 } 6706 6707 static void 6708 dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) 6709 { 6710 uint32_t i; 6711 vm_offset_t va; 6712 pt2_entry_t *pte2p, pte2; 6713 vm_page_t m; 6714 6715 va = pte1_idx << PTE1_SHIFT; 6716 pte2p = pmap_pte2_ddb(pmap, va); 6717 for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { 6718 pte2 = pte2_load(pte2p); 6719 if (pte2 == 0) 6720 continue; 6721 if (!pte2_is_valid(pte2)) { 6722 printf(" 0x%08X: 0x%08X", va, pte2); 6723 if (!invalid_ok) 6724 printf(" - not valid !!!"); 6725 printf("\n"); 6726 continue; 6727 } 6728 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 6729 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, 6730 pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); 6731 if (m != NULL) { 6732 printf(" v:%d h:%d w:%d f:0x%04X\n", m->valid, 6733 m->hold_count, m->wire_count, m->flags); 6734 } else { 6735 printf("\n"); 6736 } 6737 } 6738 } 6739 6740 static __inline boolean_t 6741 is_pv_chunk_space(vm_offset_t va) 6742 { 6743 6744 if ((((vm_offset_t)pv_chunkbase) <= va) && 6745 (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) 6746 return (TRUE); 6747 return (FALSE); 6748 } 6749 6750 DB_SHOW_COMMAND(pmap, pmap_pmap_print) 6751 { 6752 /* XXX convert args. */ 6753 pmap_t pmap = (pmap_t)addr; 6754 pt1_entry_t pte1; 6755 pt2_entry_t pte2; 6756 vm_offset_t va, eva; 6757 vm_page_t m; 6758 uint32_t i; 6759 boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; 6760 6761 if (have_addr) { 6762 pmap_t pm; 6763 6764 LIST_FOREACH(pm, &allpmaps, pm_list) 6765 if (pm == pmap) break; 6766 if (pm == NULL) { 6767 printf("given pmap %p is not in allpmaps list\n", pmap); 6768 return; 6769 } 6770 } else 6771 pmap = PCPU_GET(curpmap); 6772 6773 eva = (modif[0] == 'u') ? 
VM_MAXUSER_ADDRESS : 0xFFFFFFFF; 6774 dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ 6775 6776 printf("pmap: 0x%08X\n", (uint32_t)pmap); 6777 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6778 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6779 6780 for(i = 0; i < NPTE1_IN_PT1; i++) { 6781 pte1 = pte1_load(&pmap->pm_pt1[i]); 6782 if (pte1 == 0) 6783 continue; 6784 va = i << PTE1_SHIFT; 6785 if (va >= eva) 6786 break; 6787 6788 if (pte1_is_section(pte1)) { 6789 printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, 6790 !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); 6791 dump_section(pmap, i); 6792 } else if (pte1_is_link(pte1)) { 6793 dump_link_ok = TRUE; 6794 invalid_ok = FALSE; 6795 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6796 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 6797 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", 6798 va, pte1, pte2, m); 6799 if (is_pv_chunk_space(va)) { 6800 printf(" - pv_chunk space"); 6801 if (dump_pv_chunk) 6802 invalid_ok = TRUE; 6803 else 6804 dump_link_ok = FALSE; 6805 } 6806 else if (m != NULL) 6807 printf(" w:%d w2:%u", m->wire_count, 6808 pt2_wirecount_get(m, pte1_index(va))); 6809 if (pte2 == 0) 6810 printf(" !!! pt2tab entry is ZERO"); 6811 else if (pte2_pa(pte1) != pte2_pa(pte2)) 6812 printf(" !!! pt2tab entry is DIFFERENT - m: %p", 6813 PHYS_TO_VM_PAGE(pte2_pa(pte2))); 6814 printf("\n"); 6815 if (dump_link_ok) 6816 dump_link(pmap, i, invalid_ok); 6817 } else 6818 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6819 } 6820 } 6821 6822 static void 6823 dump_pt2tab(pmap_t pmap) 6824 { 6825 uint32_t i; 6826 pt2_entry_t pte2; 6827 vm_offset_t va; 6828 vm_paddr_t pa; 6829 vm_page_t m; 6830 6831 printf("PT2TAB:\n"); 6832 for (i = 0; i < PT2TAB_ENTRIES; i++) { 6833 pte2 = pte2_load(&pmap->pm_pt2tab[i]); 6834 if (!pte2_is_valid(pte2)) 6835 continue; 6836 va = i << PT2TAB_SHIFT; 6837 pa = pte2_pa(pte2); 6838 m = PHYS_TO_VM_PAGE(pa); 6839 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, 6840 pte2_class(pte2), !!(pte2 & PTE2_S), m); 6841 if (m != NULL) 6842 printf(" , h: %d, w: %d, f: 0x%04X pidx: %lld", 6843 m->hold_count, m->wire_count, m->flags, m->pindex); 6844 printf("\n"); 6845 } 6846 } 6847 6848 DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) 6849 { 6850 /* XXX convert args. */ 6851 pmap_t pmap = (pmap_t)addr; 6852 pt1_entry_t pte1; 6853 pt2_entry_t pte2; 6854 vm_offset_t va; 6855 uint32_t i, start; 6856 6857 if (have_addr) { 6858 printf("supported only on current pmap\n"); 6859 return; 6860 } 6861 6862 pmap = PCPU_GET(curpmap); 6863 printf("curpmap: 0x%08X\n", (uint32_t)pmap); 6864 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6865 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6866 6867 start = pte1_index((vm_offset_t)PT2MAP); 6868 for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { 6869 pte1 = pte1_load(&pmap->pm_pt1[i]); 6870 if (pte1 == 0) 6871 continue; 6872 va = i << PTE1_SHIFT; 6873 if (pte1_is_section(pte1)) { 6874 printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, 6875 !!(pte1 & PTE1_S)); 6876 dump_section(pmap, i); 6877 } else if (pte1_is_link(pte1)) { 6878 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6879 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, 6880 pte1, pte2); 6881 if (pte2 == 0) 6882 printf(" !!! pt2tab entry is ZERO\n"); 6883 } else 6884 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6885 } 6886 dump_pt2tab(pmap); 6887 } 6888 #endif 6889
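/*
 * The block below is an illustrative sketch only and is excluded from the
 * build. It mirrors, in plain C, two pieces of arithmetic this file relies
 * on: the split of a virtual address into a 1 MB L1 (section) index and a
 * 4 KB L2 (page) index, and the power-of-two range test behind IN_RANGE2().
 * The EX_* constants are assumptions matching the usual ARMv6/v7
 * short-descriptor layout, not values taken from the pmap headers.
 */
#if 0	/* illustrative sketch, not part of the kernel build */
#include <stdint.h>
#include <stdio.h>

/* Assumed values mirroring the ARMv6/v7 short-descriptor layout. */
#define EX_PAGE_SHIFT	12			/* 4 KB pages */
#define EX_PTE1_SHIFT	20			/* 1 MB sections */
#define EX_PTE1_SIZE	(1U << EX_PTE1_SHIFT)
#define EX_PTE1_OFFSET	(EX_PTE1_SIZE - 1)

/* Same shape as IN_RANGE2(): valid only when 'start' is aligned to 'size'. */
#define EX_IN_RANGE2(addr, start, size) \
	((uint32_t)(start) == ((uint32_t)(addr) & ~((uint32_t)(size) - 1)))

int
main(void)
{
	uint32_t va = 0x2468ace0;

	/* L1 (section) index and L2 (page) index of a virtual address. */
	printf("L1 index: %u\n", (unsigned)(va >> EX_PTE1_SHIFT));
	printf("L2 index: %u\n",
	    (unsigned)((va & EX_PTE1_OFFSET) >> EX_PAGE_SHIFT));
	/* Power-of-two range check against a 1 MB aligned block. */
	printf("within 1 MB block at 0x24600000: %d\n",
	    EX_IN_RANGE2(va, 0x24600000, EX_PTE1_SIZE));
	return (0);
}
#endif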