1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * Copyright (c) 1994 John S. Dyson 6 * Copyright (c) 1994 David Greenman 7 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 8 * Copyright (c) 2014-2016 Svatopluk Kraus <skra@FreeBSD.org> 9 * Copyright (c) 2014-2016 Michal Meloun <mmel@FreeBSD.org> 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 41 */ 42 /*- 43 * Copyright (c) 2003 Networks Associates Technology, Inc. 44 * All rights reserved. 45 * 46 * This software was developed for the FreeBSD Project by Jake Burkholder, 47 * Safeport Network Services, and Network Associates Laboratories, the 48 * Security Research Division of Network Associates, Inc. under 49 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 50 * CHATS research program. 51 * 52 * Redistribution and use in source and binary forms, with or without 53 * modification, are permitted provided that the following conditions 54 * are met: 55 * 1. Redistributions of source code must retain the above copyright 56 * notice, this list of conditions and the following disclaimer. 57 * 2. Redistributions in binary form must reproduce the above copyright 58 * notice, this list of conditions and the following disclaimer in the 59 * documentation and/or other materials provided with the distribution. 60 * 61 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 64 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 71 * SUCH DAMAGE. 72 */ 73 74 #include <sys/cdefs.h> 75 __FBSDID("$FreeBSD$"); 76 77 /* 78 * Manages physical address maps. 79 * 80 * Since the information managed by this module is 81 * also stored by the logical address mapping module, 82 * this module may throw away valid virtual-to-physical 83 * mappings at almost any time. However, invalidations 84 * of virtual-to-physical mappings must be done as 85 * requested. 86 * 87 * In order to cope with hardware architectures which 88 * make virtual-to-physical map invalidates expensive, 89 * this module may delay invalidate or reduced protection 90 * operations until such time as they are actually 91 * necessary. This module is given full information as 92 * to which processors are currently using which maps, 93 * and to when physical maps must be made correct. 
94 */ 95 96 #include "opt_vm.h" 97 #include "opt_pmap.h" 98 #include "opt_ddb.h" 99 100 #include <sys/param.h> 101 #include <sys/systm.h> 102 #include <sys/kernel.h> 103 #include <sys/ktr.h> 104 #include <sys/lock.h> 105 #include <sys/proc.h> 106 #include <sys/rwlock.h> 107 #include <sys/malloc.h> 108 #include <sys/vmmeter.h> 109 #include <sys/malloc.h> 110 #include <sys/mman.h> 111 #include <sys/sf_buf.h> 112 #include <sys/smp.h> 113 #include <sys/sched.h> 114 #include <sys/sysctl.h> 115 116 #ifdef DDB 117 #include <ddb/ddb.h> 118 #endif 119 120 #include <vm/vm.h> 121 #include <vm/uma.h> 122 #include <vm/pmap.h> 123 #include <vm/vm_param.h> 124 #include <vm/vm_kern.h> 125 #include <vm/vm_object.h> 126 #include <vm/vm_map.h> 127 #include <vm/vm_page.h> 128 #include <vm/vm_pageout.h> 129 #include <vm/vm_phys.h> 130 #include <vm/vm_extern.h> 131 #include <vm/vm_reserv.h> 132 #include <sys/lock.h> 133 #include <sys/mutex.h> 134 135 #include <machine/md_var.h> 136 #include <machine/pmap_var.h> 137 #include <machine/cpu.h> 138 #include <machine/pcb.h> 139 #include <machine/sf_buf.h> 140 #ifdef SMP 141 #include <machine/smp.h> 142 #endif 143 #ifndef PMAP_SHPGPERPROC 144 #define PMAP_SHPGPERPROC 200 145 #endif 146 147 #ifndef DIAGNOSTIC 148 #define PMAP_INLINE __inline 149 #else 150 #define PMAP_INLINE 151 #endif 152 153 #ifdef PMAP_DEBUG 154 static void pmap_zero_page_check(vm_page_t m); 155 void pmap_debug(int level); 156 int pmap_pid_dump(int pid); 157 158 #define PDEBUG(_lev_,_stat_) \ 159 if (pmap_debug_level >= (_lev_)) \ 160 ((_stat_)) 161 #define dprintf printf 162 int pmap_debug_level = 1; 163 #else /* PMAP_DEBUG */ 164 #define PDEBUG(_lev_,_stat_) /* Nothing */ 165 #define dprintf(x, arg...) 166 #endif /* PMAP_DEBUG */ 167 168 /* 169 * Level 2 page tables map definion ('max' is excluded). 
*/

/* Whole PT2MAP window: [PT2MAP, PT2MAP + PT2MAP_SIZE). */
#define PT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define PT2V_MAX_ADDRESS	((vm_offset_t)PT2MAP + PT2MAP_SIZE)

/*
 * Leading part of the PT2MAP window, bounded by an offset derived from
 * KERNBASE (presumably the part that describes user addresses - confirm
 * against the users of these macros).
 * NOTE(review): the upper bound is computed with pointer arithmetic on
 * PT2MAP (a pt2_entry_t *) before the cast, whereas PT2V_MAX_ADDRESS adds
 * a byte count after the cast.
 */
#define UPT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define UPT2V_MAX_ADDRESS \
    ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT)))

/*
 * Promotion to a 1MB (PTE1) page mapping requires that the corresponding
 * 4KB (PTE2) page mappings have identical settings for the following fields:
 */
#define PTE2_PROMOTE	(PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG |	\
			 PTE2_NX | PTE2_RO | PTE2_U | PTE2_W |		\
			 PTE2_ATTR_MASK)

/* The same set of fields, expressed with the PTE1 (section) bit names. */
#define PTE1_PROMOTE	(PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG |	\
			 PTE1_NX | PTE1_RO | PTE1_U | PTE1_W |		\
			 PTE1_ATTR_MASK)

/*
 * Translate attribute bits of a PTE2 (4KB mapping) into the equivalent
 * PTE1 (1MB section) bits.  Every source bit is tested on its own and
 * replaced by its L1 counterpart; bits that are not listed are dropped.
 */
#define ATTR_TO_L1(l2_attr)	((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \
				 (((l2_attr) & L2_C)    ? L1_S_C    : 0) | \
				 (((l2_attr) & L2_B)    ? L1_S_B    : 0) | \
				 (((l2_attr) & PTE2_A)  ? PTE1_A    : 0) | \
				 (((l2_attr) & PTE2_NM) ? PTE1_NM   : 0) | \
				 (((l2_attr) & PTE2_S)  ? PTE1_S    : 0) | \
				 (((l2_attr) & PTE2_NG) ? PTE1_NG   : 0) | \
				 (((l2_attr) & PTE2_NX) ? PTE1_NX   : 0) | \
				 (((l2_attr) & PTE2_RO) ? PTE1_RO   : 0) | \
				 (((l2_attr) & PTE2_U)  ? PTE1_U    : 0) | \
				 (((l2_attr) & PTE2_W)  ? PTE1_W    : 0))

/* Inverse of ATTR_TO_L1(): PTE1 attribute bits to PTE2 attribute bits. */
#define ATTR_TO_L2(l1_attr)	((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \
				 (((l1_attr) & L1_S_C)    ? L2_C    : 0) | \
				 (((l1_attr) & L1_S_B)    ? L2_B    : 0) | \
				 (((l1_attr) & PTE1_A)    ? PTE2_A  : 0) | \
				 (((l1_attr) & PTE1_NM)   ? PTE2_NM : 0) | \
				 (((l1_attr) & PTE1_S)    ? PTE2_S  : 0) | \
				 (((l1_attr) & PTE1_NG)   ? PTE2_NG : 0) | \
				 (((l1_attr) & PTE1_NX)   ? PTE2_NX : 0) | \
				 (((l1_attr) & PTE1_RO)   ? PTE2_RO : 0) | \
				 (((l1_attr) & PTE1_U)    ? PTE2_U  : 0) | \
				 (((l1_attr) & PTE1_W)    ? PTE2_W  : 0))

/*
 * PTE2 descriptors creation macros.
*/
/* PTE2 attribute bits for the default memory type ... */
#define PTE2_ATTR_DEFAULT	vm_memattr_to_pte2(VM_MEMATTR_DEFAULT)
/* ... and for the memory type currently selected for page tables. */
#define PTE2_ATTR_PT		vm_memattr_to_pte2(pt_memattr)

/*
 * Kernel read/write PTE2 for a page that holds page tables.  The _NG form
 * goes through PTE2_KERN_NG() (defined elsewhere; presumably it builds a
 * not-global entry - confirm).
 */
#define PTE2_KPT(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT)
#define PTE2_KPT_NG(pa)	PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT)

/* Kernel read/write and kernel read-only PTE2 with default attributes. */
#define PTE2_KRW(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT)
#define PTE2_KRO(pa)	PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT)

/*
 * PV_STATS is defined unconditionally right here, so PV_STAT(x) always
 * expands to the statement x; the empty variant is only reached if the
 * define below is removed.
 */
#define PV_STATS
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

/*
 * The boot_pt1 is used temporarily in very early boot stage as L1 page table.
 * We can init many things with no memory allocation thanks to its static
 * allocation and this brings two main advantages:
 * (1) other cores can be started very simply,
 * (2) various boot loaders can be supported as its arguments can be processed
 *     in virtual address space and can be moved to safe location before
 *     first allocation happened.
 * Only disadvantage is that boot_pt1 is used only in very early boot stage.
 * However, the table is uninitialized and so lies in bss. Therefore kernel
 * image size is not influenced.
 *
 * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and
 *	CPU suspend/resume game.
248 */ 249 extern pt1_entry_t boot_pt1[]; 250 251 vm_paddr_t base_pt1; 252 pt1_entry_t *kern_pt1; 253 pt2_entry_t *kern_pt2tab; 254 pt2_entry_t *PT2MAP; 255 256 static uint32_t ttb_flags; 257 static vm_memattr_t pt_memattr; 258 ttb_entry_t pmap_kern_ttb; 259 260 struct pmap kernel_pmap_store; 261 LIST_HEAD(pmaplist, pmap); 262 static struct pmaplist allpmaps; 263 static struct mtx allpmaps_lock; 264 265 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 266 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 267 268 static vm_offset_t kernel_vm_end_new; 269 vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; 270 vm_offset_t vm_max_kernel_address; 271 vm_paddr_t kernel_l1pa; 272 273 static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; 274 275 /* 276 * Data for the pv entry allocation mechanism 277 */ 278 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 279 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 280 static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ 281 static int shpgperproc = PMAP_SHPGPERPROC; 282 283 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 284 int pv_maxchunks; /* How many chunks we have KVA for */ 285 vm_offset_t pv_vafree; /* freelist stored in the PTE */ 286 287 vm_paddr_t first_managed_pa; 288 #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) 289 290 /* 291 * All those kernel PT submaps that BSD is so fond of 292 */ 293 caddr_t _tmppt = 0; 294 295 /* 296 * Crashdump maps. 
297 */ 298 static caddr_t crashdumpmap; 299 300 static pt2_entry_t *PMAP1 = NULL, *PMAP2; 301 static pt2_entry_t *PADDR1 = NULL, *PADDR2; 302 #ifdef DDB 303 static pt2_entry_t *PMAP3; 304 static pt2_entry_t *PADDR3; 305 static int PMAP3cpu __unused; /* for SMP only */ 306 #endif 307 #ifdef SMP 308 static int PMAP1cpu; 309 static int PMAP1changedcpu; 310 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 311 &PMAP1changedcpu, 0, 312 "Number of times pmap_pte2_quick changed CPU with same PMAP1"); 313 #endif 314 static int PMAP1changed; 315 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 316 &PMAP1changed, 0, 317 "Number of times pmap_pte2_quick changed PMAP1"); 318 static int PMAP1unchanged; 319 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 320 &PMAP1unchanged, 0, 321 "Number of times pmap_pte2_quick didn't change PMAP1"); 322 static struct mtx PMAP2mutex; 323 324 /* 325 * Internal flags for pmap_enter()'s helper functions. 326 */ 327 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 328 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 329 330 static __inline void pt2_wirecount_init(vm_page_t m); 331 static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, 332 vm_offset_t va); 333 static int pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, 334 u_int flags, vm_page_t m); 335 void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); 336 337 /* 338 * Function to set the debug level of the pmap code. 339 */ 340 #ifdef PMAP_DEBUG 341 void 342 pmap_debug(int level) 343 { 344 345 pmap_debug_level = level; 346 dprintf("pmap_debug: level=%d\n", pmap_debug_level); 347 } 348 #endif /* PMAP_DEBUG */ 349 350 /* 351 * This table must corespond with memory attribute configuration in vm.h. 352 * First entry is used for normal system mapping. 353 * 354 * Device memory is always marked as shared. 355 * Normal memory is shared only in SMP . 
356 * Not outer shareable bits are not used yet. 357 * Class 6 cannot be used on ARM11. 358 */ 359 #define TEXDEF_TYPE_SHIFT 0 360 #define TEXDEF_TYPE_MASK 0x3 361 #define TEXDEF_INNER_SHIFT 2 362 #define TEXDEF_INNER_MASK 0x3 363 #define TEXDEF_OUTER_SHIFT 4 364 #define TEXDEF_OUTER_MASK 0x3 365 #define TEXDEF_NOS_SHIFT 6 366 #define TEXDEF_NOS_MASK 0x1 367 368 #define TEX(t, i, o, s) \ 369 ((t) << TEXDEF_TYPE_SHIFT) | \ 370 ((i) << TEXDEF_INNER_SHIFT) | \ 371 ((o) << TEXDEF_OUTER_SHIFT | \ 372 ((s) << TEXDEF_NOS_SHIFT)) 373 374 static uint32_t tex_class[8] = { 375 /* type inner cache outer cache */ 376 TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ 377 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ 378 TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ 379 TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ 380 TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ 381 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ 382 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ 383 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ 384 }; 385 #undef TEX 386 387 static uint32_t pte2_attr_tab[8] = { 388 PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ 389 PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ 390 PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ 391 PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ 392 PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 393 0, /* 5 - NOT USED YET */ 394 0, /* 6 - NOT USED YET */ 395 0 /* 7 - NOT USED YET */ 396 }; 397 CTASSERT(VM_MEMATTR_WB_WA == 0); 398 CTASSERT(VM_MEMATTR_NOCACHE == 1); 399 CTASSERT(VM_MEMATTR_DEVICE == 2); 400 CTASSERT(VM_MEMATTR_SO == 3); 401 CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); 402 #define VM_MEMATTR_END (VM_MEMATTR_WRITE_THROUGH + 1) 403 404 boolean_t 405 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 406 { 407 408 return (mode >= 0 && mode < VM_MEMATTR_END); 409 } 410 411 static inline uint32_t 412 vm_memattr_to_pte2(vm_memattr_t ma) 413 { 414 415 
KASSERT((u_int)ma < VM_MEMATTR_END, 416 ("%s: bad vm_memattr_t %d", __func__, ma)); 417 return (pte2_attr_tab[(u_int)ma]); 418 } 419 420 static inline uint32_t 421 vm_page_pte2_attr(vm_page_t m) 422 { 423 424 return (vm_memattr_to_pte2(m->md.pat_mode)); 425 } 426 427 /* 428 * Convert TEX definition entry to TTB flags. 429 */ 430 static uint32_t 431 encode_ttb_flags(int idx) 432 { 433 uint32_t inner, outer, nos, reg; 434 435 inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & 436 TEXDEF_INNER_MASK; 437 outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & 438 TEXDEF_OUTER_MASK; 439 nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & 440 TEXDEF_NOS_MASK; 441 442 reg = nos << 5; 443 reg |= outer << 3; 444 if (cpuinfo.coherent_walk) 445 reg |= (inner & 0x1) << 6; 446 reg |= (inner & 0x2) >> 1; 447 #ifdef SMP 448 ARM_SMP_UP( 449 reg |= 1 << 1, 450 ); 451 #endif 452 return reg; 453 } 454 455 /* 456 * Set TEX remapping registers in current CPU. 457 */ 458 void 459 pmap_set_tex(void) 460 { 461 uint32_t prrr, nmrr; 462 uint32_t type, inner, outer, nos; 463 int i; 464 465 #ifdef PMAP_PTE_NOCACHE 466 /* XXX fixme */ 467 if (cpuinfo.coherent_walk) { 468 pt_memattr = VM_MEMATTR_WB_WA; 469 ttb_flags = encode_ttb_flags(0); 470 } 471 else { 472 pt_memattr = VM_MEMATTR_NOCACHE; 473 ttb_flags = encode_ttb_flags(1); 474 } 475 #else 476 pt_memattr = VM_MEMATTR_WB_WA; 477 ttb_flags = encode_ttb_flags(0); 478 #endif 479 480 prrr = 0; 481 nmrr = 0; 482 483 /* Build remapping register from TEX classes. */ 484 for (i = 0; i < 8; i++) { 485 type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & 486 TEXDEF_TYPE_MASK; 487 inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & 488 TEXDEF_INNER_MASK; 489 outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & 490 TEXDEF_OUTER_MASK; 491 nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & 492 TEXDEF_NOS_MASK; 493 494 prrr |= type << (i * 2); 495 prrr |= nos << (i + 24); 496 nmrr |= inner << (i * 2); 497 nmrr |= outer << (i * 2 + 16); 498 } 499 /* Add shareable bits for device memory. 
*/ 500 prrr |= PRRR_DS0 | PRRR_DS1; 501 502 /* Add shareable bits for normal memory in SMP case. */ 503 #ifdef SMP 504 ARM_SMP_UP( 505 prrr |= PRRR_NS1, 506 ); 507 #endif 508 cp15_prrr_set(prrr); 509 cp15_nmrr_set(nmrr); 510 511 /* Caches are disabled, so full TLB flush should be enough. */ 512 tlb_flush_all_local(); 513 } 514 515 /* 516 * Remap one vm_meattr class to another one. This can be useful as 517 * workaround for SOC errata, e.g. if devices must be accessed using 518 * SO memory class. 519 * 520 * !!! Please note that this function is absolutely last resort thing. 521 * It should not be used under normal circumstances. !!! 522 * 523 * Usage rules: 524 * - it shall be called after pmap_bootstrap_prepare() and before 525 * cpu_mp_start() (thus only on boot CPU). In practice, it's expected 526 * to be called from platform_attach() or platform_late_init(). 527 * 528 * - if remapping doesn't change caching mode, or until uncached class 529 * is remapped to any kind of cached one, then no other restriction exists. 530 * 531 * - if pmap_remap_vm_attr() changes caching mode, but both (original and 532 * remapped) remain cached, then caller is resposible for calling 533 * of dcache_wbinv_poc_all(). 534 * 535 * - remapping of any kind of cached class to uncached is not permitted. 536 */ 537 void 538 pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr) 539 { 540 int old_idx, new_idx; 541 542 /* Map VM memattrs to indexes to tex_class table. */ 543 old_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)old_attr]); 544 new_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)new_attr]); 545 546 /* Replace TEX attribute and apply it. */ 547 tex_class[old_idx] = tex_class[new_idx]; 548 pmap_set_tex(); 549 } 550 551 /* 552 * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words, 553 * KERNBASE is mapped by first L2 page table in L2 page table page. It 554 * meets same constrain due to PT2MAP being placed just under KERNBASE. 
*/
CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0);
/* The gap between user space and KERNBASE must be able to hold PT2MAP. */
CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE);

/*
 * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general.
 * For now, anyhow, the following check must be fulfilled.
 */
CTASSERT(PAGE_SIZE == PTE2_SIZE);
/*
 * We don't want to mess up MI code with all MMU and PMAP definitions,
 * so some things, which depend on other ones, are defined independently.
 * Now, it is time to check that we don't screw up something.
 */
CTASSERT(PDRSHIFT == PTE1_SHIFT);
/*
 * Check L1 and L2 page table entries definitions consistency.
 */
CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1));
CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2));
/*
 * Check L2 page tables page consistency.
 */
CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2));
CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG);
/*
 * Check PT2TAB consistency.
 * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG.
 * This should be done without remainder.
 */
CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG));

/*
 *	A PT2MAP magic.
 *
 * All level 2 page tables (PT2s) are mapped continuously and accordingly
 * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can
 * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page
 * must be used together, but not necessarily at once. The first PT2 in a page
 * must map things on correctly aligned address and the others must follow
 * in right order.
 */
/* Size of PT2TAB expressed in bytes, in PT2s and in pages. */
#define NB_IN_PT2TAB	(PT2TAB_ENTRIES * sizeof(pt2_entry_t))
#define NPT2_IN_PT2TAB	(NB_IN_PT2TAB / NB_IN_PT2)
#define NPG_IN_PT2TAB	(NB_IN_PT2TAB / PAGE_SIZE)

/*
 * Check PT2TAB consistency.
 * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2.
604 * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE. 605 * The both should be done without remainder. 606 */ 607 CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2)); 608 CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE)); 609 /* 610 * The implementation was made general, however, with the assumption 611 * bellow in mind. In case of another value of NPG_IN_PT2TAB, 612 * the code should be once more rechecked. 613 */ 614 CTASSERT(NPG_IN_PT2TAB == 1); 615 616 /* 617 * Get offset of PT2 in a page 618 * associated with given PT1 index. 619 */ 620 static __inline u_int 621 page_pt2off(u_int pt1_idx) 622 { 623 624 return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2); 625 } 626 627 /* 628 * Get physical address of PT2 629 * associated with given PT2s page and PT1 index. 630 */ 631 static __inline vm_paddr_t 632 page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx) 633 { 634 635 return (pgpa + page_pt2off(pt1_idx)); 636 } 637 638 /* 639 * Get first entry of PT2 640 * associated with given PT2s page and PT1 index. 641 */ 642 static __inline pt2_entry_t * 643 page_pt2(vm_offset_t pgva, u_int pt1_idx) 644 { 645 646 return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx))); 647 } 648 649 /* 650 * Get virtual address of PT2s page (mapped in PT2MAP) 651 * which holds PT2 which holds entry which maps given virtual address. 
652 */ 653 static __inline vm_offset_t 654 pt2map_pt2pg(vm_offset_t va) 655 { 656 657 va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); 658 return ((vm_offset_t)pt2map_entry(va)); 659 } 660 661 /***************************************************************************** 662 * 663 * THREE pmap initialization milestones exist: 664 * 665 * locore.S 666 * -> fundamental init (including MMU) in ASM 667 * 668 * initarm() 669 * -> fundamental init continues in C 670 * -> first available physical address is known 671 * 672 * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) 673 * -> basic (safe) interface for physical address allocation is made 674 * -> basic (safe) interface for virtual mapping is made 675 * -> limited not SMP coherent work is possible 676 * 677 * -> more fundamental init continues in C 678 * -> locks and some more things are available 679 * -> all fundamental allocations and mappings are done 680 * 681 * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) 682 * -> phys_avail[] and virtual_avail is set 683 * -> control is passed to vm subsystem 684 * -> physical and virtual address allocation are off limit 685 * -> low level mapping functions, some SMP coherent, 686 * are available, which cannot be used before vm subsystem 687 * is being inited 688 * 689 * mi_startup() 690 * -> vm subsystem is being inited 691 * 692 * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) 693 * -> pmap is fully inited 694 * 695 *****************************************************************************/ 696 697 /***************************************************************************** 698 * 699 * PMAP first stage initialization and utility functions 700 * for pre-bootstrap epoch. 
701 * 702 * After pmap_bootstrap_prepare() is called, the following functions 703 * can be used: 704 * 705 * (1) strictly only for this stage functions for physical page allocations, 706 * virtual space allocations, and mappings: 707 * 708 * vm_paddr_t pmap_preboot_get_pages(u_int num); 709 * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); 710 * vm_offset_t pmap_preboot_reserve_pages(u_int num); 711 * vm_offset_t pmap_preboot_get_vpages(u_int num); 712 * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 713 * vm_prot_t prot, vm_memattr_t attr); 714 * 715 * (2) for all stages: 716 * 717 * vm_paddr_t pmap_kextract(vm_offset_t va); 718 * 719 * NOTE: This is not SMP coherent stage. 720 * 721 *****************************************************************************/ 722 723 #define KERNEL_P2V(pa) \ 724 ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) 725 #define KERNEL_V2P(va) \ 726 ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) 727 728 static vm_paddr_t last_paddr; 729 730 /* 731 * Pre-bootstrap epoch page allocator. 732 */ 733 vm_paddr_t 734 pmap_preboot_get_pages(u_int num) 735 { 736 vm_paddr_t ret; 737 738 ret = last_paddr; 739 last_paddr += num * PAGE_SIZE; 740 741 return (ret); 742 } 743 744 /* 745 * The fundamental initialization of PMAP stuff. 746 * 747 * Some things already happened in locore.S and some things could happen 748 * before pmap_bootstrap_prepare() is called, so let's recall what is done: 749 * 1. Caches are disabled. 750 * 2. We are running on virtual addresses already with 'boot_pt1' 751 * as L1 page table. 752 * 3. So far, all virtual addresses can be converted to physical ones and 753 * vice versa by the following macros: 754 * KERNEL_P2V(pa) .... physical to virtual ones, 755 * KERNEL_V2P(va) .... virtual to physical ones. 756 * 757 * What is done herein: 758 * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. 759 * 2. PT2MAP magic is brought to live. 
*  3. Basic preboot functions for page allocations and mappings can be used.
 *  4. Everything is prepared for L1 cache enabling.
 *
 *  Variations:
 *  1. To use second TTB register, so kernel and users page tables will be
 *     separated. This way process forking - pmap_pinit() - could be faster,
 *     it saves physical pages and KVA per a process, and it's simple change.
 *     However, it will lead, due to hardware matter, to the following:
 *     (a) 2G space for kernel and 2G space for users.
 *     (b) 1G space for kernel in low addresses and 3G for users above it.
 *     A question is: Is the case (b) really an option? Note that case (b)
 *     does save neither physical memory and KVA.
 *
 *  Argument:
 *    last - physical address up to which memory is already in use; section
 *           mappings are made for [KERNEL_V2P(KERNBASE), last) and the
 *           preboot allocator starts at 'last' rounded up to PTE1_SIZE.
 */
void
pmap_bootstrap_prepare(vm_paddr_t last)
{
	vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size;
	vm_offset_t pt2pg_va;
	pt1_entry_t *pte1p;
	pt2_entry_t *pte2p;
	u_int i;
	uint32_t l1_attr;

	/*
	 * Now, we are going to make real kernel mapping. Note that we are
	 * already running on some mapping made in locore.S and we expect
	 * that it's large enough to ensure nofault access to physical memory
	 * allocated herein before switch.
	 *
	 * As kernel image and everything needed before are and will be mapped
	 * by section mappings, we align last physical address to PTE1_SIZE.
	 */
	last_paddr = pte1_roundup(last);

	/*
	 * Allocate and zero page(s) for kernel L1 page table.
	 *
	 * Note that it's first allocation on space which was PTE1_SIZE
	 * aligned and as such base_pt1 is aligned to NB_IN_PT1 too.
	 */
	base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1);
	kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1);
	bzero((void*)kern_pt1, NB_IN_PT1);
	pte1_sync_range(kern_pt1, NB_IN_PT1);

	/* Allocate and zero page(s) for kernel PT2TAB. */
	pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB);
	kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa);
	bzero(kern_pt2tab, NB_IN_PT2TAB);
	pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB);

	/* Allocate and zero page(s) for kernel L2 page tables. */
	pt2pg_pa = pmap_preboot_get_pages(NKPT2PG);
	pt2pg_va = KERNEL_P2V(pt2pg_pa);
	size = NKPT2PG * PAGE_SIZE;
	bzero((void*)pt2pg_va, size);
	pte2_sync_range((pt2_entry_t *)pt2pg_va, size);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated pages for kernel L2 page tables so that vm_page
	 * structures representing these pages will be created. The vm_page
	 * structures are required for promotion of the corresponding kernel
	 * virtual addresses to section mappings.
	 *
	 * pmap_preboot_get_pages(0) allocates nothing; it only returns the
	 * current allocation cursor, which is the end of the segment. The
	 * segment starts at pt2tab_pa, so it covers the PT2TAB page(s) too.
	 */
	vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0));

	/*
	 * Insert allocated L2 page table pages to PT2TAB and make
	 * link to all PT2s in L1 page table. See how kernel_vm_end
	 * is initialized.
	 *
	 * We play simple and safe. So every KVA will have underlying
	 * L2 page table, even kernel image mapped by sections.
	 */
	/* One PT2TAB entry per page of PT2s ... */
	pte2p = kern_pt2tab_entry(KERNBASE);
	for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE)
		pt2tab_store(pte2p++, PTE2_KPT(pa));

	/* ... and one L1 link per PT2 (NB_IN_PT2 bytes each). */
	pte1p = kern_pte1(KERNBASE);
	for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2)
		pte1_store(pte1p++, PTE1_LINK(pa));

	/*
	 * Make section mappings for kernel. These overwrite the L1 links
	 * stored just above for the range that is in use up to 'last'.
	 */
	l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT);
	pte1p = kern_pte1(KERNBASE);
	for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE)
		pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr));

	/*
	 * Get free and aligned space for PT2MAP and make L1 page table links
	 * to L2 page tables held in PT2TAB.
	 *
	 * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t
	 * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus
	 * each entry in PT2TAB maps all PT2s in a page. This implies that
	 * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE.
	 */
	PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE);
	pte1p = kern_pte1((vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) {
		pte1_store(pte1p++, PTE1_LINK(pa));
	}

	/*
	 * Store PT2TAB in PT2TAB itself, i.e. self reference mapping.
	 * Each pmap will hold own PT2TAB, so the mapping should be not global.
	 */
	pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) {
		pt2tab_store(pte2p++, PTE2_KPT_NG(pa));
	}

	/*
	 * Choose correct L2 page table and make mappings for allocations
	 * made herein which replaces temporary locore.S mappings after a while.
	 * Note that PT2MAP cannot be used until we switch to kern_pt1.
	 *
	 * Note, that these allocations started aligned on 1M section and
	 * kernel PT1 was allocated first. Making of mappings must follow
	 * order of physical allocations as we've used KERNEL_P2V() macro
	 * for virtual addresses resolution.
	 */
	pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1);
	pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p)));

	pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1));

	/* Make mapping for kernel L1 page table. */
	for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE)
		pte2_store(pte2p++, PTE2_KPT(pa));

	/* Make mapping for kernel PT2TAB. */
	for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE)
		pte2_store(pte2p++, PTE2_KPT(pa));

	/* Finally, switch from 'boot_pt1' to 'kern_pt1'. */
	pmap_kern_ttb = base_pt1 | ttb_flags;
	cpuinfo_reinit_mmu(pmap_kern_ttb);
	/*
	 * Initialize the first available KVA. As kernel image is mapped by
	 * sections, we are leaving some gap behind.
	 */
	virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE;
}

/*
 *  Setup L2 page table page for given KVA.
 *  Used in pre-bootstrap epoch.
 *
 *  Note that we have allocated NKPT2PG pages for L2 page tables in advance
 *  and used them for mapping KVA starting from KERNBASE. However, this is not
 *  enough. Vectors and devices need L2 page tables too. Note that they are
 *  even above VM_MAX_KERNEL_ADDRESS.
 *
 *  Returns the physical address of the page of PT2s covering 'va', which
 *  is either the one already recorded in PT2TAB or a newly allocated and
 *  zeroed one.
 */
static __inline vm_paddr_t
pmap_preboot_pt2pg_setup(vm_offset_t va)
{
	pt2_entry_t *pte2p, pte2;
	vm_paddr_t pt2pg_pa;

	/* Get associated entry in PT2TAB. */
	pte2p = kern_pt2tab_entry(va);

	/* Just return, if PT2s page exists already. */
	pte2 = pt2tab_load(pte2p);
	if (pte2_is_valid(pte2))
		return (pte2_pa(pte2));

	/* Below VM_MAX_KERNEL_ADDRESS the preallocated pages must suffice. */
	KASSERT(va >= VM_MAX_KERNEL_ADDRESS,
	    ("%s: NKPT2PG too small", __func__));

	/*
	 * Allocate page for PT2s and insert it to PT2TAB.
	 * In other words, map it into PT2MAP space.
	 */
	pt2pg_pa = pmap_preboot_get_pages(1);
	pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa));

	/*
	 * Zero all PT2s in allocated page. The page is reached through
	 * PT2MAP, where the store above has just mapped it.
	 */
	bzero((void*)pt2map_pt2pg(va), PAGE_SIZE);
	pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE);

	return (pt2pg_pa);
}

/*
 *  Setup L2 page table for given KVA.
 *  Used in pre-bootstrap epoch.
 *
 *  Makes sure the page of PT2s exists and links the PT2 that covers 'va'
 *  into the kernel L1 page table.
 */
static void
pmap_preboot_pt2_setup(vm_offset_t va)
{
	pt1_entry_t *pte1p;
	vm_paddr_t pt2pg_pa, pt2_pa;

	/* Setup PT2's page. */
	pt2pg_pa = pmap_preboot_pt2pg_setup(va);
	pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va));

	/* Insert PT2 to PT1. */
	pte1p = kern_pte1(va);
	pte1_store(pte1p, PTE1_LINK(pt2_pa));
}

/*
 *  Get L2 page entry associated with given KVA.
 *  Used in pre-bootstrap epoch.
968 */ 969 static __inline pt2_entry_t* 970 pmap_preboot_vtopte2(vm_offset_t va) 971 { 972 pt1_entry_t *pte1p; 973 974 /* Setup PT2 if needed. */ 975 pte1p = kern_pte1(va); 976 if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ 977 pmap_preboot_pt2_setup(va); 978 979 return (pt2map_entry(va)); 980 } 981 982 /* 983 * Pre-bootstrap epoch page(s) mapping(s). 984 */ 985 void 986 pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) 987 { 988 u_int i; 989 pt2_entry_t *pte2p; 990 991 /* Map all the pages. */ 992 for (i = 0; i < num; i++) { 993 pte2p = pmap_preboot_vtopte2(va); 994 pte2_store(pte2p, PTE2_KRW(pa)); 995 va += PAGE_SIZE; 996 pa += PAGE_SIZE; 997 } 998 } 999 1000 /* 1001 * Pre-bootstrap epoch virtual space alocator. 1002 */ 1003 vm_offset_t 1004 pmap_preboot_reserve_pages(u_int num) 1005 { 1006 u_int i; 1007 vm_offset_t start, va; 1008 pt2_entry_t *pte2p; 1009 1010 /* Allocate virtual space. */ 1011 start = va = virtual_avail; 1012 virtual_avail += num * PAGE_SIZE; 1013 1014 /* Zero the mapping. */ 1015 for (i = 0; i < num; i++) { 1016 pte2p = pmap_preboot_vtopte2(va); 1017 pte2_store(pte2p, 0); 1018 va += PAGE_SIZE; 1019 } 1020 1021 return (start); 1022 } 1023 1024 /* 1025 * Pre-bootstrap epoch page(s) allocation and mapping(s). 1026 */ 1027 vm_offset_t 1028 pmap_preboot_get_vpages(u_int num) 1029 { 1030 vm_paddr_t pa; 1031 vm_offset_t va; 1032 1033 /* Allocate physical page(s). */ 1034 pa = pmap_preboot_get_pages(num); 1035 1036 /* Allocate virtual space. */ 1037 va = virtual_avail; 1038 virtual_avail += num * PAGE_SIZE; 1039 1040 /* Map and zero all. */ 1041 pmap_preboot_map_pages(pa, va, num); 1042 bzero((void *)va, num * PAGE_SIZE); 1043 1044 return (va); 1045 } 1046 1047 /* 1048 * Pre-bootstrap epoch page mapping(s) with attributes. 
1049 */ 1050 void 1051 pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 1052 vm_prot_t prot, vm_memattr_t attr) 1053 { 1054 u_int num; 1055 u_int l1_attr, l1_prot, l2_prot, l2_attr; 1056 pt1_entry_t *pte1p; 1057 pt2_entry_t *pte2p; 1058 1059 l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; 1060 l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1061 l2_attr = vm_memattr_to_pte2(attr); 1062 l1_prot = ATTR_TO_L1(l2_prot); 1063 l1_attr = ATTR_TO_L1(l2_attr); 1064 1065 /* Map all the pages. */ 1066 num = round_page(size); 1067 while (num > 0) { 1068 if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { 1069 pte1p = kern_pte1(va); 1070 pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); 1071 va += PTE1_SIZE; 1072 pa += PTE1_SIZE; 1073 num -= PTE1_SIZE; 1074 } else { 1075 pte2p = pmap_preboot_vtopte2(va); 1076 pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); 1077 va += PAGE_SIZE; 1078 pa += PAGE_SIZE; 1079 num -= PAGE_SIZE; 1080 } 1081 } 1082 } 1083 1084 /* 1085 * Extract from the kernel page table the physical address 1086 * that is mapped by the given virtual address "va". 1087 */ 1088 vm_paddr_t 1089 pmap_kextract(vm_offset_t va) 1090 { 1091 vm_paddr_t pa; 1092 pt1_entry_t pte1; 1093 pt2_entry_t pte2; 1094 1095 pte1 = pte1_load(kern_pte1(va)); 1096 if (pte1_is_section(pte1)) { 1097 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1098 } else if (pte1_is_link(pte1)) { 1099 /* 1100 * We should beware of concurrent promotion that changes 1101 * pte1 at this point. However, it's not a problem as PT2 1102 * page is preserved by promotion in PT2TAB. So even if 1103 * it happens, using of PT2MAP is still safe. 1104 * 1105 * QQQ: However, concurrent removing is a problem which 1106 * ends in abort on PT2MAP space. Locking must be used 1107 * to deal with this. 
1108 */ 1109 pte2 = pte2_load(pt2map_entry(va)); 1110 pa = pte2_pa(pte2) | (va & PTE2_OFFSET); 1111 } 1112 else { 1113 panic("%s: va %#x pte1 %#x", __func__, va, pte1); 1114 } 1115 return (pa); 1116 } 1117 1118 /* 1119 * Extract from the kernel page table the physical address 1120 * that is mapped by the given virtual address "va". Also 1121 * return L2 page table entry which maps the address. 1122 * 1123 * This is only intended to be used for panic dumps. 1124 */ 1125 vm_paddr_t 1126 pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) 1127 { 1128 vm_paddr_t pa; 1129 pt1_entry_t pte1; 1130 pt2_entry_t pte2; 1131 1132 pte1 = pte1_load(kern_pte1(va)); 1133 if (pte1_is_section(pte1)) { 1134 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1135 pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; 1136 } else if (pte1_is_link(pte1)) { 1137 pte2 = pte2_load(pt2map_entry(va)); 1138 pa = pte2_pa(pte2); 1139 } else { 1140 pte2 = 0; 1141 pa = 0; 1142 } 1143 if (pte2p != NULL) 1144 *pte2p = pte2; 1145 return (pa); 1146 } 1147 1148 /***************************************************************************** 1149 * 1150 * PMAP second stage initialization and utility functions 1151 * for bootstrap epoch. 1152 * 1153 * After pmap_bootstrap() is called, the following functions for 1154 * mappings can be used: 1155 * 1156 * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); 1157 * void pmap_kremove(vm_offset_t va); 1158 * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, 1159 * int prot); 1160 * 1161 * NOTE: This is not SMP coherent stage. And physical page allocation is not 1162 * allowed during this stage. 1163 * 1164 *****************************************************************************/ 1165 1166 /* 1167 * Initialize kernel PMAP locks and lists, kernel_pmap itself, and 1168 * reserve various virtual spaces for temporary mappings. 
1169 */ 1170 void 1171 pmap_bootstrap(vm_offset_t firstaddr) 1172 { 1173 pt2_entry_t *unused __unused; 1174 struct pcpu *pc; 1175 1176 /* 1177 * Initialize the kernel pmap (which is statically allocated). 1178 */ 1179 PMAP_LOCK_INIT(kernel_pmap); 1180 kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ 1181 kernel_pmap->pm_pt1 = kern_pt1; 1182 kernel_pmap->pm_pt2tab = kern_pt2tab; 1183 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1184 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1185 1186 /* 1187 * Initialize the global pv list lock. 1188 */ 1189 rw_init(&pvh_global_lock, "pmap pv global"); 1190 1191 LIST_INIT(&allpmaps); 1192 1193 /* 1194 * Request a spin mutex so that changes to allpmaps cannot be 1195 * preempted by smp_rendezvous_cpus(). 1196 */ 1197 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 1198 mtx_lock_spin(&allpmaps_lock); 1199 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 1200 mtx_unlock_spin(&allpmaps_lock); 1201 1202 /* 1203 * Reserve some special page table entries/VA space for temporary 1204 * mapping of pages. 1205 */ 1206 #define SYSMAP(c, p, v, n) do { \ 1207 v = (c)pmap_preboot_reserve_pages(n); \ 1208 p = pt2map_entry((vm_offset_t)v); \ 1209 } while (0) 1210 1211 /* 1212 * Local CMAP1/CMAP2 are used for zeroing and copying pages. 1213 * Local CMAP2 is also used for data cache cleaning. 1214 */ 1215 pc = get_pcpu(); 1216 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1217 SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); 1218 SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); 1219 SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); 1220 1221 /* 1222 * Crashdump maps. 1223 */ 1224 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); 1225 1226 /* 1227 * _tmppt is used for reading arbitrary physical pages via /dev/mem. 1228 */ 1229 SYSMAP(caddr_t, unused, _tmppt, 1); 1230 1231 /* 1232 * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), 1233 * respectively. 
PADDR3 is used by pmap_pte2_ddb(). 1234 */ 1235 SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); 1236 SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); 1237 #ifdef DDB 1238 SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); 1239 #endif 1240 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 1241 1242 /* 1243 * Note that in very short time in initarm(), we are going to 1244 * initialize phys_avail[] array and no further page allocation 1245 * can happen after that until vm subsystem will be initialized. 1246 */ 1247 kernel_vm_end_new = kernel_vm_end; 1248 virtual_end = vm_max_kernel_address; 1249 } 1250 1251 static void 1252 pmap_init_reserved_pages(void) 1253 { 1254 struct pcpu *pc; 1255 vm_offset_t pages; 1256 int i; 1257 1258 CPU_FOREACH(i) { 1259 pc = pcpu_find(i); 1260 /* 1261 * Skip if the mapping has already been initialized, 1262 * i.e. this is the BSP. 1263 */ 1264 if (pc->pc_cmap1_addr != 0) 1265 continue; 1266 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1267 pages = kva_alloc(PAGE_SIZE * 3); 1268 if (pages == 0) 1269 panic("%s: unable to allocate KVA", __func__); 1270 pc->pc_cmap1_pte2p = pt2map_entry(pages); 1271 pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE); 1272 pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2)); 1273 pc->pc_cmap1_addr = (caddr_t)pages; 1274 pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE); 1275 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); 1276 } 1277 } 1278 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 1279 1280 /* 1281 * The function can already be use in second initialization stage. 1282 * As such, the function DOES NOT call pmap_growkernel() where PT2 1283 * allocation can happen. So if used, be sure that PT2 for given 1284 * virtual address is allocated already! 1285 * 1286 * Add a wired page to the kva. 1287 * Note: not SMP coherent. 
1288 */ 1289 static __inline void 1290 pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, 1291 uint32_t attr) 1292 { 1293 pt1_entry_t *pte1p; 1294 pt2_entry_t *pte2p; 1295 1296 pte1p = kern_pte1(va); 1297 if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ 1298 /* 1299 * This is a very low level function, so PT2 and particularly 1300 * PT2PG associated with given virtual address must be already 1301 * allocated. It's a pain mainly during pmap initialization 1302 * stage. However, called after pmap initialization with 1303 * virtual address not under kernel_vm_end will lead to 1304 * the same misery. 1305 */ 1306 if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) 1307 panic("%s: kernel PT2 not allocated!", __func__); 1308 } 1309 1310 pte2p = pt2map_entry(va); 1311 pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); 1312 } 1313 1314 PMAP_INLINE void 1315 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1316 { 1317 1318 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); 1319 } 1320 1321 /* 1322 * Remove a page from the kernel pagetables. 1323 * Note: not SMP coherent. 1324 */ 1325 PMAP_INLINE void 1326 pmap_kremove(vm_offset_t va) 1327 { 1328 pt1_entry_t *pte1p; 1329 pt2_entry_t *pte2p; 1330 1331 pte1p = kern_pte1(va); 1332 if (pte1_is_section(pte1_load(pte1p))) { 1333 pte1_clear(pte1p); 1334 } else { 1335 pte2p = pt2map_entry(va); 1336 pte2_clear(pte2p); 1337 } 1338 } 1339 1340 /* 1341 * Share new kernel PT2PG with all pmaps. 1342 * The caller is responsible for maintaining TLB consistency. 1343 */ 1344 static void 1345 pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) 1346 { 1347 pmap_t pmap; 1348 pt2_entry_t *pte2p; 1349 1350 mtx_lock_spin(&allpmaps_lock); 1351 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1352 pte2p = pmap_pt2tab_entry(pmap, va); 1353 pt2tab_store(pte2p, npte2); 1354 } 1355 mtx_unlock_spin(&allpmaps_lock); 1356 } 1357 1358 /* 1359 * Share new kernel PTE1 with all pmaps. 
1360 * The caller is responsible for maintaining TLB consistency. 1361 */ 1362 static void 1363 pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) 1364 { 1365 pmap_t pmap; 1366 pt1_entry_t *pte1p; 1367 1368 mtx_lock_spin(&allpmaps_lock); 1369 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1370 pte1p = pmap_pte1(pmap, va); 1371 pte1_store(pte1p, npte1); 1372 } 1373 mtx_unlock_spin(&allpmaps_lock); 1374 } 1375 1376 /* 1377 * Used to map a range of physical addresses into kernel 1378 * virtual address space. 1379 * 1380 * The value passed in '*virt' is a suggested virtual address for 1381 * the mapping. Architectures which can support a direct-mapped 1382 * physical to virtual region can return the appropriate address 1383 * within that region, leaving '*virt' unchanged. Other 1384 * architectures should map the pages starting at '*virt' and 1385 * update '*virt' with the first usable address after the mapped 1386 * region. 1387 * 1388 * NOTE: Read the comments above pmap_kenter_prot_attr() as 1389 * the function is used herein! 1390 */ 1391 vm_offset_t 1392 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1393 { 1394 vm_offset_t va, sva; 1395 vm_paddr_t pte1_offset; 1396 pt1_entry_t npte1; 1397 uint32_t l1prot, l2prot; 1398 uint32_t l1attr, l2attr; 1399 1400 PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," 1401 " prot = %d\n", __func__, *virt, start, end, end - start, prot)); 1402 1403 l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR; 1404 l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1405 l1prot = ATTR_TO_L1(l2prot); 1406 1407 l2attr = PTE2_ATTR_DEFAULT; 1408 l1attr = ATTR_TO_L1(l2attr); 1409 1410 va = *virt; 1411 /* 1412 * Does the physical address range's size and alignment permit at 1413 * least one section mapping to be created? 
1414 */ 1415 pte1_offset = start & PTE1_OFFSET; 1416 if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= 1417 PTE1_SIZE) { 1418 /* 1419 * Increase the starting virtual address so that its alignment 1420 * does not preclude the use of section mappings. 1421 */ 1422 if ((va & PTE1_OFFSET) < pte1_offset) 1423 va = pte1_trunc(va) + pte1_offset; 1424 else if ((va & PTE1_OFFSET) > pte1_offset) 1425 va = pte1_roundup(va) + pte1_offset; 1426 } 1427 sva = va; 1428 while (start < end) { 1429 if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { 1430 KASSERT((va & PTE1_OFFSET) == 0, 1431 ("%s: misaligned va %#x", __func__, va)); 1432 npte1 = PTE1_KERN(start, l1prot, l1attr); 1433 pmap_kenter_pte1(va, npte1); 1434 va += PTE1_SIZE; 1435 start += PTE1_SIZE; 1436 } else { 1437 pmap_kenter_prot_attr(va, start, l2prot, l2attr); 1438 va += PAGE_SIZE; 1439 start += PAGE_SIZE; 1440 } 1441 } 1442 tlb_flush_range(sva, va - sva); 1443 *virt = va; 1444 return (sva); 1445 } 1446 1447 /* 1448 * Make a temporary mapping for a physical address. 1449 * This is only intended to be used for panic dumps. 1450 */ 1451 void * 1452 pmap_kenter_temporary(vm_paddr_t pa, int i) 1453 { 1454 vm_offset_t va; 1455 1456 /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ 1457 1458 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 1459 pmap_kenter(va, pa); 1460 tlb_flush_local(va); 1461 return ((void *)crashdumpmap); 1462 } 1463 1464 /************************************* 1465 * 1466 * TLB & cache maintenance routines. 1467 * 1468 *************************************/ 1469 1470 /* 1471 * We inline these within pmap.c for speed. 
1472 */ 1473 PMAP_INLINE void 1474 pmap_tlb_flush(pmap_t pmap, vm_offset_t va) 1475 { 1476 1477 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1478 tlb_flush(va); 1479 } 1480 1481 PMAP_INLINE void 1482 pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) 1483 { 1484 1485 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1486 tlb_flush_range(sva, size); 1487 } 1488 1489 /* 1490 * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. 1491 * Requirements: 1492 * - Must deal with pages in order to ensure that none of the PTE2_* bits 1493 * are ever set, PTE2_V in particular. 1494 * - Assumes we can write to pte2s without pte2_store() atomic ops. 1495 * - Assumes nothing will ever test these addresses for 0 to indicate 1496 * no mapping instead of correctly checking PTE2_V. 1497 * - Assumes a vm_offset_t will fit in a pte2 (true for arm). 1498 * Because PTE2_V is never set, there can be no mappings to invalidate. 1499 */ 1500 static vm_offset_t 1501 pmap_pte2list_alloc(vm_offset_t *head) 1502 { 1503 pt2_entry_t *pte2p; 1504 vm_offset_t va; 1505 1506 va = *head; 1507 if (va == 0) 1508 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 1509 pte2p = pt2map_entry(va); 1510 *head = *pte2p; 1511 if (*head & PTE2_V) 1512 panic("%s: va with PTE2_V set!", __func__); 1513 *pte2p = 0; 1514 return (va); 1515 } 1516 1517 static void 1518 pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) 1519 { 1520 pt2_entry_t *pte2p; 1521 1522 if (va & PTE2_V) 1523 panic("%s: freeing va with PTE2_V set!", __func__); 1524 pte2p = pt2map_entry(va); 1525 *pte2p = *head; /* virtual! 
PTE2_V is 0 though */ 1526 *head = va; 1527 } 1528 1529 static void 1530 pmap_pte2list_init(vm_offset_t *head, void *base, int npages) 1531 { 1532 int i; 1533 vm_offset_t va; 1534 1535 *head = 0; 1536 for (i = npages - 1; i >= 0; i--) { 1537 va = (vm_offset_t)base + i * PAGE_SIZE; 1538 pmap_pte2list_free(head, va); 1539 } 1540 } 1541 1542 /***************************************************************************** 1543 * 1544 * PMAP third and final stage initialization. 1545 * 1546 * After pmap_init() is called, PMAP subsystem is fully initialized. 1547 * 1548 *****************************************************************************/ 1549 1550 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1551 "VM/pmap parameters"); 1552 1553 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 1554 "Max number of PV entries"); 1555 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 1556 "Page share factor per proc"); 1557 1558 static u_long nkpt2pg = NKPT2PG; 1559 SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, 1560 &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); 1561 1562 static int sp_enabled = 1; 1563 SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 1564 &sp_enabled, 0, "Are large page mappings enabled?"); 1565 1566 bool 1567 pmap_ps_enabled(pmap_t pmap __unused) 1568 { 1569 1570 return (sp_enabled != 0); 1571 } 1572 1573 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1574 "1MB page mapping counters"); 1575 1576 static u_long pmap_pte1_demotions; 1577 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, 1578 &pmap_pte1_demotions, 0, "1MB page demotions"); 1579 1580 static u_long pmap_pte1_mappings; 1581 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, 1582 &pmap_pte1_mappings, 0, "1MB page mappings"); 1583 1584 static u_long pmap_pte1_p_failures; 1585 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, 1586 
&pmap_pte1_p_failures, 0, "1MB page promotion failures"); 1587 1588 static u_long pmap_pte1_promotions; 1589 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, 1590 &pmap_pte1_promotions, 0, "1MB page promotions"); 1591 1592 static u_long pmap_pte1_kern_demotions; 1593 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, 1594 &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); 1595 1596 static u_long pmap_pte1_kern_promotions; 1597 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, 1598 &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); 1599 1600 static __inline ttb_entry_t 1601 pmap_ttb_get(pmap_t pmap) 1602 { 1603 1604 return (vtophys(pmap->pm_pt1) | ttb_flags); 1605 } 1606 1607 /* 1608 * Initialize a vm_page's machine-dependent fields. 1609 * 1610 * Variations: 1611 * 1. Pages for L2 page tables are always not managed. So, pv_list and 1612 * pt2_wirecount can share same physical space. However, proper 1613 * initialization on a page alloc for page tables and reinitialization 1614 * on the page free must be ensured. 1615 */ 1616 void 1617 pmap_page_init(vm_page_t m) 1618 { 1619 1620 TAILQ_INIT(&m->md.pv_list); 1621 pt2_wirecount_init(m); 1622 m->md.pat_mode = VM_MEMATTR_DEFAULT; 1623 } 1624 1625 /* 1626 * Virtualization for faster way how to zero whole page. 1627 */ 1628 static __inline void 1629 pagezero(void *page) 1630 { 1631 1632 bzero(page, PAGE_SIZE); 1633 } 1634 1635 /* 1636 * Zero L2 page table page. 1637 * Use same KVA as in pmap_zero_page(). 1638 */ 1639 static __inline vm_paddr_t 1640 pmap_pt2pg_zero(vm_page_t m) 1641 { 1642 pt2_entry_t *cmap2_pte2p; 1643 vm_paddr_t pa; 1644 struct pcpu *pc; 1645 1646 pa = VM_PAGE_TO_PHYS(m); 1647 1648 /* 1649 * XXX: For now, we map whole page even if it's already zero, 1650 * to sync it even if the sync is only DSB. 
1651 */ 1652 sched_pin(); 1653 pc = get_pcpu(); 1654 cmap2_pte2p = pc->pc_cmap2_pte2p; 1655 mtx_lock(&pc->pc_cmap_lock); 1656 if (pte2_load(cmap2_pte2p) != 0) 1657 panic("%s: CMAP2 busy", __func__); 1658 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 1659 vm_page_pte2_attr(m))); 1660 /* Even VM_ALLOC_ZERO request is only advisory. */ 1661 if ((m->flags & PG_ZERO) == 0) 1662 pagezero(pc->pc_cmap2_addr); 1663 pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); 1664 pte2_clear(cmap2_pte2p); 1665 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 1666 1667 /* 1668 * Unpin the thread before releasing the lock. Otherwise the thread 1669 * could be rescheduled while still bound to the current CPU, only 1670 * to unpin itself immediately upon resuming execution. 1671 */ 1672 sched_unpin(); 1673 mtx_unlock(&pc->pc_cmap_lock); 1674 1675 return (pa); 1676 } 1677 1678 /* 1679 * Init just allocated page as L2 page table(s) holder 1680 * and return its physical address. 1681 */ 1682 static __inline vm_paddr_t 1683 pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) 1684 { 1685 vm_paddr_t pa; 1686 pt2_entry_t *pte2p; 1687 1688 /* Check page attributes. */ 1689 if (m->md.pat_mode != pt_memattr) 1690 pmap_page_set_memattr(m, pt_memattr); 1691 1692 /* Zero page and init wire counts. */ 1693 pa = pmap_pt2pg_zero(m); 1694 pt2_wirecount_init(m); 1695 1696 /* 1697 * Map page to PT2MAP address space for given pmap. 1698 * Note that PT2MAP space is shared with all pmaps. 1699 */ 1700 if (pmap == kernel_pmap) 1701 pmap_kenter_pt2tab(va, PTE2_KPT(pa)); 1702 else { 1703 pte2p = pmap_pt2tab_entry(pmap, va); 1704 pt2tab_store(pte2p, PTE2_KPT_NG(pa)); 1705 } 1706 1707 return (pa); 1708 } 1709 1710 /* 1711 * Initialize the pmap module. 1712 * Called by vm_init, to initialize any structures that the pmap 1713 * system needs to map virtual memory. 
1714 */ 1715 void 1716 pmap_init(void) 1717 { 1718 vm_size_t s; 1719 pt2_entry_t *pte2p, pte2; 1720 u_int i, pte1_idx, pv_npg; 1721 1722 PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); 1723 1724 /* 1725 * Initialize the vm page array entries for kernel pmap's 1726 * L2 page table pages allocated in advance. 1727 */ 1728 pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); 1729 pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); 1730 for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { 1731 vm_paddr_t pa; 1732 vm_page_t m; 1733 1734 pte2 = pte2_load(pte2p); 1735 KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); 1736 1737 pa = pte2_pa(pte2); 1738 m = PHYS_TO_VM_PAGE(pa); 1739 KASSERT(m >= vm_page_array && 1740 m < &vm_page_array[vm_page_array_size], 1741 ("%s: L2 page table page is out of range", __func__)); 1742 1743 m->pindex = pte1_idx; 1744 m->phys_addr = pa; 1745 pte1_idx += NPT2_IN_PG; 1746 } 1747 1748 /* 1749 * Initialize the address space (zone) for the pv entries. Set a 1750 * high water mark so that the system can recover from excessive 1751 * numbers of pv entries. 1752 */ 1753 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1754 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 1755 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1756 pv_entry_max = roundup(pv_entry_max, _NPCPV); 1757 pv_entry_high_water = 9 * (pv_entry_max / 10); 1758 1759 /* 1760 * Are large page mappings enabled? 1761 */ 1762 TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); 1763 if (sp_enabled) { 1764 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1765 ("%s: can't assign to pagesizes[1]", __func__)); 1766 pagesizes[1] = PTE1_SIZE; 1767 } 1768 1769 /* 1770 * Calculate the size of the pv head table for sections. 1771 * Handle the possibility that "vm_phys_segs[...].end" is zero. 1772 * Note that the table is only for sections which could be promoted. 
1773 */ 1774 first_managed_pa = pte1_trunc(vm_phys_segs[0].start); 1775 pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) 1776 - first_managed_pa) / PTE1_SIZE + 1; 1777 1778 /* 1779 * Allocate memory for the pv head table for sections. 1780 */ 1781 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1782 s = round_page(s); 1783 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 1784 for (i = 0; i < pv_npg; i++) 1785 TAILQ_INIT(&pv_table[i].pv_list); 1786 1787 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 1788 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 1789 if (pv_chunkbase == NULL) 1790 panic("%s: not enough kvm for pv chunks", __func__); 1791 pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 1792 } 1793 1794 /* 1795 * Add a list of wired pages to the kva 1796 * this routine is only used for temporary 1797 * kernel mappings that do not need to have 1798 * page modification or references recorded. 1799 * Note that old mappings are simply written 1800 * over. The page *must* be wired. 1801 * Note: SMP coherent. Uses a ranged shootdown IPI. 1802 */ 1803 void 1804 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1805 { 1806 u_int anychanged; 1807 pt2_entry_t *epte2p, *pte2p, pte2; 1808 vm_page_t m; 1809 vm_paddr_t pa; 1810 1811 anychanged = 0; 1812 pte2p = pt2map_entry(sva); 1813 epte2p = pte2p + count; 1814 while (pte2p < epte2p) { 1815 m = *ma++; 1816 pa = VM_PAGE_TO_PHYS(m); 1817 pte2 = pte2_load(pte2p); 1818 if ((pte2_pa(pte2) != pa) || 1819 (pte2_attr(pte2) != vm_page_pte2_attr(m))) { 1820 anychanged++; 1821 pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, 1822 vm_page_pte2_attr(m))); 1823 } 1824 pte2p++; 1825 } 1826 if (__predict_false(anychanged)) 1827 tlb_flush_range(sva, count * PAGE_SIZE); 1828 } 1829 1830 /* 1831 * This routine tears out page mappings from the 1832 * kernel -- it is meant only for temporary mappings. 1833 * Note: SMP coherent. Uses a ranged shootdown IPI. 
1834 */ 1835 void 1836 pmap_qremove(vm_offset_t sva, int count) 1837 { 1838 vm_offset_t va; 1839 1840 va = sva; 1841 while (count-- > 0) { 1842 pmap_kremove(va); 1843 va += PAGE_SIZE; 1844 } 1845 tlb_flush_range(sva, va - sva); 1846 } 1847 1848 /* 1849 * Are we current address space or kernel? 1850 */ 1851 static __inline int 1852 pmap_is_current(pmap_t pmap) 1853 { 1854 1855 return (pmap == kernel_pmap || 1856 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); 1857 } 1858 1859 /* 1860 * If the given pmap is not the current or kernel pmap, the returned 1861 * pte2 must be released by passing it to pmap_pte2_release(). 1862 */ 1863 static pt2_entry_t * 1864 pmap_pte2(pmap_t pmap, vm_offset_t va) 1865 { 1866 pt1_entry_t pte1; 1867 vm_paddr_t pt2pg_pa; 1868 1869 pte1 = pte1_load(pmap_pte1(pmap, va)); 1870 if (pte1_is_section(pte1)) 1871 panic("%s: attempt to map PTE1", __func__); 1872 if (pte1_is_link(pte1)) { 1873 /* Are we current address space or kernel? */ 1874 if (pmap_is_current(pmap)) 1875 return (pt2map_entry(va)); 1876 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1877 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1878 mtx_lock(&PMAP2mutex); 1879 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 1880 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 1881 tlb_flush((vm_offset_t)PADDR2); 1882 } 1883 return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1884 } 1885 return (NULL); 1886 } 1887 1888 /* 1889 * Releases a pte2 that was obtained from pmap_pte2(). 1890 * Be prepared for the pte2p being NULL. 1891 */ 1892 static __inline void 1893 pmap_pte2_release(pt2_entry_t *pte2p) 1894 { 1895 1896 if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { 1897 mtx_unlock(&PMAP2mutex); 1898 } 1899 } 1900 1901 /* 1902 * Super fast pmap_pte2 routine best used when scanning 1903 * the pv lists. This eliminates many coarse-grained 1904 * invltlb calls. Note that many of the pv list 1905 * scans are across different pmaps. 
It is very wasteful 1906 * to do an entire tlb flush for checking a single mapping. 1907 * 1908 * If the given pmap is not the current pmap, pvh_global_lock 1909 * must be held and curthread pinned to a CPU. 1910 */ 1911 static pt2_entry_t * 1912 pmap_pte2_quick(pmap_t pmap, vm_offset_t va) 1913 { 1914 pt1_entry_t pte1; 1915 vm_paddr_t pt2pg_pa; 1916 1917 pte1 = pte1_load(pmap_pte1(pmap, va)); 1918 if (pte1_is_section(pte1)) 1919 panic("%s: attempt to map PTE1", __func__); 1920 if (pte1_is_link(pte1)) { 1921 /* Are we current address space or kernel? */ 1922 if (pmap_is_current(pmap)) 1923 return (pt2map_entry(va)); 1924 rw_assert(&pvh_global_lock, RA_WLOCKED); 1925 KASSERT(curthread->td_pinned > 0, 1926 ("%s: curthread not pinned", __func__)); 1927 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1928 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1929 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 1930 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 1931 #ifdef SMP 1932 PMAP1cpu = PCPU_GET(cpuid); 1933 #endif 1934 tlb_flush_local((vm_offset_t)PADDR1); 1935 PMAP1changed++; 1936 } else 1937 #ifdef SMP 1938 if (PMAP1cpu != PCPU_GET(cpuid)) { 1939 PMAP1cpu = PCPU_GET(cpuid); 1940 tlb_flush_local((vm_offset_t)PADDR1); 1941 PMAP1changedcpu++; 1942 } else 1943 #endif 1944 PMAP1unchanged++; 1945 return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1946 } 1947 return (NULL); 1948 } 1949 1950 /* 1951 * Routine: pmap_extract 1952 * Function: 1953 * Extract the physical page address associated 1954 * with the given map/virtual_address pair. 
1955 */ 1956 vm_paddr_t 1957 pmap_extract(pmap_t pmap, vm_offset_t va) 1958 { 1959 vm_paddr_t pa; 1960 pt1_entry_t pte1; 1961 pt2_entry_t *pte2p; 1962 1963 PMAP_LOCK(pmap); 1964 pte1 = pte1_load(pmap_pte1(pmap, va)); 1965 if (pte1_is_section(pte1)) 1966 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1967 else if (pte1_is_link(pte1)) { 1968 pte2p = pmap_pte2(pmap, va); 1969 pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); 1970 pmap_pte2_release(pte2p); 1971 } else 1972 pa = 0; 1973 PMAP_UNLOCK(pmap); 1974 return (pa); 1975 } 1976 1977 /* 1978 * Routine: pmap_extract_and_hold 1979 * Function: 1980 * Atomically extract and hold the physical page 1981 * with the given pmap and virtual address pair 1982 * if that mapping permits the given protection. 1983 */ 1984 vm_page_t 1985 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1986 { 1987 vm_paddr_t pa; 1988 pt1_entry_t pte1; 1989 pt2_entry_t pte2, *pte2p; 1990 vm_page_t m; 1991 1992 m = NULL; 1993 PMAP_LOCK(pmap); 1994 pte1 = pte1_load(pmap_pte1(pmap, va)); 1995 if (pte1_is_section(pte1)) { 1996 if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { 1997 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1998 m = PHYS_TO_VM_PAGE(pa); 1999 if (!vm_page_wire_mapped(m)) 2000 m = NULL; 2001 } 2002 } else if (pte1_is_link(pte1)) { 2003 pte2p = pmap_pte2(pmap, va); 2004 pte2 = pte2_load(pte2p); 2005 pmap_pte2_release(pte2p); 2006 if (pte2_is_valid(pte2) && 2007 (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { 2008 pa = pte2_pa(pte2); 2009 m = PHYS_TO_VM_PAGE(pa); 2010 if (!vm_page_wire_mapped(m)) 2011 m = NULL; 2012 } 2013 } 2014 PMAP_UNLOCK(pmap); 2015 return (m); 2016 } 2017 2018 /* 2019 * Grow the number of kernel L2 page table entries, if needed. 
 *
 * Advances kernel_vm_end in PTE1_SIZE steps up to addr (rounded up to
 * PTE1_SIZE and clamped to vm_map_max(kernel_map)), linking an L2 page
 * table from the kernel L1 table for each step that has no valid L1 entry.
 * Panics if a new PT2 page cannot be allocated.
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_page_t m;
	vm_paddr_t pt2pg_pa, pt2_pa;
	pt1_entry_t pte1;
	pt2_entry_t pte2;

	PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr));
	/*
	 * All the time kernel_vm_end is first KVA for which underlying
	 * L2 page table is either not allocated or linked from L1 page table
	 * (not considering sections). Except for two possible cases:
	 *
	 *   (1) in the very beginning as long as pmap_growkernel() was
	 *       not called, it could be first unused KVA (which is not
	 *       rounded up to PTE1_SIZE),
	 *
	 *   (2) when all KVA space is mapped and vm_map_max(kernel_map)
	 *       address is not rounded up to PTE1_SIZE. (For example,
	 *       it could be 0xFFFFFFFF.)
	 */
	kernel_vm_end = pte1_roundup(kernel_vm_end);
	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	addr = roundup2(addr, PTE1_SIZE);
	/* "addr - 1" keeps the comparison valid if the roundup wrapped to 0. */
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		pte1 = pte1_load(kern_pte1(kernel_vm_end));
		if (pte1_is_valid(pte1)) {
			/* Already mapped at L1 level; just step over it. */
			kernel_vm_end += PTE1_SIZE;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		/*
		 * kernel_vm_end_new is used in pmap_pinit() when kernel
		 * mappings are entered to new pmap all at once to avoid race
		 * between pmap_kenter_pte1() and kernel_vm_end increase.
		 * The same applies to pmap_kenter_pt2tab().
		 */
		kernel_vm_end_new = kernel_vm_end + PTE1_SIZE;

		pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end));
		if (!pte2_is_valid(pte2)) {
			/*
			 * Install new PT2s page into kernel PT2TAB.
			 */
			m = vm_page_alloc(NULL,
			    pte1_index(kernel_vm_end) & ~PT2PG_MASK,
			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (m == NULL)
				panic("%s: no memory to grow kernel", __func__);
			/*
			 * QQQ: To link all new L2 page tables from L1 page
			 *      table now and so pmap_kenter_pte1() them
			 *      at once together with pmap_kenter_pt2tab()
			 *      could be nice speed up. However,
			 *      pmap_growkernel() does not happen so often...
			 * QQQ: The other TTBR is another option.
			 */
			pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end,
			    m);
		} else
			pt2pg_pa = pte2_pa(pte2);

		/* Link the L2 table for this PTE1_SIZE step from the L1 table. */
		pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end));
		pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa));

		kernel_vm_end = kernel_vm_end_new;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
}

/*
 * Sysctl handler for vm.kvm_size: reports
 * vm_max_kernel_address - KERNBASE.
 */
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = vm_max_kernel_address - KERNBASE;

	return (sysctl_handle_long(oidp, &ksize, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size,
    CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_size, "IU",
    "Size of KVM");

/*
 * Sysctl handler for vm.kvm_free: reports
 * vm_max_kernel_address - kernel_vm_end.
 */
static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = vm_max_kernel_address - kernel_vm_end;

	return (sysctl_handle_long(oidp, &kfree, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free,
    CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_free, "IU",
    "Amount of KVM free");

/***********************************************
 *
 *  Pmap allocation/deallocation routines.
 *
 ***********************************************/

/*
 *  Initialize the pmap for the swapper process.
 */
void
pmap_pinit0(pmap_t pmap)
{
	PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap));

	PMAP_LOCK_INIT(pmap);

	/*
	 * Kernel page table directory and pmap stuff around is already
	 * initialized, we are using it right now and here. So, finish
	 * only PMAP structures initialization for process0 ...
	 *
	 * Since the L1 page table and PT2TAB is shared with the kernel pmap,
	 * which is already included in the list "allpmaps", this pmap does
	 * not need to be inserted into that list.
	 */
	pmap->pm_pt1 = kern_pt1;
	pmap->pm_pt2tab = kern_pt2tab;
	CPU_ZERO(&pmap->pm_active);
	PCPU_SET(curpmap, pmap);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	CPU_SET(0, &pmap->pm_active);
}

/*
 * Copy the L1 entries with pte1 indexes from pte1_index(sva) through
 * pte1_index(eva), inclusive, from table spte1p to table dpte1p.
 * Only a bcopy() is done here; pmap_pinit() syncs the tables itself
 * afterwards.
 */
static __inline void
pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva,
    vm_offset_t eva)
{
	u_int idx, count;

	idx = pte1_index(sva);
	count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t);
	bcopy(spte1p + idx, dpte1p + idx, count);
}

/*
 * Copy the PT2TAB entries with indexes from pt2tab_index(sva) through
 * pt2tab_index(eva), inclusive, from table spte2p to table dpte2p.
 * Only a bcopy() is done here; pmap_pinit() syncs the tables itself
 * afterwards.
 */
static __inline void
pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva,
    vm_offset_t eva)
{
	u_int idx, count;

	idx = pt2tab_index(sva);
	count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t);
	bcopy(spte2p + idx, dpte2p + idx, count);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 *
 * Returns 1 on success and 0 if the L1 page table or PT2TAB could not
 * be allocated.  Tables already present in the pmap are reused.
 */
int
pmap_pinit(pmap_t pmap)
{
	pt1_entry_t *pte1p;
	pt2_entry_t *pte2p;
	vm_paddr_t pa, pt2tab_pa;
	u_int i;

	PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap,
	    pmap->pm_pt1));

	/*
	 * No need to allocate L2 page table space yet but we do need
	 * a valid L1 page table and PT2TAB table.
	 *
	 * Install shared kernel mappings to these tables. It's a little
	 * tricky as some parts of KVA are reserved for vectors, devices,
	 * and whatever else. These parts are supposed to be above
	 * vm_max_kernel_address. Thus two regions should be installed:
	 *
	 *   (1) <KERNBASE, kernel_vm_end),
	 *   (2) <vm_max_kernel_address, 0xFFFFFFFF>.
	 *
	 * QQQ: The second region should be stable enough to be installed
	 *      only once in time when the tables are allocated.
	 * QQQ: Maybe copy of both regions at once could be faster ...
	 * QQQ: Maybe the other TTBR is an option.
	 *
	 * Finally, install own PT2TAB table to these tables.
	 */

	if (pmap->pm_pt1 == NULL) {
		pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(NB_IN_PT1,
		    M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr);
		if (pmap->pm_pt1 == NULL)
			return (0);
	}
	if (pmap->pm_pt2tab == NULL) {
		/*
		 * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page
		 *      only, what should be the only size for 32 bit systems,
		 *      then we could allocate it with vm_page_alloc() and all
		 *      the stuff needed as other L2 page table pages.
		 *      (2) Note that a process PT2TAB is special L2 page table
		 *      page. Its mapping in kernel_arena is permanent and can
		 *      be used no matter which process is current. Its mapping
		 *      in PT2MAP can be used only for current process.
		 */
		pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(NB_IN_PT2TAB,
		    M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr);
		if (pmap->pm_pt2tab == NULL) {
			/*
			 * QQQ: As struct pmap is allocated from UMA with
			 *      UMA_ZONE_NOFREE flag, it's important to leave
			 *      no allocation in pmap if initialization failed.
			 */
			kmem_free((vm_offset_t)pmap->pm_pt1, NB_IN_PT1);
			pmap->pm_pt1 = NULL;
			return (0);
		}
		/*
		 * QQQ: Each L2 page table page vm_page_t has pindex set to
		 *      pte1 index of virtual address mapped by this page.
		 *      It's not valid for non kernel PT2TABs themselves.
		 *      The pindex of these pages can not be altered because
		 *      of the way how they are allocated now. However, it
		 *      should not be a problem.
		 */
	}

	mtx_lock_spin(&allpmaps_lock);
	/*
	 * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(),
	 * kernel_vm_end_new is used here instead of kernel_vm_end.
	 */
	pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE,
	    kernel_vm_end_new - 1);
	pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address,
	    0xFFFFFFFF);
	pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE,
	    kernel_vm_end_new - 1);
	pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address,
	    0xFFFFFFFF);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	/*
	 * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself.
	 * I.e. self reference mapping.  The PT2TAB is private, however mapped
	 * into shared PT2MAP space, so the mapping should be not global.
	 */
	pt2tab_pa = vtophys(pmap->pm_pt2tab);
	pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) {
		pt2tab_store(pte2p++, PTE2_KPT_NG(pa));
	}

	/* Insert PT2MAP PT2s into pmap PT1. */
	pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) {
		pte1_store(pte1p++, PTE1_LINK(pa));
	}

	/*
	 * Now synchronize new mapping which was made above.
	 */
	pte1_sync_range(pmap->pm_pt1, NB_IN_PT1);
	pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB);

	CPU_ZERO(&pmap->pm_active);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);

	return (1);
}

#ifdef INVARIANTS
/*
 * Return TRUE if every PT2TAB entry below pt2tab_index(VM_MAXUSER_ADDRESS)
 * is zero.  Only used by the assertion in pmap_release().
 */
static boolean_t
pt2tab_user_is_empty(pt2_entry_t *tab)
{
	u_int i, end;

	end = pt2tab_index(VM_MAXUSER_ADDRESS);
	for (i = 0; i < end; i++)
		if (tab[i] != 0) return (FALSE);
	return (TRUE);
}
#endif
/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
#ifdef INVARIANTS
	vm_offset_t start, end;
#endif
	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("%s: pmap resident count %ld != 0", __func__,
	    pmap->pm_stats.resident_count));
	KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab),
	    ("%s: has allocated user PT2(s)", __func__));
	KASSERT(CPU_EMPTY(&pmap->pm_active),
	    ("%s: pmap %p is active on some CPU(s)", __func__, pmap));

	mtx_lock_spin(&allpmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

#ifdef INVARIANTS
	/* Wipe the entries from KERNBASE to the end of both tables. */
	start = pte1_index(KERNBASE) * sizeof(pt1_entry_t);
	end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t);
	bzero((char *)pmap->pm_pt1 + start, end - start);

	start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t);
	end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t);
	bzero((char *)pmap->pm_pt2tab + start, end - start);
#endif
	/*
	 * We are leaving PT1 and PT2TAB allocated on released pmap,
	 * so hopefully UMA vmspace_zone will always be inited with
	 * UMA_ZONE_NOFREE flag.
	 */
}

/*********************************************************
 *
 *  L2 table pages and their pages management routines.
 *
 *********************************************************/

/*
 *  Virtual interface for L2 page table wire counting.
 *
 *  Each L2 page table in a page has own counter which counts a number of
 *  valid mappings in a table. Global page counter counts mappings in all
 *  tables in a page plus a single itself mapping in PT2TAB.
 *
 *  During a promotion we leave the associated L2 page table counter
 *  untouched, so the table (strictly speaking a page which holds it)
 *  is never freed if promoted.
 *
 *  If a page m->ref_count == 1 then no valid mappings exist in any L2 page
 *  table in the page and the page itself is only mapped in PT2TAB.
 */

/*
 * Zero the per-table counters of all NPT2_IN_PG L2 page tables in page m.
 */
static __inline void
pt2_wirecount_init(vm_page_t m)
{
	u_int i;

	/*
	 * Note: A page m is allocated with VM_ALLOC_WIRED flag and
	 *       m->ref_count should be already set correctly.
	 *       So, there is no need to set it again herein.
	 */
	for (i = 0; i < NPT2_IN_PG; i++)
		m->md.pt2_wirecount[i] = 0;
}

/*
 * Add one reference to both the page counter (m->ref_count) and the
 * counter of the L2 page table selected by pte1_idx & PT2PG_MASK.
 */
static __inline void
pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx)
{

	/*
	 * Note: A just modified pte2 (i.e. already allocated)
	 *       is acquiring one extra reference which must be
	 *       explicitly cleared. It influences the KASSERTs herein.
	 *       All L2 page tables in a page always belong to the same
	 *       pmap, so we allow only one extra reference for the page.
	 */
	KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1),
	    ("%s: PT2 is overflowing ...", __func__));
	KASSERT(m->ref_count <= (NPTE2_IN_PG + 1),
	    ("%s: PT2PG is overflowing ...", __func__));

	m->ref_count++;
	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++;
}

/*
 * Drop one reference from both the page counter (m->ref_count) and the
 * counter of the L2 page table selected by pte1_idx & PT2PG_MASK.
 */
static __inline void
pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx)
{

	KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0,
	    ("%s: PT2 is underflowing ...", __func__));
	KASSERT(m->ref_count > 1,
	    ("%s: PT2PG is underflowing ...", __func__));

	m->ref_count--;
	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--;
}

/*
 * Set the counter of the L2 page table selected by pte1_idx & PT2PG_MASK
 * to count, adjusting m->ref_count by the same difference.
 */
static __inline void
pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count)
{

	KASSERT(count <= NPTE2_IN_PT2,
	    ("%s: invalid count %u", __func__, count));
	KASSERT(m->ref_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK],
	    ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->ref_count,
	    m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]));

	m->ref_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK];
	m->ref_count += count;
	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count;

	KASSERT(m->ref_count <= (NPTE2_IN_PG + 1),
	    ("%s: PT2PG is overflowed (%u) ...", __func__, m->ref_count));
}

/*
 * Return the counter of the L2 page table selected by
 * pte1_idx & PT2PG_MASK.
 */
static __inline uint32_t
pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx)
{

	return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]);
}

/*
 * Return TRUE if the counter of the L2 page table for va is zero.
 */
static __inline boolean_t
pt2_is_empty(vm_page_t m, vm_offset_t va)
{

	return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0);
}

/*
 * Return TRUE if the counter of the L2 page table for va equals
 * NPTE2_IN_PT2.
 */
static __inline boolean_t
pt2_is_full(vm_page_t m, vm_offset_t va)
{

	return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] ==
	    NPTE2_IN_PT2);
}

/*
 * Return TRUE if the page counter is 1, i.e. (per the interface
 * description above) the page is only mapped in PT2TAB.
 */
static __inline boolean_t
pt2pg_is_empty(vm_page_t m)
{

	return (m->ref_count == 1);
}

/*
 *  This routine is called if the L2 page table
 *  is not mapped correctly.
 *
 *  Returns the page holding the L2 page table for va, or NULL when a new
 *  page could not be allocated.  Unless PMAP_ENTER_NOSLEEP is set, the
 *  pmap lock and pvh_global_lock are dropped around vm_wait() before NULL
 *  is returned.
 */
static vm_page_t
_pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags)
{
	uint32_t pte1_idx;
	pt1_entry_t *pte1p;
	pt2_entry_t pte2;
	vm_page_t m;
	vm_paddr_t pt2pg_pa, pt2_pa;

	pte1_idx = pte1_index(va);
	pte1p = pmap->pm_pt1 + pte1_idx;

	KASSERT(pte1_load(pte1p) == 0,
	    ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx,
	    pte1_load(pte1p)));

	pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va));
	if (!pte2_is_valid(pte2)) {
		/*
		 * Install new PT2s page into pmap PT2TAB.
		 */
		m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK,
		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
		if (m == NULL) {
			if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
				PMAP_UNLOCK(pmap);
				rw_wunlock(&pvh_global_lock);
				vm_wait(NULL);
				rw_wlock(&pvh_global_lock);
				PMAP_LOCK(pmap);
			}

			/*
			 * Indicate the need to retry.  While waiting,
			 * the L2 page table page may have been allocated.
			 */
			return (NULL);
		}
		pmap->pm_stats.resident_count++;
		pt2pg_pa = pmap_pt2pg_init(pmap, va, m);
	} else {
		/* The PT2 page already exists; only the L1 link is missing. */
		pt2pg_pa = pte2_pa(pte2);
		m = PHYS_TO_VM_PAGE(pt2pg_pa);
	}

	pt2_wirecount_inc(m, pte1_idx);
	pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx);
	pte1_store(pte1p, PTE1_LINK(pt2_pa));

	return (m);
}

/*
 * Return the page holding the L2 page table for va with its wire count
 * incremented, allocating and linking the table when the L1 entry is not
 * a link.  A section mapping is demoted first.  Returns NULL only when
 * PMAP_ENTER_NOSLEEP is set and the allocation failed; otherwise the
 * allocation is retried.
 */
static vm_page_t
pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags)
{
	u_int pte1_idx;
	pt1_entry_t *pte1p, pte1;
	vm_page_t m;

	pte1_idx = pte1_index(va);
retry:
	pte1p = pmap->pm_pt1 + pte1_idx;
	pte1 = pte1_load(pte1p);

	/*
	 * This supports switching from a 1MB page to a
	 * normal 4K page.
	 */
	if (pte1_is_section(pte1)) {
		(void)pmap_demote_pte1(pmap, pte1p, va);
		/*
		 * Reload pte1 after demotion.
		 *
		 * Note: Demotion can even fail as either PT2 is not found for
		 *       the virtual address or PT2PG can not be allocated.
		 */
		pte1 = pte1_load(pte1p);
	}

	/*
	 * If the L2 page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (pte1_is_link(pte1)) {
		m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
		pt2_wirecount_inc(m, pte1_idx);
	} else {
		/*
		 * Here if the PT2 isn't mapped, or if it has
		 * been deallocated.
		 */
		m = _pmap_allocpte2(pmap, va, flags);
		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
			goto retry;
	}

	return (m);
}

/*
 * Schedule the specified unused L2 page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free)
{

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
#ifdef PMAP_DEBUG
	pmap_zero_page_check(m);
#endif
	m->flags |= PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 *  Unwire L2 page tables page.
 *
 *  The page must be empty (see pt2pg_is_empty()).  All of its L2 page
 *  tables are unlinked from the L1 page table, the page is unmapped from
 *  PT2TAB, and its ref_count is set to zero.  The page itself is not
 *  freed here.
 */
static void
pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pt1_entry_t *pte1p, opte1 __unused;
	pt2_entry_t *pte2p;
	uint32_t i;

	KASSERT(pt2pg_is_empty(m),
	    ("%s: pmap %p PT2PG %p wired", __func__, pmap, m));

	/*
	 *  Unmap all L2 page tables in the page from L1 page table.
	 *
	 *  QQQ: Individual L2 page tables (except the last one) can be unmapped
	 *  earlier. However, we are doing that this way.
	 */
	KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK),
	    ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m));
	pte1p = pmap->pm_pt1 + m->pindex;
	for (i = 0; i < NPT2_IN_PG; i++, pte1p++) {
		KASSERT(m->md.pt2_wirecount[i] == 0,
		    ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m));
		opte1 = pte1_load(pte1p);
		if (pte1_is_link(opte1)) {
			pte1_clear(pte1p);
			/*
			 * Flush intermediate TLB cache.
			 */
			pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT);
		}
#ifdef INVARIANTS
		else
			KASSERT((opte1 == 0) || pte1_is_section(opte1),
			    ("%s: pmap %p va %#x bad pte1 %x at %u", __func__,
			    pmap, va, opte1, i));
#endif
	}

	/*
	 * Unmap the page from PT2TAB.
	 */
	pte2p = pmap_pt2tab_entry(pmap, va);
	(void)pt2tab_load_clear(pte2p);
	pmap_tlb_flush(pmap, pt2map_pt2pg(va));

	m->ref_count = 0;
	pmap->pm_stats.resident_count--;

	/*
	 * This barrier is so that the ordinary store unmapping
	 * the L2 page table page is globally performed before TLB shoot-
	 * down is begun.
	 */
	wmb();
	vm_wire_sub(1);
}

/*
 *  Decrements a L2 page table page's wire count, which is used to record the
 *  number of valid page table entries within the page.  If the wire count
 *  drops to zero, then the page table page is unmapped.  Returns TRUE if the
 *  page table page was unmapped and FALSE otherwise.
 */
static __inline boolean_t
pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	pt2_wirecount_dec(m, pte1_index(va));
	if (pt2pg_is_empty(m)) {
		/*
		 * QQQ: Wire count is zero, so whole page should be zero and
		 *      we can set PG_ZERO flag to it.
		 *      Note that when promotion is enabled, it takes some
		 *      more efforts. See pmap_unwire_pt2_all() below.
		 */
		pmap_unwire_pt2pg(pmap, va, m);
		pmap_add_delayed_free_list(m, free);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 *  Drop a L2 page table page's wire count at once, which is used to record
 *  the number of valid L2 page table entries within the page. If the wire
 *  count drops to zero, then the L2 page table page is unmapped.
 */
static __inline void
pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free)
{
	u_int pte1_idx = pte1_index(va);

	KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK),
	    ("%s: PT2 page's pindex is wrong", __func__));
	KASSERT(m->ref_count > pt2_wirecount_get(m, pte1_idx),
	    ("%s: bad pt2 wire count %u > %u", __func__, m->ref_count,
	    pt2_wirecount_get(m, pte1_idx)));

	/*
	 * It's possible that the L2 page table was never used.
	 * It happened in case that a section was created without promotion.
	 */
	if (pt2_is_full(m, va)) {
		pt2_wirecount_set(m, pte1_idx, 0);

		/*
		 * QQQ: We clear L2 page table now, so when L2 page table page
		 *      is going to be freed, we can set it PG_ZERO flag ...
		 *      This function is called only on section mappings, so
		 *      hopefully it's not too big overload.
		 *
		 * XXX: If pmap is current, existing PT2MAP mapping could be
		 *      used for zeroing.
		 */
		pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2);
	}
#ifdef INVARIANTS
	else
		KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)",
		    __func__, pt2_wirecount_get(m, pte1_idx)));
#endif
	if (pt2pg_is_empty(m)) {
		pmap_unwire_pt2pg(pmap, va, m);
		pmap_add_delayed_free_list(m, free);
	}
}

/*
 * After removing a L2 page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 *
 * Nothing is done, and FALSE is returned, for addresses at or above
 * VM_MAXUSER_ADDRESS.  Otherwise returns the result of pmap_unwire_pt2().
 */
static boolean_t
pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free)
{
	pt1_entry_t pte1;
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (FALSE);
	pte1 = pte1_load(pmap_pte1(pmap, va));
	mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
	return (pmap_unwire_pt2(pmap, va, mpte, free));
}

/*************************************
 *
 *  Page management routines.
 *
 *************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 11);
CTASSERT(_NPCPV == 336);

/*
 * Return the pv chunk containing pv, found by masking the pv entry's
 * address down to a page boundary (a chunk is exactly PAGE_SIZE bytes,
 * see the CTASSERT above).
 */
static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */

/*
 * Value of each pc_map[] word when every pv entry it covers is free:
 * ten full 32-bit words plus 16 bits, matching _NPCPV == 336.
 */
static const uint32_t pc_freemask[_NPCM] = {
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE10
};

SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
    "Current number of pv entries");

#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
    "Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
    "Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail,
    0, "Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
    "Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs,
    0, "Current number of pv entry allocs");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
    "Current number of spare pv entries");
#endif

/*
 *  Is given page managed?
 *
 *  Returns false when no vm_page exists for pa or when the page has
 *  VPO_UNMANAGED set.
 */
static __inline bool
is_managed(vm_paddr_t pa)
{
	vm_page_t m;

	m = PHYS_TO_VM_PAGE(pa);
	if (m == NULL)
		return (false);
	return ((m->oflags & VPO_UNMANAGED) == 0);
}

/* Is the page addressed by the given L1 entry managed? */
static __inline bool
pte1_is_managed(pt1_entry_t pte1)
{

	return (is_managed(pte1_pa(pte1)));
}

/* Is the page addressed by the given L2 entry managed? */
static __inline bool
pte2_is_managed(pt2_entry_t pte2)
{

	return (is_managed(pte2_pa(pte2)));
}

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
2823 */ 2824 static vm_page_t 2825 pmap_pv_reclaim(pmap_t locked_pmap) 2826 { 2827 struct pch newtail; 2828 struct pv_chunk *pc; 2829 struct md_page *pvh; 2830 pt1_entry_t *pte1p; 2831 pmap_t pmap; 2832 pt2_entry_t *pte2p, tpte2; 2833 pv_entry_t pv; 2834 vm_offset_t va; 2835 vm_page_t m, m_pc; 2836 struct spglist free; 2837 uint32_t inuse; 2838 int bit, field, freed; 2839 2840 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2841 pmap = NULL; 2842 m_pc = NULL; 2843 SLIST_INIT(&free); 2844 TAILQ_INIT(&newtail); 2845 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2846 SLIST_EMPTY(&free))) { 2847 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2848 if (pmap != pc->pc_pmap) { 2849 if (pmap != NULL) { 2850 if (pmap != locked_pmap) 2851 PMAP_UNLOCK(pmap); 2852 } 2853 pmap = pc->pc_pmap; 2854 /* Avoid deadlock and lock recursion. */ 2855 if (pmap > locked_pmap) 2856 PMAP_LOCK(pmap); 2857 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2858 pmap = NULL; 2859 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2860 continue; 2861 } 2862 } 2863 2864 /* 2865 * Destroy every non-wired, 4 KB page mapping in the chunk. 
2866 */ 2867 freed = 0; 2868 for (field = 0; field < _NPCM; field++) { 2869 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2870 inuse != 0; inuse &= ~(1UL << bit)) { 2871 bit = ffs(inuse) - 1; 2872 pv = &pc->pc_pventry[field * 32 + bit]; 2873 va = pv->pv_va; 2874 pte1p = pmap_pte1(pmap, va); 2875 if (pte1_is_section(pte1_load(pte1p))) 2876 continue; 2877 pte2p = pmap_pte2(pmap, va); 2878 tpte2 = pte2_load(pte2p); 2879 if ((tpte2 & PTE2_W) == 0) 2880 tpte2 = pte2_load_clear(pte2p); 2881 pmap_pte2_release(pte2p); 2882 if ((tpte2 & PTE2_W) != 0) 2883 continue; 2884 KASSERT(tpte2 != 0, 2885 ("pmap_pv_reclaim: pmap %p va %#x zero pte", 2886 pmap, va)); 2887 pmap_tlb_flush(pmap, va); 2888 m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); 2889 if (pte2_is_dirty(tpte2)) 2890 vm_page_dirty(m); 2891 if ((tpte2 & PTE2_A) != 0) 2892 vm_page_aflag_set(m, PGA_REFERENCED); 2893 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2894 if (TAILQ_EMPTY(&m->md.pv_list) && 2895 (m->flags & PG_FICTITIOUS) == 0) { 2896 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2897 if (TAILQ_EMPTY(&pvh->pv_list)) { 2898 vm_page_aflag_clear(m, 2899 PGA_WRITEABLE); 2900 } 2901 } 2902 pc->pc_map[field] |= 1UL << bit; 2903 pmap_unuse_pt2(pmap, va, &free); 2904 freed++; 2905 } 2906 } 2907 if (freed == 0) { 2908 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2909 continue; 2910 } 2911 /* Every freed mapping is for a 4 KB page. */ 2912 pmap->pm_stats.resident_count -= freed; 2913 PV_STAT(pv_entry_frees += freed); 2914 PV_STAT(pv_entry_spare += freed); 2915 pv_entry_count -= freed; 2916 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2917 for (field = 0; field < _NPCM; field++) 2918 if (pc->pc_map[field] != pc_freemask[field]) { 2919 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2920 pc_list); 2921 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2922 2923 /* 2924 * One freed pv entry in locked_pmap is 2925 * sufficient. 
2926 */ 2927 if (pmap == locked_pmap) 2928 goto out; 2929 break; 2930 } 2931 if (field == _NPCM) { 2932 PV_STAT(pv_entry_spare -= _NPCPV); 2933 PV_STAT(pc_chunk_count--); 2934 PV_STAT(pc_chunk_frees++); 2935 /* Entire chunk is free; return it. */ 2936 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2937 pmap_qremove((vm_offset_t)pc, 1); 2938 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2939 break; 2940 } 2941 } 2942 out: 2943 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2944 if (pmap != NULL) { 2945 if (pmap != locked_pmap) 2946 PMAP_UNLOCK(pmap); 2947 } 2948 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2949 m_pc = SLIST_FIRST(&free); 2950 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2951 /* Recycle a freed page table page. */ 2952 m_pc->ref_count = 1; 2953 vm_wire_add(1); 2954 } 2955 vm_page_free_pages_toq(&free, false); 2956 return (m_pc); 2957 } 2958 2959 static void 2960 free_pv_chunk(struct pv_chunk *pc) 2961 { 2962 vm_page_t m; 2963 2964 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2965 PV_STAT(pv_entry_spare -= _NPCPV); 2966 PV_STAT(pc_chunk_count--); 2967 PV_STAT(pc_chunk_frees++); 2968 /* entire chunk is free, return it */ 2969 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2970 pmap_qremove((vm_offset_t)pc, 1); 2971 vm_page_unwire_noq(m); 2972 vm_page_free(m); 2973 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2974 } 2975 2976 /* 2977 * Free the pv_entry back to the free list. 
 *
 *  Requires pvh_global_lock write-locked and the pmap lock held.  If the
 *  chunk becomes entirely free it is removed from the pmap's chunk list
 *  and released with free_pv_chunk().
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(pv_entry_frees++);
	PV_STAT(pv_entry_spare++);
	pv_entry_count--;
	pc = pv_to_chunk(pv);
	/* Mark the entry's bit as free in the chunk's bitmap. */
	idx = pv - &pc->pc_pventry[0];
	field = idx / 32;
	bit = idx % 32;
	pc->pc_map[field] |= 1ul << bit;
	for (idx = 0; idx < _NPCM; idx++)
		if (pc->pc_map[idx] != pc_freemask[idx]) {
			/*
			 * 98% of the time, pc is already at the head of the
			 * list.  If it isn't already, move it to the head.
			 */
			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
			    pc)) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			return;
		}
	/* Every entry in the chunk is free; release the whole chunk. */
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

/*
 *  Get a new pv_entry, allocating a block from the system
 *  when needed.
 *
 *  Requires pvh_global_lock write-locked and the pmap lock held.  When
 *  "try" is set, NULL is returned instead of reclaiming if no chunk page
 *  can be obtained; otherwise pmap_pv_reclaim() is used and the call does
 *  not return NULL.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, boolean_t try)
{
	static const struct timeval printinterval = { 60, 0 };
	static struct timeval lastprint;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(pv_entry_allocs++);
	pv_entry_count++;
	if (pv_entry_count > pv_entry_high_water)
		if (ratecheck(&lastprint, &printinterval))
			printf("Approaching the limit on PV entries, consider "
			    "increasing either the vm.pmap.shpgperproc or the "
			    "vm.pmap.pv_entries tunable.\n");
retry:
	/* First try the chunk at the head of this pmap's chunk list. */
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffs(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 32 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			for (field = 0; field < _NPCM; field++)
				if (pc->pc_map[field] != 0) {
					PV_STAT(pv_entry_spare--);
					return (pv);	/* not full, return */
				}
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
			PV_STAT(pv_entry_spare--);
			return (pv);
		}
	}
	/*
	 * Access to the pte2list "pv_vafree" is synchronized by the pvh
	 * global lock.  If "pv_vafree" is currently non-empty, it will
	 * remain non-empty until pmap_pte2list_alloc() completes.
	 */
	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
		if (try) {
			pv_entry_count--;
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = pmap_pv_reclaim(pmap);
		if (m == NULL)
			goto retry;
	}
	/* Build a new chunk on page m; its entry 0 is handed out at once. */
	PV_STAT(pc_chunk_count++);
	PV_STAT(pc_chunk_allocs++);
	pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree);
	pmap_qenter((vm_offset_t)pc, &m, 1);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
	for (field = 1; field < _NPCM; field++)
		pc->pc_map[field] = pc_freemask[field];
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(pv_entry_spare += _NPCPV - 1);
	return (pv);
}

/*
 *  Create a pv entry for page at pa for
 *  (pmap, va).
 */
static void
pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pv = get_pv_entry(pmap, FALSE);
	pv->pv_va = va;
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
}

/*
 * Find the pv entry for (pmap, va) on pvh's pv list and unlink it from
 * that list.  Returns the entry, or NULL if none matches.  The entry is
 * not freed.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			break;
		}
	}
	return (pv);
}

/*
 * Unlink the pv entry for (pmap, va) from pvh's pv list and free it.
 * The entry is expected to exist (asserted).
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

/*
 * Free the pv entry for (pmap, va) on page m.  PGA_WRITEABLE is cleared
 * on the page when neither the page's own pv list nor the pv list
 * returned by pa_to_pvh() for it has entries left.
 */
static void
pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
{
	struct md_page *pvh;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	pmap_pvh_free(&m->md, pmap, va);
	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		if (TAILQ_EMPTY(&pvh->pv_list))
			vm_page_aflag_clear(m, PGA_WRITEABLE);
	}
}

/*
 * Convert the pv bookkeeping of a 1 MB mapping at va/pa into per-page
 * pv entries: the existing entry moves to the first page and a new entry
 * is created for each of the remaining pages.
 */
static void
pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT((pa & PTE1_OFFSET) == 0,
	    ("pmap_pv_demote_pte1: pa is not 1mpage aligned"));

	/*
	 * Transfer the 1mpage's pv entry for this mapping to the first
	 * page's pv list.
	 */
	pvh = pa_to_pvh(pa);
	va = pte1_trunc(va);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	/* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */
	va_last = va + PTE1_SIZE - PAGE_SIZE;
	do {
		m++;
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("pmap_pv_demote_pte1: page %p is not managed", m));
		va += PAGE_SIZE;
		pmap_insert_entry(pmap, va, m);
	} while (va < va_last);
}

#if VM_NRESERVLEVEL > 0
static void
pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT((pa & PTE1_OFFSET) == 0,
	    ("pmap_pv_promote_pte1: pa is not 1mpage aligned"));

	/*
	 * Transfer the first page's pv entry for this mapping to the
	 * 1mpage's pv list.
Aside from avoiding the cost of a call 3195 * to get_pv_entry(), a transfer avoids the possibility that 3196 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() 3197 * removes one of the mappings that is being promoted. 3198 */ 3199 m = PHYS_TO_VM_PAGE(pa); 3200 va = pte1_trunc(va); 3201 pv = pmap_pvh_remove(&m->md, pmap, va); 3202 KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); 3203 pvh = pa_to_pvh(pa); 3204 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3205 /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */ 3206 va_last = va + PTE1_SIZE - PAGE_SIZE; 3207 do { 3208 m++; 3209 va += PAGE_SIZE; 3210 pmap_pvh_free(&m->md, pmap, va); 3211 } while (va < va_last); 3212 } 3213 #endif 3214 3215 /* 3216 * Conditionally create a pv entry. 3217 */ 3218 static boolean_t 3219 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3220 { 3221 pv_entry_t pv; 3222 3223 rw_assert(&pvh_global_lock, RA_WLOCKED); 3224 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3225 if (pv_entry_count < pv_entry_high_water && 3226 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 3227 pv->pv_va = va; 3228 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3229 return (TRUE); 3230 } else 3231 return (FALSE); 3232 } 3233 3234 /* 3235 * Create the pv entries for each of the pages within a section. 
3236 */ 3237 static bool 3238 pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags) 3239 { 3240 struct md_page *pvh; 3241 pv_entry_t pv; 3242 bool noreclaim; 3243 3244 rw_assert(&pvh_global_lock, RA_WLOCKED); 3245 noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; 3246 if ((noreclaim && pv_entry_count >= pv_entry_high_water) || 3247 (pv = get_pv_entry(pmap, noreclaim)) == NULL) 3248 return (false); 3249 pv->pv_va = va; 3250 pvh = pa_to_pvh(pte1_pa(pte1)); 3251 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3252 return (true); 3253 } 3254 3255 static inline void 3256 pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) 3257 { 3258 3259 /* Kill all the small mappings or the big one only. */ 3260 if (pte1_is_section(npte1)) 3261 pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); 3262 else 3263 pmap_tlb_flush(pmap, pte1_trunc(va)); 3264 } 3265 3266 /* 3267 * Update kernel pte1 on all pmaps. 3268 * 3269 * The following function is called only on one cpu with disabled interrupts. 3270 * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way 3271 * nobody can invoke explicit hardware table walk during the update of pte1. 3272 * Unsolicited hardware table walk can still happen, invoked by speculative 3273 * data or instruction prefetch or even by speculative hardware table walk. 3274 * 3275 * The break-before-make approach should be implemented here. However, it's 3276 * not so easy to do that for kernel mappings as it would be unhappy to unmap 3277 * itself unexpectedly but voluntarily. 3278 */ 3279 static void 3280 pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) 3281 { 3282 pmap_t pmap; 3283 pt1_entry_t *pte1p; 3284 3285 /* 3286 * Get current pmap. Interrupts should be disabled here 3287 * so PCPU_GET() is done atomically. 3288 */ 3289 pmap = PCPU_GET(curpmap); 3290 if (pmap == NULL) 3291 pmap = kernel_pmap; 3292 3293 /* 3294 * (1) Change pte1 on current pmap. 
3295 * (2) Flush all obsolete TLB entries on current CPU. 3296 * (3) Change pte1 on all pmaps. 3297 * (4) Flush all obsolete TLB entries on all CPUs in SMP case. 3298 */ 3299 3300 pte1p = pmap_pte1(pmap, va); 3301 pte1_store(pte1p, npte1); 3302 3303 /* Kill all the small mappings or the big one only. */ 3304 if (pte1_is_section(npte1)) { 3305 pmap_pte1_kern_promotions++; 3306 tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); 3307 } else { 3308 pmap_pte1_kern_demotions++; 3309 tlb_flush_local(pte1_trunc(va)); 3310 } 3311 3312 /* 3313 * In SMP case, this function is called when all cpus are at smp 3314 * rendezvous, so there is no need to use 'allpmaps_lock' lock here. 3315 * In UP case, the function is called with this lock locked. 3316 */ 3317 LIST_FOREACH(pmap, &allpmaps, pm_list) { 3318 pte1p = pmap_pte1(pmap, va); 3319 pte1_store(pte1p, npte1); 3320 } 3321 3322 #ifdef SMP 3323 /* Kill all the small mappings or the big one only. */ 3324 if (pte1_is_section(npte1)) 3325 tlb_flush_range(pte1_trunc(va), PTE1_SIZE); 3326 else 3327 tlb_flush(pte1_trunc(va)); 3328 #endif 3329 } 3330 3331 #ifdef SMP 3332 struct pte1_action { 3333 vm_offset_t va; 3334 pt1_entry_t npte1; 3335 u_int update; /* CPU that updates the PTE1 */ 3336 }; 3337 3338 static void 3339 pmap_update_pte1_action(void *arg) 3340 { 3341 struct pte1_action *act = arg; 3342 3343 if (act->update == PCPU_GET(cpuid)) 3344 pmap_update_pte1_kernel(act->va, act->npte1); 3345 } 3346 3347 /* 3348 * Change pte1 on current pmap. 3349 * Note that kernel pte1 must be changed on all pmaps. 3350 * 3351 * According to the architecture reference manual published by ARM, 3352 * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. 3353 * According to this manual, UNPREDICTABLE behaviours must never happen in 3354 * a viable system. 
In contrast, on x86 processors, it is not specified which 3355 * TLB entry mapping the virtual address will be used, but the MMU doesn't 3356 * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone 3357 * Black). 3358 * 3359 * It's a problem when either promotion or demotion is being done. The pte1 3360 * update and appropriate TLB flush must be done atomically in general. 3361 */ 3362 static void 3363 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3364 pt1_entry_t npte1) 3365 { 3366 3367 if (pmap == kernel_pmap) { 3368 struct pte1_action act; 3369 3370 sched_pin(); 3371 act.va = va; 3372 act.npte1 = npte1; 3373 act.update = PCPU_GET(cpuid); 3374 smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier, 3375 pmap_update_pte1_action, NULL, &act); 3376 sched_unpin(); 3377 } else { 3378 register_t cspr; 3379 3380 /* 3381 * Use break-before-make approach for changing userland 3382 * mappings. It can cause L1 translation aborts on other 3383 * cores in SMP case. So, special treatment is implemented 3384 * in pmap_fault(). To reduce the likelihood that another core 3385 * will be affected by the broken mapping, disable interrupts 3386 * until the mapping change is completed. 3387 */ 3388 cspr = disable_interrupts(PSR_I | PSR_F); 3389 pte1_clear(pte1p); 3390 pmap_tlb_flush_pte1(pmap, va, npte1); 3391 pte1_store(pte1p, npte1); 3392 restore_interrupts(cspr); 3393 } 3394 } 3395 #else 3396 static void 3397 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3398 pt1_entry_t npte1) 3399 { 3400 3401 if (pmap == kernel_pmap) { 3402 mtx_lock_spin(&allpmaps_lock); 3403 pmap_update_pte1_kernel(va, npte1); 3404 mtx_unlock_spin(&allpmaps_lock); 3405 } else { 3406 register_t cspr; 3407 3408 /* 3409 * Use break-before-make approach for changing userland 3410 * mappings. It's absolutely safe in UP case when interrupts 3411 * are disabled. 
3412 */ 3413 cspr = disable_interrupts(PSR_I | PSR_F); 3414 pte1_clear(pte1p); 3415 pmap_tlb_flush_pte1(pmap, va, npte1); 3416 pte1_store(pte1p, npte1); 3417 restore_interrupts(cspr); 3418 } 3419 } 3420 #endif 3421 3422 #if VM_NRESERVLEVEL > 0 3423 /* 3424 * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are 3425 * within a single page table page (PT2) to a single 1MB page mapping. 3426 * For promotion to occur, two conditions must be met: (1) the 4KB page 3427 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3428 * mappings must have identical characteristics. 3429 * 3430 * Managed (PG_MANAGED) mappings within the kernel address space are not 3431 * promoted. The reason is that kernel PTE1s are replicated in each pmap but 3432 * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only 3433 * read the PTE1 from the kernel pmap. 3434 */ 3435 static void 3436 pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3437 { 3438 pt1_entry_t npte1; 3439 pt2_entry_t *fpte2p, fpte2, fpte2_fav; 3440 pt2_entry_t *pte2p, pte2; 3441 vm_offset_t pteva __unused; 3442 vm_page_t m __unused; 3443 3444 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3445 pmap, va, pte1_load(pte1p), pte1p)); 3446 3447 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3448 3449 /* 3450 * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is 3451 * either invalid, unused, or does not map the first 4KB physical page 3452 * within a 1MB page. 
3453 */ 3454 fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); 3455 fpte2 = pte2_load(fpte2p); 3456 if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != 3457 (PTE2_A | PTE2_V)) { 3458 pmap_pte1_p_failures++; 3459 CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", 3460 __func__, va, pmap); 3461 return; 3462 } 3463 if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { 3464 pmap_pte1_p_failures++; 3465 CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", 3466 __func__, va, pmap); 3467 return; 3468 } 3469 if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3470 /* 3471 * When page is not modified, PTE2_RO can be set without 3472 * a TLB invalidation. 3473 */ 3474 fpte2 |= PTE2_RO; 3475 pte2_store(fpte2p, fpte2); 3476 } 3477 3478 /* 3479 * Examine each of the other PTE2s in the specified PT2. Abort if this 3480 * PTE2 maps an unexpected 4KB physical page or does not have identical 3481 * characteristics to the first PTE2. 3482 */ 3483 fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); 3484 fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ 3485 for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { 3486 pte2 = pte2_load(pte2p); 3487 if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { 3488 pmap_pte1_p_failures++; 3489 CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", 3490 __func__, va, pmap); 3491 return; 3492 } 3493 if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3494 /* 3495 * When page is not modified, PTE2_RO can be set 3496 * without a TLB invalidation. See note above. 
3497 */ 3498 pte2 |= PTE2_RO; 3499 pte2_store(pte2p, pte2); 3500 pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & 3501 PTE2_FRAME); 3502 CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", 3503 __func__, pteva, pmap); 3504 } 3505 if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { 3506 pmap_pte1_p_failures++; 3507 CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", 3508 __func__, va, pmap); 3509 return; 3510 } 3511 3512 fpte2_fav -= PTE2_SIZE; 3513 } 3514 /* 3515 * The page table page in its current state will stay in PT2TAB 3516 * until the PTE1 mapping the section is demoted by pmap_demote_pte1() 3517 * or destroyed by pmap_remove_pte1(). 3518 * 3519 * Note that L2 page table size is not equal to PAGE_SIZE. 3520 */ 3521 m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); 3522 KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], 3523 ("%s: PT2 page is out of range", __func__)); 3524 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 3525 ("%s: PT2 page's pindex is wrong", __func__)); 3526 3527 /* 3528 * Get pte1 from pte2 format. 3529 */ 3530 npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; 3531 3532 /* 3533 * Promote the pv entries. 3534 */ 3535 if (pte2_is_managed(fpte2)) 3536 pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); 3537 3538 /* 3539 * Promote the mappings. 3540 */ 3541 pmap_change_pte1(pmap, pte1p, va, npte1); 3542 3543 pmap_pte1_promotions++; 3544 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3545 __func__, va, pmap); 3546 3547 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3548 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3549 } 3550 #endif /* VM_NRESERVLEVEL > 0 */ 3551 3552 /* 3553 * Zero L2 page table page. 
3554 */ 3555 static __inline void 3556 pmap_clear_pt2(pt2_entry_t *fpte2p) 3557 { 3558 pt2_entry_t *pte2p; 3559 3560 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) 3561 pte2_clear(pte2p); 3562 3563 } 3564 3565 /* 3566 * Removes a 1MB page mapping from the kernel pmap. 3567 */ 3568 static void 3569 pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3570 { 3571 vm_page_t m; 3572 uint32_t pte1_idx; 3573 pt2_entry_t *fpte2p; 3574 vm_paddr_t pt2_pa; 3575 3576 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3577 m = pmap_pt2_page(pmap, va); 3578 if (m == NULL) 3579 /* 3580 * QQQ: Is this function called only on promoted pte1? 3581 * We certainly do section mappings directly 3582 * (without promotion) in kernel !!! 3583 */ 3584 panic("%s: missing pt2 page", __func__); 3585 3586 pte1_idx = pte1_index(va); 3587 3588 /* 3589 * Initialize the L2 page table. 3590 */ 3591 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3592 pmap_clear_pt2(fpte2p); 3593 3594 /* 3595 * Remove the mapping. 3596 */ 3597 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); 3598 pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); 3599 3600 /* 3601 * QQQ: We do not need to invalidate PT2MAP mapping 3602 * as we did not change it. I.e. the L2 page table page 3603 * was and still is mapped the same way. 3604 */ 3605 } 3606 3607 /* 3608 * Do the things to unmap a section in a process 3609 */ 3610 static void 3611 pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 3612 struct spglist *free) 3613 { 3614 pt1_entry_t opte1; 3615 struct md_page *pvh; 3616 vm_offset_t eva, va; 3617 vm_page_t m; 3618 3619 PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, 3620 pte1_load(pte1p), pte1p)); 3621 3622 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3623 KASSERT((sva & PTE1_OFFSET) == 0, 3624 ("%s: sva is not 1mpage aligned", __func__)); 3625 3626 /* 3627 * Clear and invalidate the mapping. It should occupy one and only TLB 3628 * entry. 
So, pmap_tlb_flush() called with aligned address should be 3629 * sufficient. 3630 */ 3631 opte1 = pte1_load_clear(pte1p); 3632 pmap_tlb_flush(pmap, sva); 3633 3634 if (pte1_is_wired(opte1)) 3635 pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; 3636 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 3637 if (pte1_is_managed(opte1)) { 3638 pvh = pa_to_pvh(pte1_pa(opte1)); 3639 pmap_pvh_free(pvh, pmap, sva); 3640 eva = sva + PTE1_SIZE; 3641 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 3642 va < eva; va += PAGE_SIZE, m++) { 3643 if (pte1_is_dirty(opte1)) 3644 vm_page_dirty(m); 3645 if (opte1 & PTE1_A) 3646 vm_page_aflag_set(m, PGA_REFERENCED); 3647 if (TAILQ_EMPTY(&m->md.pv_list) && 3648 TAILQ_EMPTY(&pvh->pv_list)) 3649 vm_page_aflag_clear(m, PGA_WRITEABLE); 3650 } 3651 } 3652 if (pmap == kernel_pmap) { 3653 /* 3654 * L2 page table(s) can't be removed from kernel map as 3655 * kernel counts on it (stuff around pmap_growkernel()). 3656 */ 3657 pmap_remove_kernel_pte1(pmap, pte1p, sva); 3658 } else { 3659 /* 3660 * Get associated L2 page table page. 3661 * It's possible that the page was never allocated. 3662 */ 3663 m = pmap_pt2_page(pmap, sva); 3664 if (m != NULL) 3665 pmap_unwire_pt2_all(pmap, sva, m, free); 3666 } 3667 } 3668 3669 /* 3670 * Fills L2 page table page with mappings to consecutive physical pages. 3671 */ 3672 static __inline void 3673 pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) 3674 { 3675 pt2_entry_t *pte2p; 3676 3677 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { 3678 pte2_store(pte2p, npte2); 3679 npte2 += PTE2_SIZE; 3680 } 3681 } 3682 3683 /* 3684 * Tries to demote a 1MB page mapping. If demotion fails, the 3685 * 1MB page mapping is invalidated. 
3686 */ 3687 static boolean_t 3688 pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3689 { 3690 pt1_entry_t opte1, npte1; 3691 pt2_entry_t *fpte2p, npte2; 3692 vm_paddr_t pt2pg_pa, pt2_pa; 3693 vm_page_t m; 3694 struct spglist free; 3695 uint32_t pte1_idx, isnew = 0; 3696 3697 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3698 pmap, va, pte1_load(pte1p), pte1p)); 3699 3700 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3701 3702 opte1 = pte1_load(pte1p); 3703 KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); 3704 3705 if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { 3706 KASSERT(!pte1_is_wired(opte1), 3707 ("%s: PT2 page for a wired mapping is missing", __func__)); 3708 3709 /* 3710 * Invalidate the 1MB page mapping and return 3711 * "failure" if the mapping was never accessed or the 3712 * allocation of the new page table page fails. 3713 */ 3714 if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, 3715 pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | 3716 VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { 3717 SLIST_INIT(&free); 3718 pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); 3719 vm_page_free_pages_toq(&free, false); 3720 CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", 3721 __func__, va, pmap); 3722 return (FALSE); 3723 } 3724 if (va < VM_MAXUSER_ADDRESS) 3725 pmap->pm_stats.resident_count++; 3726 3727 isnew = 1; 3728 3729 /* 3730 * We init all L2 page tables in the page even if 3731 * we are going to change everything for one L2 page 3732 * table in a while. 3733 */ 3734 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 3735 } else { 3736 if (va < VM_MAXUSER_ADDRESS) { 3737 if (pt2_is_empty(m, va)) 3738 isnew = 1; /* Demoting section w/o promotion. 
*/ 3739 #ifdef INVARIANTS 3740 else 3741 KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" 3742 " count %u", __func__, 3743 pt2_wirecount_get(m, pte1_index(va)))); 3744 #endif 3745 } 3746 } 3747 3748 pt2pg_pa = VM_PAGE_TO_PHYS(m); 3749 pte1_idx = pte1_index(va); 3750 /* 3751 * If the pmap is current, then the PT2MAP can provide access to 3752 * the page table page (promoted L2 page tables are not unmapped). 3753 * Otherwise, temporarily map the L2 page table page (m) into 3754 * the kernel's address space at either PADDR1 or PADDR2. 3755 * 3756 * Note that L2 page table size is not equal to PAGE_SIZE. 3757 */ 3758 if (pmap_is_current(pmap)) 3759 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3760 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 3761 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 3762 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 3763 #ifdef SMP 3764 PMAP1cpu = PCPU_GET(cpuid); 3765 #endif 3766 tlb_flush_local((vm_offset_t)PADDR1); 3767 PMAP1changed++; 3768 } else 3769 #ifdef SMP 3770 if (PMAP1cpu != PCPU_GET(cpuid)) { 3771 PMAP1cpu = PCPU_GET(cpuid); 3772 tlb_flush_local((vm_offset_t)PADDR1); 3773 PMAP1changedcpu++; 3774 } else 3775 #endif 3776 PMAP1unchanged++; 3777 fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); 3778 } else { 3779 mtx_lock(&PMAP2mutex); 3780 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 3781 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 3782 tlb_flush((vm_offset_t)PADDR2); 3783 } 3784 fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); 3785 } 3786 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 3787 npte1 = PTE1_LINK(pt2_pa); 3788 3789 KASSERT((opte1 & PTE1_A) != 0, 3790 ("%s: opte1 is missing PTE1_A", __func__)); 3791 KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, 3792 ("%s: opte1 has PTE1_NM", __func__)); 3793 3794 /* 3795 * Get pte2 from pte1 format. 3796 */ 3797 npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; 3798 3799 /* 3800 * If the L2 page table page is new, initialize it. 
If the mapping 3801 * has changed attributes, update the page table entries. 3802 */ 3803 if (isnew != 0) { 3804 pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); 3805 pmap_fill_pt2(fpte2p, npte2); 3806 } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != 3807 (npte2 & PTE2_PROMOTE)) 3808 pmap_fill_pt2(fpte2p, npte2); 3809 3810 KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), 3811 ("%s: fpte2p and npte2 map different physical addresses", 3812 __func__)); 3813 3814 if (fpte2p == PADDR2) 3815 mtx_unlock(&PMAP2mutex); 3816 3817 /* 3818 * Demote the mapping. This pmap is locked. The old PTE1 has 3819 * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also 3820 * has not PTE1_NM set. Thus, there is no danger of a race with 3821 * another processor changing the setting of PTE1_A and/or PTE1_NM 3822 * between the read above and the store below. 3823 */ 3824 pmap_change_pte1(pmap, pte1p, va, npte1); 3825 3826 /* 3827 * Demote the pv entry. This depends on the earlier demotion 3828 * of the mapping. Specifically, the (re)creation of a per- 3829 * page pv entry might trigger the execution of pmap_pv_reclaim(), 3830 * which might reclaim a newly (re)created per-page pv entry 3831 * and destroy the associated mapping. In order to destroy 3832 * the mapping, the PTE1 must have already changed from mapping 3833 * the 1mpage to referencing the page table page. 3834 */ 3835 if (pte1_is_managed(opte1)) 3836 pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); 3837 3838 pmap_pte1_demotions++; 3839 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3840 __func__, va, pmap); 3841 3842 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3843 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3844 return (TRUE); 3845 } 3846 3847 /* 3848 * Insert the given physical page (p) at 3849 * the specified virtual address (v) in the 3850 * target physical map with the protection requested. 
3851 * 3852 * If specified, the page will be wired down, meaning 3853 * that the related pte can not be reclaimed. 3854 * 3855 * NB: This is the only routine which MAY NOT lazy-evaluate 3856 * or lose information. That is, this routine must actually 3857 * insert this page into the given map NOW. 3858 */ 3859 int 3860 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3861 u_int flags, int8_t psind) 3862 { 3863 pt1_entry_t *pte1p; 3864 pt2_entry_t *pte2p; 3865 pt2_entry_t npte2, opte2; 3866 pv_entry_t pv; 3867 vm_paddr_t opa, pa; 3868 vm_page_t mpte2, om; 3869 int rv; 3870 3871 va = trunc_page(va); 3872 KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); 3873 KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, 3874 ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, 3875 va)); 3876 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 3877 va >= kmi.clean_eva, 3878 ("%s: managed mapping within the clean submap", __func__)); 3879 if ((m->oflags & VPO_UNMANAGED) == 0) 3880 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3881 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 3882 ("%s: flags %u has reserved bits set", __func__, flags)); 3883 pa = VM_PAGE_TO_PHYS(m); 3884 npte2 = PTE2(pa, PTE2_A, vm_page_pte2_attr(m)); 3885 if ((flags & VM_PROT_WRITE) == 0) 3886 npte2 |= PTE2_NM; 3887 if ((prot & VM_PROT_WRITE) == 0) 3888 npte2 |= PTE2_RO; 3889 KASSERT((npte2 & (PTE2_NM | PTE2_RO)) != PTE2_RO, 3890 ("%s: flags includes VM_PROT_WRITE but prot doesn't", __func__)); 3891 if ((prot & VM_PROT_EXECUTE) == 0) 3892 npte2 |= PTE2_NX; 3893 if ((flags & PMAP_ENTER_WIRED) != 0) 3894 npte2 |= PTE2_W; 3895 if (va < VM_MAXUSER_ADDRESS) 3896 npte2 |= PTE2_U; 3897 if (pmap != kernel_pmap) 3898 npte2 |= PTE2_NG; 3899 3900 rw_wlock(&pvh_global_lock); 3901 PMAP_LOCK(pmap); 3902 sched_pin(); 3903 if (psind == 1) { 3904 /* Assert the required virtual and physical alignment. 
*/ 3905 KASSERT((va & PTE1_OFFSET) == 0, 3906 ("%s: va unaligned", __func__)); 3907 KASSERT(m->psind > 0, ("%s: m->psind < psind", __func__)); 3908 rv = pmap_enter_pte1(pmap, va, PTE1_PA(pa) | ATTR_TO_L1(npte2) | 3909 PTE1_V, flags, m); 3910 goto out; 3911 } 3912 3913 /* 3914 * In the case that a page table page is not 3915 * resident, we are creating it here. 3916 */ 3917 if (va < VM_MAXUSER_ADDRESS) { 3918 mpte2 = pmap_allocpte2(pmap, va, flags); 3919 if (mpte2 == NULL) { 3920 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3921 ("pmap_allocpte2 failed with sleep allowed")); 3922 rv = KERN_RESOURCE_SHORTAGE; 3923 goto out; 3924 } 3925 } else 3926 mpte2 = NULL; 3927 pte1p = pmap_pte1(pmap, va); 3928 if (pte1_is_section(pte1_load(pte1p))) 3929 panic("%s: attempted on 1MB page", __func__); 3930 pte2p = pmap_pte2_quick(pmap, va); 3931 if (pte2p == NULL) 3932 panic("%s: invalid L1 page table entry va=%#x", __func__, va); 3933 3934 om = NULL; 3935 opte2 = pte2_load(pte2p); 3936 opa = pte2_pa(opte2); 3937 /* 3938 * Mapping has not changed, must be protection or wiring change. 3939 */ 3940 if (pte2_is_valid(opte2) && (opa == pa)) { 3941 /* 3942 * Wiring change, just update stats. We don't worry about 3943 * wiring PT2 pages as they remain resident as long as there 3944 * are valid mappings in them. Hence, if a user page is wired, 3945 * the PT2 page will be also. 3946 */ 3947 if (pte2_is_wired(npte2) && !pte2_is_wired(opte2)) 3948 pmap->pm_stats.wired_count++; 3949 else if (!pte2_is_wired(npte2) && pte2_is_wired(opte2)) 3950 pmap->pm_stats.wired_count--; 3951 3952 /* 3953 * Remove extra pte2 reference 3954 */ 3955 if (mpte2) 3956 pt2_wirecount_dec(mpte2, pte1_index(va)); 3957 if ((m->oflags & VPO_UNMANAGED) == 0) 3958 om = m; 3959 goto validate; 3960 } 3961 3962 /* 3963 * QQQ: We think that changing physical address on writeable mapping 3964 * is not safe. Well, maybe on kernel address space with correct 3965 * locking, it can make a sense. 
However, we have no idea why 3966 * anyone should do that on user address space. Are we wrong? 3967 */ 3968 KASSERT((opa == 0) || (opa == pa) || 3969 !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), 3970 ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", 3971 __func__, pmap, va, opte2, opa, pa, flags, prot)); 3972 3973 pv = NULL; 3974 3975 /* 3976 * Mapping has changed, invalidate old range and fall through to 3977 * handle validating new mapping. 3978 */ 3979 if (opa) { 3980 if (pte2_is_wired(opte2)) 3981 pmap->pm_stats.wired_count--; 3982 om = PHYS_TO_VM_PAGE(opa); 3983 if (om != NULL && (om->oflags & VPO_UNMANAGED) != 0) 3984 om = NULL; 3985 if (om != NULL) 3986 pv = pmap_pvh_remove(&om->md, pmap, va); 3987 3988 /* 3989 * Remove extra pte2 reference 3990 */ 3991 if (mpte2 != NULL) 3992 pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); 3993 } else 3994 pmap->pm_stats.resident_count++; 3995 3996 /* 3997 * Enter on the PV list if part of our managed memory. 3998 */ 3999 if ((m->oflags & VPO_UNMANAGED) == 0) { 4000 if (pv == NULL) { 4001 pv = get_pv_entry(pmap, FALSE); 4002 pv->pv_va = va; 4003 } 4004 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4005 } else if (pv != NULL) 4006 free_pv_entry(pmap, pv); 4007 4008 /* 4009 * Increment counters 4010 */ 4011 if (pte2_is_wired(npte2)) 4012 pmap->pm_stats.wired_count++; 4013 4014 validate: 4015 /* 4016 * Now validate mapping with desired protection/wiring. 4017 */ 4018 if (prot & VM_PROT_WRITE) { 4019 if ((m->oflags & VPO_UNMANAGED) == 0) 4020 vm_page_aflag_set(m, PGA_WRITEABLE); 4021 } 4022 4023 /* 4024 * If the mapping or permission bits are different, we need 4025 * to update the pte2. 4026 * 4027 * QQQ: Think again and again what to do 4028 * if the mapping is going to be changed! 4029 */ 4030 if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { 4031 /* 4032 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4033 * is set. 
Do it now, before the mapping is stored and made 4034 * valid for hardware table walk. If done later, there is a race 4035 * for other threads of current process in lazy loading case. 4036 * Don't do it for kernel memory which is mapped with exec 4037 * permission even if the memory isn't going to hold executable 4038 * code. The only time when icache sync is needed is after 4039 * kernel module is loaded and the relocation info is processed. 4040 * And it's done in elf_cpu_load_file(). 4041 * 4042 * QQQ: (1) Does it exist any better way where 4043 * or how to sync icache? 4044 * (2) Now, we do it on a page basis. 4045 */ 4046 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4047 m->md.pat_mode == VM_MEMATTR_WB_WA && 4048 (opa != pa || (opte2 & PTE2_NX))) 4049 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4050 4051 if (opte2 & PTE2_V) { 4052 /* Change mapping with break-before-make approach. */ 4053 opte2 = pte2_load_clear(pte2p); 4054 pmap_tlb_flush(pmap, va); 4055 pte2_store(pte2p, npte2); 4056 if (om != NULL) { 4057 KASSERT((om->oflags & VPO_UNMANAGED) == 0, 4058 ("%s: om %p unmanaged", __func__, om)); 4059 if ((opte2 & PTE2_A) != 0) 4060 vm_page_aflag_set(om, PGA_REFERENCED); 4061 if (pte2_is_dirty(opte2)) 4062 vm_page_dirty(om); 4063 if (TAILQ_EMPTY(&om->md.pv_list) && 4064 ((om->flags & PG_FICTITIOUS) != 0 || 4065 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4066 vm_page_aflag_clear(om, PGA_WRITEABLE); 4067 } 4068 } else 4069 pte2_store(pte2p, npte2); 4070 } 4071 #if 0 4072 else { 4073 /* 4074 * QQQ: In time when both access and not mofified bits are 4075 * emulated by software, this should not happen. Some 4076 * analysis is need, if this really happen. Missing 4077 * tlb flush somewhere could be the reason. 
4078 */ 4079 panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, 4080 va, opte2, npte2); 4081 } 4082 #endif 4083 4084 #if VM_NRESERVLEVEL > 0 4085 /* 4086 * If both the L2 page table page and the reservation are fully 4087 * populated, then attempt promotion. 4088 */ 4089 if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && 4090 sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && 4091 vm_reserv_level_iffullpop(m) == 0) 4092 pmap_promote_pte1(pmap, pte1p, va); 4093 #endif 4094 4095 rv = KERN_SUCCESS; 4096 out: 4097 sched_unpin(); 4098 rw_wunlock(&pvh_global_lock); 4099 PMAP_UNLOCK(pmap); 4100 return (rv); 4101 } 4102 4103 /* 4104 * Do the things to unmap a page in a process. 4105 */ 4106 static int 4107 pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, 4108 struct spglist *free) 4109 { 4110 pt2_entry_t opte2; 4111 vm_page_t m; 4112 4113 rw_assert(&pvh_global_lock, RA_WLOCKED); 4114 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4115 4116 /* Clear and invalidate the mapping. */ 4117 opte2 = pte2_load_clear(pte2p); 4118 pmap_tlb_flush(pmap, va); 4119 4120 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", 4121 __func__, pmap, va, opte2)); 4122 4123 if (opte2 & PTE2_W) 4124 pmap->pm_stats.wired_count -= 1; 4125 pmap->pm_stats.resident_count -= 1; 4126 if (pte2_is_managed(opte2)) { 4127 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 4128 if (pte2_is_dirty(opte2)) 4129 vm_page_dirty(m); 4130 if (opte2 & PTE2_A) 4131 vm_page_aflag_set(m, PGA_REFERENCED); 4132 pmap_remove_entry(pmap, m, va); 4133 } 4134 return (pmap_unuse_pt2(pmap, va, free)); 4135 } 4136 4137 /* 4138 * Remove a single page from a process address space. 
4139 */ 4140 static void 4141 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 4142 { 4143 pt2_entry_t *pte2p; 4144 4145 rw_assert(&pvh_global_lock, RA_WLOCKED); 4146 KASSERT(curthread->td_pinned > 0, 4147 ("%s: curthread not pinned", __func__)); 4148 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4149 if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || 4150 !pte2_is_valid(pte2_load(pte2p))) 4151 return; 4152 pmap_remove_pte2(pmap, pte2p, va, free); 4153 } 4154 4155 /* 4156 * Remove the given range of addresses from the specified map. 4157 * 4158 * It is assumed that the start and end are properly 4159 * rounded to the page size. 4160 */ 4161 void 4162 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4163 { 4164 vm_offset_t nextva; 4165 pt1_entry_t *pte1p, pte1; 4166 pt2_entry_t *pte2p, pte2; 4167 struct spglist free; 4168 4169 /* 4170 * Perform an unsynchronized read. This is, however, safe. 4171 */ 4172 if (pmap->pm_stats.resident_count == 0) 4173 return; 4174 4175 SLIST_INIT(&free); 4176 4177 rw_wlock(&pvh_global_lock); 4178 sched_pin(); 4179 PMAP_LOCK(pmap); 4180 4181 /* 4182 * Special handling of removing one page. A very common 4183 * operation and easy to short circuit some code. 4184 */ 4185 if (sva + PAGE_SIZE == eva) { 4186 pte1 = pte1_load(pmap_pte1(pmap, sva)); 4187 if (pte1_is_link(pte1)) { 4188 pmap_remove_page(pmap, sva, &free); 4189 goto out; 4190 } 4191 } 4192 4193 for (; sva < eva; sva = nextva) { 4194 /* 4195 * Calculate address for next L2 page table. 4196 */ 4197 nextva = pte1_trunc(sva + PTE1_SIZE); 4198 if (nextva < sva) 4199 nextva = eva; 4200 if (pmap->pm_stats.resident_count == 0) 4201 break; 4202 4203 pte1p = pmap_pte1(pmap, sva); 4204 pte1 = pte1_load(pte1p); 4205 4206 /* 4207 * Weed out invalid mappings. Note: we assume that the L1 page 4208 * table is always allocated, and in kernel virtual. 
4209 */ 4210 if (pte1 == 0) 4211 continue; 4212 4213 if (pte1_is_section(pte1)) { 4214 /* 4215 * Are we removing the entire large page? If not, 4216 * demote the mapping and fall through. 4217 */ 4218 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4219 pmap_remove_pte1(pmap, pte1p, sva, &free); 4220 continue; 4221 } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4222 /* The large page mapping was destroyed. */ 4223 continue; 4224 } 4225 #ifdef INVARIANTS 4226 else { 4227 /* Update pte1 after demotion. */ 4228 pte1 = pte1_load(pte1p); 4229 } 4230 #endif 4231 } 4232 4233 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4234 " is not link", __func__, pmap, sva, pte1, pte1p)); 4235 4236 /* 4237 * Limit our scan to either the end of the va represented 4238 * by the current L2 page table page, or to the end of the 4239 * range being removed. 4240 */ 4241 if (nextva > eva) 4242 nextva = eva; 4243 4244 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; 4245 pte2p++, sva += PAGE_SIZE) { 4246 pte2 = pte2_load(pte2p); 4247 if (!pte2_is_valid(pte2)) 4248 continue; 4249 if (pmap_remove_pte2(pmap, pte2p, sva, &free)) 4250 break; 4251 } 4252 } 4253 out: 4254 sched_unpin(); 4255 rw_wunlock(&pvh_global_lock); 4256 PMAP_UNLOCK(pmap); 4257 vm_page_free_pages_toq(&free, false); 4258 } 4259 4260 /* 4261 * Routine: pmap_remove_all 4262 * Function: 4263 * Removes this physical page from 4264 * all physical maps in which it resides. 4265 * Reflects back modify bits to the pager. 4266 * 4267 * Notes: 4268 * Original versions of this routine were very 4269 * inefficient because they iteratively called 4270 * pmap_remove (slow...) 
4271 */ 4272 4273 void 4274 pmap_remove_all(vm_page_t m) 4275 { 4276 struct md_page *pvh; 4277 pv_entry_t pv; 4278 pmap_t pmap; 4279 pt2_entry_t *pte2p, opte2; 4280 pt1_entry_t *pte1p; 4281 vm_offset_t va; 4282 struct spglist free; 4283 4284 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4285 ("%s: page %p is not managed", __func__, m)); 4286 SLIST_INIT(&free); 4287 rw_wlock(&pvh_global_lock); 4288 sched_pin(); 4289 if ((m->flags & PG_FICTITIOUS) != 0) 4290 goto small_mappings; 4291 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4292 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4293 va = pv->pv_va; 4294 pmap = PV_PMAP(pv); 4295 PMAP_LOCK(pmap); 4296 pte1p = pmap_pte1(pmap, va); 4297 (void)pmap_demote_pte1(pmap, pte1p, va); 4298 PMAP_UNLOCK(pmap); 4299 } 4300 small_mappings: 4301 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4302 pmap = PV_PMAP(pv); 4303 PMAP_LOCK(pmap); 4304 pmap->pm_stats.resident_count--; 4305 pte1p = pmap_pte1(pmap, pv->pv_va); 4306 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " 4307 "a 1mpage in page %p's pv list", __func__, m)); 4308 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 4309 opte2 = pte2_load_clear(pte2p); 4310 pmap_tlb_flush(pmap, pv->pv_va); 4311 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", 4312 __func__, pmap, pv->pv_va)); 4313 if (pte2_is_wired(opte2)) 4314 pmap->pm_stats.wired_count--; 4315 if (opte2 & PTE2_A) 4316 vm_page_aflag_set(m, PGA_REFERENCED); 4317 4318 /* 4319 * Update the vm_page_t clean and reference bits. 4320 */ 4321 if (pte2_is_dirty(opte2)) 4322 vm_page_dirty(m); 4323 pmap_unuse_pt2(pmap, pv->pv_va, &free); 4324 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4325 free_pv_entry(pmap, pv); 4326 PMAP_UNLOCK(pmap); 4327 } 4328 vm_page_aflag_clear(m, PGA_WRITEABLE); 4329 sched_unpin(); 4330 rw_wunlock(&pvh_global_lock); 4331 vm_page_free_pages_toq(&free, false); 4332 } 4333 4334 /* 4335 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4336 * good coding style, a.k.a. 
80 character line width limit hell. 4337 */ 4338 static __inline void 4339 pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, 4340 struct spglist *free) 4341 { 4342 vm_paddr_t pa; 4343 vm_page_t m, mt, mpt2pg; 4344 struct md_page *pvh; 4345 4346 pa = pte1_pa(pte1); 4347 m = PHYS_TO_VM_PAGE(pa); 4348 4349 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4350 __func__, m, m->phys_addr, pa)); 4351 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4352 m < &vm_page_array[vm_page_array_size], 4353 ("%s: bad pte1 %#x", __func__, pte1)); 4354 4355 if (pte1_is_dirty(pte1)) { 4356 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4357 vm_page_dirty(mt); 4358 } 4359 4360 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 4361 pvh = pa_to_pvh(pa); 4362 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4363 if (TAILQ_EMPTY(&pvh->pv_list)) { 4364 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4365 if (TAILQ_EMPTY(&mt->md.pv_list)) 4366 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4367 } 4368 mpt2pg = pmap_pt2_page(pmap, pv->pv_va); 4369 if (mpt2pg != NULL) 4370 pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); 4371 } 4372 4373 /* 4374 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4375 * good coding style, a.k.a. 80 character line width limit hell. 
4376 */ 4377 static __inline void 4378 pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, 4379 struct spglist *free) 4380 { 4381 vm_paddr_t pa; 4382 vm_page_t m; 4383 struct md_page *pvh; 4384 4385 pa = pte2_pa(pte2); 4386 m = PHYS_TO_VM_PAGE(pa); 4387 4388 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4389 __func__, m, m->phys_addr, pa)); 4390 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4391 m < &vm_page_array[vm_page_array_size], 4392 ("%s: bad pte2 %#x", __func__, pte2)); 4393 4394 if (pte2_is_dirty(pte2)) 4395 vm_page_dirty(m); 4396 4397 pmap->pm_stats.resident_count--; 4398 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4399 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 4400 pvh = pa_to_pvh(pa); 4401 if (TAILQ_EMPTY(&pvh->pv_list)) 4402 vm_page_aflag_clear(m, PGA_WRITEABLE); 4403 } 4404 pmap_unuse_pt2(pmap, pv->pv_va, free); 4405 } 4406 4407 /* 4408 * Remove all pages from specified address space this aids process 4409 * exit speeds. Also, this code is special cased for current process 4410 * only, but can have the more generic (and slightly slower) mode enabled. 4411 * This is much faster than pmap_remove in the case of running down 4412 * an entire address space. 4413 */ 4414 void 4415 pmap_remove_pages(pmap_t pmap) 4416 { 4417 pt1_entry_t *pte1p, pte1; 4418 pt2_entry_t *pte2p, pte2; 4419 pv_entry_t pv; 4420 struct pv_chunk *pc, *npc; 4421 struct spglist free; 4422 int field, idx; 4423 int32_t bit; 4424 uint32_t inuse, bitmask; 4425 boolean_t allfree; 4426 4427 /* 4428 * Assert that the given pmap is only active on the current 4429 * CPU. Unfortunately, we cannot block another CPU from 4430 * activating the pmap while this function is executing. 
4431 */ 4432 KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), 4433 ("%s: non-current pmap %p", __func__, pmap)); 4434 #if defined(SMP) && defined(INVARIANTS) 4435 { 4436 cpuset_t other_cpus; 4437 4438 sched_pin(); 4439 other_cpus = pmap->pm_active; 4440 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 4441 sched_unpin(); 4442 KASSERT(CPU_EMPTY(&other_cpus), 4443 ("%s: pmap %p active on other cpus", __func__, pmap)); 4444 } 4445 #endif 4446 SLIST_INIT(&free); 4447 rw_wlock(&pvh_global_lock); 4448 PMAP_LOCK(pmap); 4449 sched_pin(); 4450 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4451 KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", 4452 __func__, pmap, pc->pc_pmap)); 4453 allfree = TRUE; 4454 for (field = 0; field < _NPCM; field++) { 4455 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 4456 while (inuse != 0) { 4457 bit = ffs(inuse) - 1; 4458 bitmask = 1UL << bit; 4459 idx = field * 32 + bit; 4460 pv = &pc->pc_pventry[idx]; 4461 inuse &= ~bitmask; 4462 4463 /* 4464 * Note that we cannot remove wired pages 4465 * from a process' mapping at this time 4466 */ 4467 pte1p = pmap_pte1(pmap, pv->pv_va); 4468 pte1 = pte1_load(pte1p); 4469 if (pte1_is_section(pte1)) { 4470 if (pte1_is_wired(pte1)) { 4471 allfree = FALSE; 4472 continue; 4473 } 4474 pte1_clear(pte1p); 4475 pmap_remove_pte1_quick(pmap, pte1, pv, 4476 &free); 4477 } 4478 else if (pte1_is_link(pte1)) { 4479 pte2p = pt2map_entry(pv->pv_va); 4480 pte2 = pte2_load(pte2p); 4481 4482 if (!pte2_is_valid(pte2)) { 4483 printf("%s: pmap %p va %#x " 4484 "pte2 %#x\n", __func__, 4485 pmap, pv->pv_va, pte2); 4486 panic("bad pte2"); 4487 } 4488 4489 if (pte2_is_wired(pte2)) { 4490 allfree = FALSE; 4491 continue; 4492 } 4493 pte2_clear(pte2p); 4494 pmap_remove_pte2_quick(pmap, pte2, pv, 4495 &free); 4496 } else { 4497 printf("%s: pmap %p va %#x pte1 %#x\n", 4498 __func__, pmap, pv->pv_va, pte1); 4499 panic("bad pte1"); 4500 } 4501 4502 /* Mark free */ 4503 PV_STAT(pv_entry_frees++); 4504 
PV_STAT(pv_entry_spare++); 4505 pv_entry_count--; 4506 pc->pc_map[field] |= bitmask; 4507 } 4508 } 4509 if (allfree) { 4510 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4511 free_pv_chunk(pc); 4512 } 4513 } 4514 tlb_flush_all_ng_local(); 4515 sched_unpin(); 4516 rw_wunlock(&pvh_global_lock); 4517 PMAP_UNLOCK(pmap); 4518 vm_page_free_pages_toq(&free, false); 4519 } 4520 4521 /* 4522 * This code makes some *MAJOR* assumptions: 4523 * 1. Current pmap & pmap exists. 4524 * 2. Not wired. 4525 * 3. Read access. 4526 * 4. No L2 page table pages. 4527 * but is *MUCH* faster than pmap_enter... 4528 */ 4529 static vm_page_t 4530 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4531 vm_prot_t prot, vm_page_t mpt2pg) 4532 { 4533 pt2_entry_t *pte2p, pte2; 4534 vm_paddr_t pa; 4535 struct spglist free; 4536 uint32_t l2prot; 4537 4538 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 4539 (m->oflags & VPO_UNMANAGED) != 0, 4540 ("%s: managed mapping within the clean submap", __func__)); 4541 rw_assert(&pvh_global_lock, RA_WLOCKED); 4542 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4543 4544 /* 4545 * In the case that a L2 page table page is not 4546 * resident, we are creating it here. 4547 */ 4548 if (va < VM_MAXUSER_ADDRESS) { 4549 u_int pte1_idx; 4550 pt1_entry_t pte1, *pte1p; 4551 vm_paddr_t pt2_pa; 4552 4553 /* 4554 * Get L1 page table things. 4555 */ 4556 pte1_idx = pte1_index(va); 4557 pte1p = pmap_pte1(pmap, va); 4558 pte1 = pte1_load(pte1p); 4559 4560 if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { 4561 /* 4562 * Each of NPT2_IN_PG L2 page tables on the page can 4563 * come here. Make sure that associated L1 page table 4564 * link is established. 4565 * 4566 * QQQ: It comes that we don't establish all links to 4567 * L2 page tables for newly allocated L2 page 4568 * tables page. 
4569 */ 4570 KASSERT(!pte1_is_section(pte1), 4571 ("%s: pte1 %#x is section", __func__, pte1)); 4572 if (!pte1_is_link(pte1)) { 4573 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), 4574 pte1_idx); 4575 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 4576 } 4577 pt2_wirecount_inc(mpt2pg, pte1_idx); 4578 } else { 4579 /* 4580 * If the L2 page table page is mapped, we just 4581 * increment the hold count, and activate it. 4582 */ 4583 if (pte1_is_section(pte1)) { 4584 return (NULL); 4585 } else if (pte1_is_link(pte1)) { 4586 mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 4587 pt2_wirecount_inc(mpt2pg, pte1_idx); 4588 } else { 4589 mpt2pg = _pmap_allocpte2(pmap, va, 4590 PMAP_ENTER_NOSLEEP); 4591 if (mpt2pg == NULL) 4592 return (NULL); 4593 } 4594 } 4595 } else { 4596 mpt2pg = NULL; 4597 } 4598 4599 /* 4600 * This call to pt2map_entry() makes the assumption that we are 4601 * entering the page into the current pmap. In order to support 4602 * quick entry into any pmap, one would likely use pmap_pte2_quick(). 4603 * But that isn't as quick as pt2map_entry(). 4604 */ 4605 pte2p = pt2map_entry(va); 4606 pte2 = pte2_load(pte2p); 4607 if (pte2_is_valid(pte2)) { 4608 if (mpt2pg != NULL) { 4609 /* 4610 * Remove extra pte2 reference 4611 */ 4612 pt2_wirecount_dec(mpt2pg, pte1_index(va)); 4613 mpt2pg = NULL; 4614 } 4615 return (NULL); 4616 } 4617 4618 /* 4619 * Enter on the PV list if part of our managed memory. 
4620 */ 4621 if ((m->oflags & VPO_UNMANAGED) == 0 && 4622 !pmap_try_insert_pv_entry(pmap, va, m)) { 4623 if (mpt2pg != NULL) { 4624 SLIST_INIT(&free); 4625 if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { 4626 pmap_tlb_flush(pmap, va); 4627 vm_page_free_pages_toq(&free, false); 4628 } 4629 4630 mpt2pg = NULL; 4631 } 4632 return (NULL); 4633 } 4634 4635 /* 4636 * Increment counters 4637 */ 4638 pmap->pm_stats.resident_count++; 4639 4640 /* 4641 * Now validate mapping with RO protection 4642 */ 4643 pa = VM_PAGE_TO_PHYS(m); 4644 l2prot = PTE2_RO | PTE2_NM; 4645 if (va < VM_MAXUSER_ADDRESS) 4646 l2prot |= PTE2_U | PTE2_NG; 4647 if ((prot & VM_PROT_EXECUTE) == 0) 4648 l2prot |= PTE2_NX; 4649 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { 4650 /* 4651 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4652 * is set. QQQ: For more info, see comments in pmap_enter(). 4653 */ 4654 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4655 } 4656 pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); 4657 4658 return (mpt2pg); 4659 } 4660 4661 void 4662 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4663 { 4664 4665 rw_wlock(&pvh_global_lock); 4666 PMAP_LOCK(pmap); 4667 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 4668 rw_wunlock(&pvh_global_lock); 4669 PMAP_UNLOCK(pmap); 4670 } 4671 4672 /* 4673 * Tries to create a read- and/or execute-only 1 MB page mapping. Returns 4674 * true if successful. Returns false if (1) a mapping already exists at the 4675 * specified virtual address or (2) a PV entry cannot be allocated without 4676 * reclaiming another PV entry. 
4677 */ 4678 static bool 4679 pmap_enter_1mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4680 { 4681 pt1_entry_t pte1; 4682 vm_paddr_t pa; 4683 4684 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4685 pa = VM_PAGE_TO_PHYS(m); 4686 pte1 = PTE1(pa, PTE1_NM | PTE1_RO, ATTR_TO_L1(vm_page_pte2_attr(m))); 4687 if ((prot & VM_PROT_EXECUTE) == 0) 4688 pte1 |= PTE1_NX; 4689 if (va < VM_MAXUSER_ADDRESS) 4690 pte1 |= PTE1_U; 4691 if (pmap != kernel_pmap) 4692 pte1 |= PTE1_NG; 4693 return (pmap_enter_pte1(pmap, va, pte1, PMAP_ENTER_NOSLEEP | 4694 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m) == KERN_SUCCESS); 4695 } 4696 4697 /* 4698 * Tries to create the specified 1 MB page mapping. Returns KERN_SUCCESS if 4699 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 4700 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 4701 * a mapping already exists at the specified virtual address. Returns 4702 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NORECLAIM was specified and PV entry 4703 * allocation failed. 4704 */ 4705 static int 4706 pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags, 4707 vm_page_t m) 4708 { 4709 struct spglist free; 4710 pt1_entry_t opte1, *pte1p; 4711 pt2_entry_t pte2, *pte2p; 4712 vm_offset_t cur, end; 4713 vm_page_t mt; 4714 4715 rw_assert(&pvh_global_lock, RA_WLOCKED); 4716 KASSERT((pte1 & (PTE1_NM | PTE1_RO)) == 0 || 4717 (pte1 & (PTE1_NM | PTE1_RO)) == (PTE1_NM | PTE1_RO), 4718 ("%s: pte1 has inconsistent NM and RO attributes", __func__)); 4719 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4720 pte1p = pmap_pte1(pmap, va); 4721 opte1 = pte1_load(pte1p); 4722 if (pte1_is_valid(opte1)) { 4723 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 4724 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4725 __func__, va, pmap); 4726 return (KERN_FAILURE); 4727 } 4728 /* Break the existing mapping(s). 
*/ 4729 SLIST_INIT(&free); 4730 if (pte1_is_section(opte1)) { 4731 /* 4732 * If the section resulted from a promotion, then a 4733 * reserved PT page could be freed. 4734 */ 4735 pmap_remove_pte1(pmap, pte1p, va, &free); 4736 } else { 4737 sched_pin(); 4738 end = va + PTE1_SIZE; 4739 for (cur = va, pte2p = pmap_pte2_quick(pmap, va); 4740 cur != end; cur += PAGE_SIZE, pte2p++) { 4741 pte2 = pte2_load(pte2p); 4742 if (!pte2_is_valid(pte2)) 4743 continue; 4744 if (pmap_remove_pte2(pmap, pte2p, cur, &free)) 4745 break; 4746 } 4747 sched_unpin(); 4748 } 4749 vm_page_free_pages_toq(&free, false); 4750 } 4751 if ((m->oflags & VPO_UNMANAGED) == 0) { 4752 /* 4753 * Abort this mapping if its PV entry could not be created. 4754 */ 4755 if (!pmap_pv_insert_pte1(pmap, va, pte1, flags)) { 4756 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4757 __func__, va, pmap); 4758 return (KERN_RESOURCE_SHORTAGE); 4759 } 4760 if ((pte1 & PTE1_RO) == 0) { 4761 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4762 vm_page_aflag_set(mt, PGA_WRITEABLE); 4763 } 4764 } 4765 4766 /* 4767 * Increment counters. 4768 */ 4769 if (pte1_is_wired(pte1)) 4770 pmap->pm_stats.wired_count += PTE1_SIZE / PAGE_SIZE; 4771 pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; 4772 4773 /* 4774 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4775 * is set. QQQ: For more info, see comments in pmap_enter(). 4776 */ 4777 if ((pte1 & PTE1_NX) == 0 && m->md.pat_mode == VM_MEMATTR_WB_WA && 4778 pmap != kernel_pmap && (!pte1_is_section(opte1) || 4779 pte1_pa(opte1) != VM_PAGE_TO_PHYS(m) || (opte1 & PTE2_NX) != 0)) 4780 cache_icache_sync_fresh(va, VM_PAGE_TO_PHYS(m), PTE1_SIZE); 4781 4782 /* 4783 * Map the section. 4784 */ 4785 pte1_store(pte1p, pte1); 4786 4787 pmap_pte1_mappings++; 4788 CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, 4789 pmap); 4790 return (KERN_SUCCESS); 4791 } 4792 4793 /* 4794 * Maps a sequence of resident pages belonging to the same object. 
4795 * The sequence begins with the given page m_start. This page is 4796 * mapped at the given virtual address start. Each subsequent page is 4797 * mapped at a virtual address that is offset from start by the same 4798 * amount as the page is offset from m_start within the object. The 4799 * last page in the sequence is the page with the largest offset from 4800 * m_start that can be mapped at a virtual address less than the given 4801 * virtual address end. Not every virtual page between start and end 4802 * is mapped; only those for which a resident page exists with the 4803 * corresponding offset from m_start are mapped. 4804 */ 4805 void 4806 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4807 vm_page_t m_start, vm_prot_t prot) 4808 { 4809 vm_offset_t va; 4810 vm_page_t m, mpt2pg; 4811 vm_pindex_t diff, psize; 4812 4813 PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", 4814 __func__, pmap, start, end, m_start, prot)); 4815 4816 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4817 psize = atop(end - start); 4818 mpt2pg = NULL; 4819 m = m_start; 4820 rw_wlock(&pvh_global_lock); 4821 PMAP_LOCK(pmap); 4822 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4823 va = start + ptoa(diff); 4824 if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && 4825 m->psind == 1 && sp_enabled && 4826 pmap_enter_1mpage(pmap, va, m, prot)) 4827 m = &m[PTE1_SIZE / PAGE_SIZE - 1]; 4828 else 4829 mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, 4830 mpt2pg); 4831 m = TAILQ_NEXT(m, listq); 4832 } 4833 rw_wunlock(&pvh_global_lock); 4834 PMAP_UNLOCK(pmap); 4835 } 4836 4837 /* 4838 * This code maps large physical mmap regions into the 4839 * processor address space. Note that some shortcuts 4840 * are taken, but the code works. 
4841 */ 4842 void 4843 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4844 vm_pindex_t pindex, vm_size_t size) 4845 { 4846 pt1_entry_t *pte1p; 4847 vm_paddr_t pa, pte2_pa; 4848 vm_page_t p; 4849 vm_memattr_t pat_mode; 4850 u_int l1attr, l1prot; 4851 4852 VM_OBJECT_ASSERT_WLOCKED(object); 4853 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4854 ("%s: non-device object", __func__)); 4855 if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { 4856 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4857 return; 4858 p = vm_page_lookup(object, pindex); 4859 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4860 ("%s: invalid page %p", __func__, p)); 4861 pat_mode = p->md.pat_mode; 4862 4863 /* 4864 * Abort the mapping if the first page is not physically 4865 * aligned to a 1MB page boundary. 4866 */ 4867 pte2_pa = VM_PAGE_TO_PHYS(p); 4868 if (pte2_pa & PTE1_OFFSET) 4869 return; 4870 4871 /* 4872 * Skip the first page. Abort the mapping if the rest of 4873 * the pages are not physically contiguous or have differing 4874 * memory attributes. 4875 */ 4876 p = TAILQ_NEXT(p, listq); 4877 for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; 4878 pa += PAGE_SIZE) { 4879 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4880 ("%s: invalid page %p", __func__, p)); 4881 if (pa != VM_PAGE_TO_PHYS(p) || 4882 pat_mode != p->md.pat_mode) 4883 return; 4884 p = TAILQ_NEXT(p, listq); 4885 } 4886 4887 /* 4888 * Map using 1MB pages. 4889 * 4890 * QQQ: Well, we are mapping a section, so same condition must 4891 * be hold like during promotion. It looks that only RW mapping 4892 * is done here, so readonly mapping must be done elsewhere. 
4893 */ 4894 l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; 4895 l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); 4896 PMAP_LOCK(pmap); 4897 for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { 4898 pte1p = pmap_pte1(pmap, addr); 4899 if (!pte1_is_valid(pte1_load(pte1p))) { 4900 pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); 4901 pmap->pm_stats.resident_count += PTE1_SIZE / 4902 PAGE_SIZE; 4903 pmap_pte1_mappings++; 4904 } 4905 /* Else continue on if the PTE1 is already valid. */ 4906 addr += PTE1_SIZE; 4907 } 4908 PMAP_UNLOCK(pmap); 4909 } 4910 } 4911 4912 /* 4913 * Do the things to protect a 1mpage in a process. 4914 */ 4915 static void 4916 pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 4917 vm_prot_t prot) 4918 { 4919 pt1_entry_t npte1, opte1; 4920 vm_offset_t eva, va; 4921 vm_page_t m; 4922 4923 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4924 KASSERT((sva & PTE1_OFFSET) == 0, 4925 ("%s: sva is not 1mpage aligned", __func__)); 4926 4927 opte1 = npte1 = pte1_load(pte1p); 4928 if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) { 4929 eva = sva + PTE1_SIZE; 4930 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 4931 va < eva; va += PAGE_SIZE, m++) 4932 vm_page_dirty(m); 4933 } 4934 if ((prot & VM_PROT_WRITE) == 0) 4935 npte1 |= PTE1_RO | PTE1_NM; 4936 if ((prot & VM_PROT_EXECUTE) == 0) 4937 npte1 |= PTE1_NX; 4938 4939 /* 4940 * QQQ: Herein, execute permission is never set. 4941 * It only can be cleared. So, no icache 4942 * syncing is needed. 4943 */ 4944 4945 if (npte1 != opte1) { 4946 pte1_store(pte1p, npte1); 4947 pmap_tlb_flush(pmap, sva); 4948 } 4949 } 4950 4951 /* 4952 * Set the physical protection on the 4953 * specified range of this map as requested. 
4954 */ 4955 void 4956 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4957 { 4958 boolean_t pv_lists_locked; 4959 vm_offset_t nextva; 4960 pt1_entry_t *pte1p, pte1; 4961 pt2_entry_t *pte2p, opte2, npte2; 4962 4963 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4964 if (prot == VM_PROT_NONE) { 4965 pmap_remove(pmap, sva, eva); 4966 return; 4967 } 4968 4969 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 4970 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 4971 return; 4972 4973 if (pmap_is_current(pmap)) 4974 pv_lists_locked = FALSE; 4975 else { 4976 pv_lists_locked = TRUE; 4977 resume: 4978 rw_wlock(&pvh_global_lock); 4979 sched_pin(); 4980 } 4981 4982 PMAP_LOCK(pmap); 4983 for (; sva < eva; sva = nextva) { 4984 /* 4985 * Calculate address for next L2 page table. 4986 */ 4987 nextva = pte1_trunc(sva + PTE1_SIZE); 4988 if (nextva < sva) 4989 nextva = eva; 4990 4991 pte1p = pmap_pte1(pmap, sva); 4992 pte1 = pte1_load(pte1p); 4993 4994 /* 4995 * Weed out invalid mappings. Note: we assume that L1 page 4996 * page table is always allocated, and in kernel virtual. 4997 */ 4998 if (pte1 == 0) 4999 continue; 5000 5001 if (pte1_is_section(pte1)) { 5002 /* 5003 * Are we protecting the entire large page? If not, 5004 * demote the mapping and fall through. 5005 */ 5006 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 5007 pmap_protect_pte1(pmap, pte1p, sva, prot); 5008 continue; 5009 } else { 5010 if (!pv_lists_locked) { 5011 pv_lists_locked = TRUE; 5012 if (!rw_try_wlock(&pvh_global_lock)) { 5013 PMAP_UNLOCK(pmap); 5014 goto resume; 5015 } 5016 sched_pin(); 5017 } 5018 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5019 /* 5020 * The large page mapping 5021 * was destroyed. 
5022 */ 5023 continue; 5024 } 5025 #ifdef INVARIANTS 5026 else { 5027 /* Update pte1 after demotion */ 5028 pte1 = pte1_load(pte1p); 5029 } 5030 #endif 5031 } 5032 } 5033 5034 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 5035 " is not link", __func__, pmap, sva, pte1, pte1p)); 5036 5037 /* 5038 * Limit our scan to either the end of the va represented 5039 * by the current L2 page table page, or to the end of the 5040 * range being protected. 5041 */ 5042 if (nextva > eva) 5043 nextva = eva; 5044 5045 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5046 sva += PAGE_SIZE) { 5047 vm_page_t m; 5048 5049 opte2 = npte2 = pte2_load(pte2p); 5050 if (!pte2_is_valid(opte2)) 5051 continue; 5052 5053 if ((prot & VM_PROT_WRITE) == 0) { 5054 if (pte2_is_managed(opte2) && 5055 pte2_is_dirty(opte2)) { 5056 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 5057 vm_page_dirty(m); 5058 } 5059 npte2 |= PTE2_RO | PTE2_NM; 5060 } 5061 5062 if ((prot & VM_PROT_EXECUTE) == 0) 5063 npte2 |= PTE2_NX; 5064 5065 /* 5066 * QQQ: Herein, execute permission is never set. 5067 * It only can be cleared. So, no icache 5068 * syncing is needed. 5069 */ 5070 5071 if (npte2 != opte2) { 5072 pte2_store(pte2p, npte2); 5073 pmap_tlb_flush(pmap, sva); 5074 } 5075 } 5076 } 5077 if (pv_lists_locked) { 5078 sched_unpin(); 5079 rw_wunlock(&pvh_global_lock); 5080 } 5081 PMAP_UNLOCK(pmap); 5082 } 5083 5084 /* 5085 * pmap_pvh_wired_mappings: 5086 * 5087 * Return the updated number "count" of managed mappings that are wired. 
5088 */ 5089 static int 5090 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 5091 { 5092 pmap_t pmap; 5093 pt1_entry_t pte1; 5094 pt2_entry_t pte2; 5095 pv_entry_t pv; 5096 5097 rw_assert(&pvh_global_lock, RA_WLOCKED); 5098 sched_pin(); 5099 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5100 pmap = PV_PMAP(pv); 5101 PMAP_LOCK(pmap); 5102 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5103 if (pte1_is_section(pte1)) { 5104 if (pte1_is_wired(pte1)) 5105 count++; 5106 } else { 5107 KASSERT(pte1_is_link(pte1), 5108 ("%s: pte1 %#x is not link", __func__, pte1)); 5109 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5110 if (pte2_is_wired(pte2)) 5111 count++; 5112 } 5113 PMAP_UNLOCK(pmap); 5114 } 5115 sched_unpin(); 5116 return (count); 5117 } 5118 5119 /* 5120 * pmap_page_wired_mappings: 5121 * 5122 * Return the number of managed mappings to the given physical page 5123 * that are wired. 5124 */ 5125 int 5126 pmap_page_wired_mappings(vm_page_t m) 5127 { 5128 int count; 5129 5130 count = 0; 5131 if ((m->oflags & VPO_UNMANAGED) != 0) 5132 return (count); 5133 rw_wlock(&pvh_global_lock); 5134 count = pmap_pvh_wired_mappings(&m->md, count); 5135 if ((m->flags & PG_FICTITIOUS) == 0) { 5136 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 5137 count); 5138 } 5139 rw_wunlock(&pvh_global_lock); 5140 return (count); 5141 } 5142 5143 /* 5144 * Returns TRUE if any of the given mappings were used to modify 5145 * physical memory. Otherwise, returns FALSE. Both page and 1mpage 5146 * mappings are supported. 
5147 */ 5148 static boolean_t 5149 pmap_is_modified_pvh(struct md_page *pvh) 5150 { 5151 pv_entry_t pv; 5152 pt1_entry_t pte1; 5153 pt2_entry_t pte2; 5154 pmap_t pmap; 5155 boolean_t rv; 5156 5157 rw_assert(&pvh_global_lock, RA_WLOCKED); 5158 rv = FALSE; 5159 sched_pin(); 5160 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5161 pmap = PV_PMAP(pv); 5162 PMAP_LOCK(pmap); 5163 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5164 if (pte1_is_section(pte1)) { 5165 rv = pte1_is_dirty(pte1); 5166 } else { 5167 KASSERT(pte1_is_link(pte1), 5168 ("%s: pte1 %#x is not link", __func__, pte1)); 5169 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5170 rv = pte2_is_dirty(pte2); 5171 } 5172 PMAP_UNLOCK(pmap); 5173 if (rv) 5174 break; 5175 } 5176 sched_unpin(); 5177 return (rv); 5178 } 5179 5180 /* 5181 * pmap_is_modified: 5182 * 5183 * Return whether or not the specified physical page was modified 5184 * in any physical maps. 5185 */ 5186 boolean_t 5187 pmap_is_modified(vm_page_t m) 5188 { 5189 boolean_t rv; 5190 5191 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5192 ("%s: page %p is not managed", __func__, m)); 5193 5194 /* 5195 * If the page is not busied then this check is racy. 5196 */ 5197 if (!pmap_page_is_write_mapped(m)) 5198 return (FALSE); 5199 rw_wlock(&pvh_global_lock); 5200 rv = pmap_is_modified_pvh(&m->md) || 5201 ((m->flags & PG_FICTITIOUS) == 0 && 5202 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5203 rw_wunlock(&pvh_global_lock); 5204 return (rv); 5205 } 5206 5207 /* 5208 * pmap_is_prefaultable: 5209 * 5210 * Return whether or not the specified virtual address is eligible 5211 * for prefault. 
5212 */ 5213 boolean_t 5214 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5215 { 5216 pt1_entry_t pte1; 5217 pt2_entry_t pte2; 5218 boolean_t rv; 5219 5220 rv = FALSE; 5221 PMAP_LOCK(pmap); 5222 pte1 = pte1_load(pmap_pte1(pmap, addr)); 5223 if (pte1_is_link(pte1)) { 5224 pte2 = pte2_load(pt2map_entry(addr)); 5225 rv = !pte2_is_valid(pte2) ; 5226 } 5227 PMAP_UNLOCK(pmap); 5228 return (rv); 5229 } 5230 5231 /* 5232 * Returns TRUE if any of the given mappings were referenced and FALSE 5233 * otherwise. Both page and 1mpage mappings are supported. 5234 */ 5235 static boolean_t 5236 pmap_is_referenced_pvh(struct md_page *pvh) 5237 { 5238 5239 pv_entry_t pv; 5240 pt1_entry_t pte1; 5241 pt2_entry_t pte2; 5242 pmap_t pmap; 5243 boolean_t rv; 5244 5245 rw_assert(&pvh_global_lock, RA_WLOCKED); 5246 rv = FALSE; 5247 sched_pin(); 5248 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5249 pmap = PV_PMAP(pv); 5250 PMAP_LOCK(pmap); 5251 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5252 if (pte1_is_section(pte1)) { 5253 rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); 5254 } else { 5255 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5256 rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); 5257 } 5258 PMAP_UNLOCK(pmap); 5259 if (rv) 5260 break; 5261 } 5262 sched_unpin(); 5263 return (rv); 5264 } 5265 5266 /* 5267 * pmap_is_referenced: 5268 * 5269 * Return whether or not the specified physical page was referenced 5270 * in any physical maps. 
5271 */ 5272 boolean_t 5273 pmap_is_referenced(vm_page_t m) 5274 { 5275 boolean_t rv; 5276 5277 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5278 ("%s: page %p is not managed", __func__, m)); 5279 rw_wlock(&pvh_global_lock); 5280 rv = pmap_is_referenced_pvh(&m->md) || 5281 ((m->flags & PG_FICTITIOUS) == 0 && 5282 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5283 rw_wunlock(&pvh_global_lock); 5284 return (rv); 5285 } 5286 5287 /* 5288 * pmap_ts_referenced: 5289 * 5290 * Return a count of reference bits for a page, clearing those bits. 5291 * It is not necessary for every reference bit to be cleared, but it 5292 * is necessary that 0 only be returned when there are truly no 5293 * reference bits set. 5294 * 5295 * As an optimization, update the page's dirty field if a modified bit is 5296 * found while counting reference bits. This opportunistic update can be 5297 * performed at low cost and can eliminate the need for some future calls 5298 * to pmap_is_modified(). However, since this function stops after 5299 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5300 * dirty pages. Those dirty pages will only be detected by a future call 5301 * to pmap_is_modified(). 
5302 */ 5303 int 5304 pmap_ts_referenced(vm_page_t m) 5305 { 5306 struct md_page *pvh; 5307 pv_entry_t pv, pvf; 5308 pmap_t pmap; 5309 pt1_entry_t *pte1p, opte1; 5310 pt2_entry_t *pte2p, opte2; 5311 vm_paddr_t pa; 5312 int rtval = 0; 5313 5314 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5315 ("%s: page %p is not managed", __func__, m)); 5316 pa = VM_PAGE_TO_PHYS(m); 5317 pvh = pa_to_pvh(pa); 5318 rw_wlock(&pvh_global_lock); 5319 sched_pin(); 5320 if ((m->flags & PG_FICTITIOUS) != 0 || 5321 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5322 goto small_mappings; 5323 pv = pvf; 5324 do { 5325 pmap = PV_PMAP(pv); 5326 PMAP_LOCK(pmap); 5327 pte1p = pmap_pte1(pmap, pv->pv_va); 5328 opte1 = pte1_load(pte1p); 5329 if (pte1_is_dirty(opte1)) { 5330 /* 5331 * Although "opte1" is mapping a 1MB page, because 5332 * this function is called at a 4KB page granularity, 5333 * we only update the 4KB page under test. 5334 */ 5335 vm_page_dirty(m); 5336 } 5337 if ((opte1 & PTE1_A) != 0) { 5338 /* 5339 * Since this reference bit is shared by 256 4KB pages, 5340 * it should not be cleared every time it is tested. 5341 * Apply a simple "hash" function on the physical page 5342 * number, the virtual section number, and the pmap 5343 * address to select one 4KB page out of the 256 5344 * on which testing the reference bit will result 5345 * in clearing that bit. This function is designed 5346 * to avoid the selection of the same 4KB page 5347 * for every 1MB page mapping. 5348 * 5349 * On demotion, a mapping that hasn't been referenced 5350 * is simply destroyed. To avoid the possibility of a 5351 * subsequent page fault on a demoted wired mapping, 5352 * always leave its reference bit set. Moreover, 5353 * since the section is wired, the current state of 5354 * its reference bit won't affect page replacement. 
5355 */ 5356 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ 5357 (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && 5358 !pte1_is_wired(opte1)) { 5359 pte1_clear_bit(pte1p, PTE1_A); 5360 pmap_tlb_flush(pmap, pv->pv_va); 5361 } 5362 rtval++; 5363 } 5364 PMAP_UNLOCK(pmap); 5365 /* Rotate the PV list if it has more than one entry. */ 5366 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5367 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5368 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5369 } 5370 if (rtval >= PMAP_TS_REFERENCED_MAX) 5371 goto out; 5372 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5373 small_mappings: 5374 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5375 goto out; 5376 pv = pvf; 5377 do { 5378 pmap = PV_PMAP(pv); 5379 PMAP_LOCK(pmap); 5380 pte1p = pmap_pte1(pmap, pv->pv_va); 5381 KASSERT(pte1_is_link(pte1_load(pte1p)), 5382 ("%s: not found a link in page %p's pv list", __func__, m)); 5383 5384 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5385 opte2 = pte2_load(pte2p); 5386 if (pte2_is_dirty(opte2)) 5387 vm_page_dirty(m); 5388 if ((opte2 & PTE2_A) != 0) { 5389 pte2_clear_bit(pte2p, PTE2_A); 5390 pmap_tlb_flush(pmap, pv->pv_va); 5391 rtval++; 5392 } 5393 PMAP_UNLOCK(pmap); 5394 /* Rotate the PV list if it has more than one entry. */ 5395 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5396 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5397 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5398 } 5399 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5400 PMAP_TS_REFERENCED_MAX); 5401 out: 5402 sched_unpin(); 5403 rw_wunlock(&pvh_global_lock); 5404 return (rtval); 5405 } 5406 5407 /* 5408 * Clear the wired attribute from the mappings for the specified range of 5409 * addresses in the given pmap. Every valid mapping within that range 5410 * must have the wired attribute set. In contrast, invalid mappings 5411 * cannot have the wired attribute set, so they are ignored. 
5412 * 5413 * The wired attribute of the page table entry is not a hardware feature, 5414 * so there is no need to invalidate any TLB entries. 5415 */ 5416 void 5417 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5418 { 5419 vm_offset_t nextva; 5420 pt1_entry_t *pte1p, pte1; 5421 pt2_entry_t *pte2p, pte2; 5422 boolean_t pv_lists_locked; 5423 5424 if (pmap_is_current(pmap)) 5425 pv_lists_locked = FALSE; 5426 else { 5427 pv_lists_locked = TRUE; 5428 resume: 5429 rw_wlock(&pvh_global_lock); 5430 sched_pin(); 5431 } 5432 PMAP_LOCK(pmap); 5433 for (; sva < eva; sva = nextva) { 5434 nextva = pte1_trunc(sva + PTE1_SIZE); 5435 if (nextva < sva) 5436 nextva = eva; 5437 5438 pte1p = pmap_pte1(pmap, sva); 5439 pte1 = pte1_load(pte1p); 5440 5441 /* 5442 * Weed out invalid mappings. Note: we assume that L1 page 5443 * page table is always allocated, and in kernel virtual. 5444 */ 5445 if (pte1 == 0) 5446 continue; 5447 5448 if (pte1_is_section(pte1)) { 5449 if (!pte1_is_wired(pte1)) 5450 panic("%s: pte1 %#x not wired", __func__, pte1); 5451 5452 /* 5453 * Are we unwiring the entire large page? If not, 5454 * demote the mapping and fall through. 5455 */ 5456 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 5457 pte1_clear_bit(pte1p, PTE1_W); 5458 pmap->pm_stats.wired_count -= PTE1_SIZE / 5459 PAGE_SIZE; 5460 continue; 5461 } else { 5462 if (!pv_lists_locked) { 5463 pv_lists_locked = TRUE; 5464 if (!rw_try_wlock(&pvh_global_lock)) { 5465 PMAP_UNLOCK(pmap); 5466 /* Repeat sva. 
*/ 5467 goto resume; 5468 } 5469 sched_pin(); 5470 } 5471 if (!pmap_demote_pte1(pmap, pte1p, sva)) 5472 panic("%s: demotion failed", __func__); 5473 #ifdef INVARIANTS 5474 else { 5475 /* Update pte1 after demotion */ 5476 pte1 = pte1_load(pte1p); 5477 } 5478 #endif 5479 } 5480 } 5481 5482 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 5483 " is not link", __func__, pmap, sva, pte1, pte1p)); 5484 5485 /* 5486 * Limit our scan to either the end of the va represented 5487 * by the current L2 page table page, or to the end of the 5488 * range being protected. 5489 */ 5490 if (nextva > eva) 5491 nextva = eva; 5492 5493 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5494 sva += PAGE_SIZE) { 5495 pte2 = pte2_load(pte2p); 5496 if (!pte2_is_valid(pte2)) 5497 continue; 5498 if (!pte2_is_wired(pte2)) 5499 panic("%s: pte2 %#x is missing PTE2_W", 5500 __func__, pte2); 5501 5502 /* 5503 * PTE2_W must be cleared atomically. Although the pmap 5504 * lock synchronizes access to PTE2_W, another processor 5505 * could be changing PTE2_NM and/or PTE2_A concurrently. 5506 */ 5507 pte2_clear_bit(pte2p, PTE2_W); 5508 pmap->pm_stats.wired_count--; 5509 } 5510 } 5511 if (pv_lists_locked) { 5512 sched_unpin(); 5513 rw_wunlock(&pvh_global_lock); 5514 } 5515 PMAP_UNLOCK(pmap); 5516 } 5517 5518 /* 5519 * Clear the write and modified bits in each of the given page's mappings. 
5520 */ 5521 void 5522 pmap_remove_write(vm_page_t m) 5523 { 5524 struct md_page *pvh; 5525 pv_entry_t next_pv, pv; 5526 pmap_t pmap; 5527 pt1_entry_t *pte1p; 5528 pt2_entry_t *pte2p, opte2; 5529 vm_offset_t va; 5530 5531 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5532 ("%s: page %p is not managed", __func__, m)); 5533 vm_page_assert_busied(m); 5534 5535 if (!pmap_page_is_write_mapped(m)) 5536 return; 5537 rw_wlock(&pvh_global_lock); 5538 sched_pin(); 5539 if ((m->flags & PG_FICTITIOUS) != 0) 5540 goto small_mappings; 5541 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5542 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5543 va = pv->pv_va; 5544 pmap = PV_PMAP(pv); 5545 PMAP_LOCK(pmap); 5546 pte1p = pmap_pte1(pmap, va); 5547 if (!(pte1_load(pte1p) & PTE1_RO)) 5548 (void)pmap_demote_pte1(pmap, pte1p, va); 5549 PMAP_UNLOCK(pmap); 5550 } 5551 small_mappings: 5552 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5553 pmap = PV_PMAP(pv); 5554 PMAP_LOCK(pmap); 5555 pte1p = pmap_pte1(pmap, pv->pv_va); 5556 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5557 " a section in page %p's pv list", __func__, m)); 5558 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5559 opte2 = pte2_load(pte2p); 5560 if (!(opte2 & PTE2_RO)) { 5561 pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); 5562 if (pte2_is_dirty(opte2)) 5563 vm_page_dirty(m); 5564 pmap_tlb_flush(pmap, pv->pv_va); 5565 } 5566 PMAP_UNLOCK(pmap); 5567 } 5568 vm_page_aflag_clear(m, PGA_WRITEABLE); 5569 sched_unpin(); 5570 rw_wunlock(&pvh_global_lock); 5571 } 5572 5573 /* 5574 * Apply the given advice to the specified range of addresses within the 5575 * given pmap. Depending on the advice, clear the referenced and/or 5576 * modified flags in each mapping and set the mapped page's dirty field. 
5577 */ 5578 void 5579 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5580 { 5581 pt1_entry_t *pte1p, opte1; 5582 pt2_entry_t *pte2p, pte2; 5583 vm_offset_t pdnxt; 5584 vm_page_t m; 5585 boolean_t pv_lists_locked; 5586 5587 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5588 return; 5589 if (pmap_is_current(pmap)) 5590 pv_lists_locked = FALSE; 5591 else { 5592 pv_lists_locked = TRUE; 5593 resume: 5594 rw_wlock(&pvh_global_lock); 5595 sched_pin(); 5596 } 5597 PMAP_LOCK(pmap); 5598 for (; sva < eva; sva = pdnxt) { 5599 pdnxt = pte1_trunc(sva + PTE1_SIZE); 5600 if (pdnxt < sva) 5601 pdnxt = eva; 5602 pte1p = pmap_pte1(pmap, sva); 5603 opte1 = pte1_load(pte1p); 5604 if (!pte1_is_valid(opte1)) /* XXX */ 5605 continue; 5606 else if (pte1_is_section(opte1)) { 5607 if (!pte1_is_managed(opte1)) 5608 continue; 5609 if (!pv_lists_locked) { 5610 pv_lists_locked = TRUE; 5611 if (!rw_try_wlock(&pvh_global_lock)) { 5612 PMAP_UNLOCK(pmap); 5613 goto resume; 5614 } 5615 sched_pin(); 5616 } 5617 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5618 /* 5619 * The large page mapping was destroyed. 5620 */ 5621 continue; 5622 } 5623 5624 /* 5625 * Unless the page mappings are wired, remove the 5626 * mapping to a single page so that a subsequent 5627 * access may repromote. Since the underlying L2 page 5628 * table is fully populated, this removal never 5629 * frees a L2 page table page. 
5630 */ 5631 if (!pte1_is_wired(opte1)) { 5632 pte2p = pmap_pte2_quick(pmap, sva); 5633 KASSERT(pte2_is_valid(pte2_load(pte2p)), 5634 ("%s: invalid PTE2", __func__)); 5635 pmap_remove_pte2(pmap, pte2p, sva, NULL); 5636 } 5637 } 5638 if (pdnxt > eva) 5639 pdnxt = eva; 5640 for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, 5641 sva += PAGE_SIZE) { 5642 pte2 = pte2_load(pte2p); 5643 if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) 5644 continue; 5645 else if (pte2_is_dirty(pte2)) { 5646 if (advice == MADV_DONTNEED) { 5647 /* 5648 * Future calls to pmap_is_modified() 5649 * can be avoided by making the page 5650 * dirty now. 5651 */ 5652 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 5653 vm_page_dirty(m); 5654 } 5655 pte2_set_bit(pte2p, PTE2_NM); 5656 pte2_clear_bit(pte2p, PTE2_A); 5657 } else if ((pte2 & PTE2_A) != 0) 5658 pte2_clear_bit(pte2p, PTE2_A); 5659 else 5660 continue; 5661 pmap_tlb_flush(pmap, sva); 5662 } 5663 } 5664 if (pv_lists_locked) { 5665 sched_unpin(); 5666 rw_wunlock(&pvh_global_lock); 5667 } 5668 PMAP_UNLOCK(pmap); 5669 } 5670 5671 /* 5672 * Clear the modify bits on the specified physical page. 
5673 */ 5674 void 5675 pmap_clear_modify(vm_page_t m) 5676 { 5677 struct md_page *pvh; 5678 pv_entry_t next_pv, pv; 5679 pmap_t pmap; 5680 pt1_entry_t *pte1p, opte1; 5681 pt2_entry_t *pte2p, opte2; 5682 vm_offset_t va; 5683 5684 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5685 ("%s: page %p is not managed", __func__, m)); 5686 vm_page_assert_busied(m); 5687 5688 if (!pmap_page_is_write_mapped(m)) 5689 return; 5690 rw_wlock(&pvh_global_lock); 5691 sched_pin(); 5692 if ((m->flags & PG_FICTITIOUS) != 0) 5693 goto small_mappings; 5694 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5695 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5696 va = pv->pv_va; 5697 pmap = PV_PMAP(pv); 5698 PMAP_LOCK(pmap); 5699 pte1p = pmap_pte1(pmap, va); 5700 opte1 = pte1_load(pte1p); 5701 if (!(opte1 & PTE1_RO)) { 5702 if (pmap_demote_pte1(pmap, pte1p, va) && 5703 !pte1_is_wired(opte1)) { 5704 /* 5705 * Write protect the mapping to a 5706 * single page so that a subsequent 5707 * write access may repromote. 5708 */ 5709 va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); 5710 pte2p = pmap_pte2_quick(pmap, va); 5711 opte2 = pte2_load(pte2p); 5712 if ((opte2 & PTE2_V)) { 5713 pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); 5714 vm_page_dirty(m); 5715 pmap_tlb_flush(pmap, va); 5716 } 5717 } 5718 } 5719 PMAP_UNLOCK(pmap); 5720 } 5721 small_mappings: 5722 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5723 pmap = PV_PMAP(pv); 5724 PMAP_LOCK(pmap); 5725 pte1p = pmap_pte1(pmap, pv->pv_va); 5726 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5727 " a section in page %p's pv list", __func__, m)); 5728 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5729 if (pte2_is_dirty(pte2_load(pte2p))) { 5730 pte2_set_bit(pte2p, PTE2_NM); 5731 pmap_tlb_flush(pmap, pv->pv_va); 5732 } 5733 PMAP_UNLOCK(pmap); 5734 } 5735 sched_unpin(); 5736 rw_wunlock(&pvh_global_lock); 5737 } 5738 5739 /* 5740 * Sets the memory attribute for the specified page. 
5741 */ 5742 void 5743 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5744 { 5745 pt2_entry_t *cmap2_pte2p; 5746 vm_memattr_t oma; 5747 vm_paddr_t pa; 5748 struct pcpu *pc; 5749 5750 oma = m->md.pat_mode; 5751 m->md.pat_mode = ma; 5752 5753 CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, 5754 VM_PAGE_TO_PHYS(m), oma, ma); 5755 if ((m->flags & PG_FICTITIOUS) != 0) 5756 return; 5757 #if 0 5758 /* 5759 * If "m" is a normal page, flush it from the cache. 5760 * 5761 * First, try to find an existing mapping of the page by sf 5762 * buffer. sf_buf_invalidate_cache() modifies mapping and 5763 * flushes the cache. 5764 */ 5765 if (sf_buf_invalidate_cache(m, oma)) 5766 return; 5767 #endif 5768 /* 5769 * If page is not mapped by sf buffer, map the page 5770 * transient and do invalidation. 5771 */ 5772 if (ma != oma) { 5773 pa = VM_PAGE_TO_PHYS(m); 5774 sched_pin(); 5775 pc = get_pcpu(); 5776 cmap2_pte2p = pc->pc_cmap2_pte2p; 5777 mtx_lock(&pc->pc_cmap_lock); 5778 if (pte2_load(cmap2_pte2p) != 0) 5779 panic("%s: CMAP2 busy", __func__); 5780 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 5781 vm_memattr_to_pte2(ma))); 5782 dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); 5783 pte2_clear(cmap2_pte2p); 5784 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5785 sched_unpin(); 5786 mtx_unlock(&pc->pc_cmap_lock); 5787 } 5788 } 5789 5790 /* 5791 * Miscellaneous support routines follow 5792 */ 5793 5794 /* 5795 * Returns TRUE if the given page is mapped individually or as part of 5796 * a 1mpage. Otherwise, returns FALSE. 
5797 */ 5798 boolean_t 5799 pmap_page_is_mapped(vm_page_t m) 5800 { 5801 boolean_t rv; 5802 5803 if ((m->oflags & VPO_UNMANAGED) != 0) 5804 return (FALSE); 5805 rw_wlock(&pvh_global_lock); 5806 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5807 ((m->flags & PG_FICTITIOUS) == 0 && 5808 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5809 rw_wunlock(&pvh_global_lock); 5810 return (rv); 5811 } 5812 5813 /* 5814 * Returns true if the pmap's pv is one of the first 5815 * 16 pvs linked to from this page. This count may 5816 * be changed upwards or downwards in the future; it 5817 * is only necessary that true be returned for a small 5818 * subset of pmaps for proper page aging. 5819 */ 5820 boolean_t 5821 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5822 { 5823 struct md_page *pvh; 5824 pv_entry_t pv; 5825 int loops = 0; 5826 boolean_t rv; 5827 5828 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5829 ("%s: page %p is not managed", __func__, m)); 5830 rv = FALSE; 5831 rw_wlock(&pvh_global_lock); 5832 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5833 if (PV_PMAP(pv) == pmap) { 5834 rv = TRUE; 5835 break; 5836 } 5837 loops++; 5838 if (loops >= 16) 5839 break; 5840 } 5841 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5842 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5843 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5844 if (PV_PMAP(pv) == pmap) { 5845 rv = TRUE; 5846 break; 5847 } 5848 loops++; 5849 if (loops >= 16) 5850 break; 5851 } 5852 } 5853 rw_wunlock(&pvh_global_lock); 5854 return (rv); 5855 } 5856 5857 /* 5858 * pmap_zero_page zeros the specified hardware page by mapping 5859 * the page into KVM and using bzero to clear its contents. 
5860 */ 5861 void 5862 pmap_zero_page(vm_page_t m) 5863 { 5864 pt2_entry_t *cmap2_pte2p; 5865 struct pcpu *pc; 5866 5867 sched_pin(); 5868 pc = get_pcpu(); 5869 cmap2_pte2p = pc->pc_cmap2_pte2p; 5870 mtx_lock(&pc->pc_cmap_lock); 5871 if (pte2_load(cmap2_pte2p) != 0) 5872 panic("%s: CMAP2 busy", __func__); 5873 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5874 vm_page_pte2_attr(m))); 5875 pagezero(pc->pc_cmap2_addr); 5876 pte2_clear(cmap2_pte2p); 5877 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5878 sched_unpin(); 5879 mtx_unlock(&pc->pc_cmap_lock); 5880 } 5881 5882 /* 5883 * pmap_zero_page_area zeros the specified hardware page by mapping 5884 * the page into KVM and using bzero to clear its contents. 5885 * 5886 * off and size may not cover an area beyond a single hardware page. 5887 */ 5888 void 5889 pmap_zero_page_area(vm_page_t m, int off, int size) 5890 { 5891 pt2_entry_t *cmap2_pte2p; 5892 struct pcpu *pc; 5893 5894 sched_pin(); 5895 pc = get_pcpu(); 5896 cmap2_pte2p = pc->pc_cmap2_pte2p; 5897 mtx_lock(&pc->pc_cmap_lock); 5898 if (pte2_load(cmap2_pte2p) != 0) 5899 panic("%s: CMAP2 busy", __func__); 5900 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5901 vm_page_pte2_attr(m))); 5902 if (off == 0 && size == PAGE_SIZE) 5903 pagezero(pc->pc_cmap2_addr); 5904 else 5905 bzero(pc->pc_cmap2_addr + off, size); 5906 pte2_clear(cmap2_pte2p); 5907 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5908 sched_unpin(); 5909 mtx_unlock(&pc->pc_cmap_lock); 5910 } 5911 5912 /* 5913 * pmap_copy_page copies the specified (machine independent) 5914 * page by mapping the page into virtual memory and using 5915 * bcopy to copy the page, one machine dependent page at a 5916 * time. 
5917 */ 5918 void 5919 pmap_copy_page(vm_page_t src, vm_page_t dst) 5920 { 5921 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5922 struct pcpu *pc; 5923 5924 sched_pin(); 5925 pc = get_pcpu(); 5926 cmap1_pte2p = pc->pc_cmap1_pte2p; 5927 cmap2_pte2p = pc->pc_cmap2_pte2p; 5928 mtx_lock(&pc->pc_cmap_lock); 5929 if (pte2_load(cmap1_pte2p) != 0) 5930 panic("%s: CMAP1 busy", __func__); 5931 if (pte2_load(cmap2_pte2p) != 0) 5932 panic("%s: CMAP2 busy", __func__); 5933 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), 5934 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); 5935 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), 5936 PTE2_AP_KRW, vm_page_pte2_attr(dst))); 5937 bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); 5938 pte2_clear(cmap1_pte2p); 5939 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5940 pte2_clear(cmap2_pte2p); 5941 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5942 sched_unpin(); 5943 mtx_unlock(&pc->pc_cmap_lock); 5944 } 5945 5946 int unmapped_buf_allowed = 1; 5947 5948 void 5949 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5950 vm_offset_t b_offset, int xfersize) 5951 { 5952 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5953 vm_page_t a_pg, b_pg; 5954 char *a_cp, *b_cp; 5955 vm_offset_t a_pg_offset, b_pg_offset; 5956 struct pcpu *pc; 5957 int cnt; 5958 5959 sched_pin(); 5960 pc = get_pcpu(); 5961 cmap1_pte2p = pc->pc_cmap1_pte2p; 5962 cmap2_pte2p = pc->pc_cmap2_pte2p; 5963 mtx_lock(&pc->pc_cmap_lock); 5964 if (pte2_load(cmap1_pte2p) != 0) 5965 panic("pmap_copy_pages: CMAP1 busy"); 5966 if (pte2_load(cmap2_pte2p) != 0) 5967 panic("pmap_copy_pages: CMAP2 busy"); 5968 while (xfersize > 0) { 5969 a_pg = ma[a_offset >> PAGE_SHIFT]; 5970 a_pg_offset = a_offset & PAGE_MASK; 5971 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5972 b_pg = mb[b_offset >> PAGE_SHIFT]; 5973 b_pg_offset = b_offset & PAGE_MASK; 5974 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5975 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), 5976 PTE2_AP_KR 
| PTE2_NM, vm_page_pte2_attr(a_pg))); 5977 tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); 5978 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), 5979 PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); 5980 tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); 5981 a_cp = pc->pc_cmap1_addr + a_pg_offset; 5982 b_cp = pc->pc_cmap2_addr + b_pg_offset; 5983 bcopy(a_cp, b_cp, cnt); 5984 a_offset += cnt; 5985 b_offset += cnt; 5986 xfersize -= cnt; 5987 } 5988 pte2_clear(cmap1_pte2p); 5989 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5990 pte2_clear(cmap2_pte2p); 5991 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5992 sched_unpin(); 5993 mtx_unlock(&pc->pc_cmap_lock); 5994 } 5995 5996 vm_offset_t 5997 pmap_quick_enter_page(vm_page_t m) 5998 { 5999 struct pcpu *pc; 6000 pt2_entry_t *pte2p; 6001 6002 critical_enter(); 6003 pc = get_pcpu(); 6004 pte2p = pc->pc_qmap_pte2p; 6005 6006 KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); 6007 6008 pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6009 vm_page_pte2_attr(m))); 6010 return (pc->pc_qmap_addr); 6011 } 6012 6013 void 6014 pmap_quick_remove_page(vm_offset_t addr) 6015 { 6016 struct pcpu *pc; 6017 pt2_entry_t *pte2p; 6018 6019 pc = get_pcpu(); 6020 pte2p = pc->pc_qmap_pte2p; 6021 6022 KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); 6023 KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); 6024 6025 pte2_clear(pte2p); 6026 tlb_flush(pc->pc_qmap_addr); 6027 critical_exit(); 6028 } 6029 6030 /* 6031 * Copy the range specified by src_addr/len 6032 * from the source map to the range dst_addr/len 6033 * in the destination map. 6034 * 6035 * This routine is only advisory and need not do anything. 
6036 */ 6037 void 6038 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 6039 vm_offset_t src_addr) 6040 { 6041 struct spglist free; 6042 vm_offset_t addr; 6043 vm_offset_t end_addr = src_addr + len; 6044 vm_offset_t nextva; 6045 6046 if (dst_addr != src_addr) 6047 return; 6048 6049 if (!pmap_is_current(src_pmap)) 6050 return; 6051 6052 rw_wlock(&pvh_global_lock); 6053 if (dst_pmap < src_pmap) { 6054 PMAP_LOCK(dst_pmap); 6055 PMAP_LOCK(src_pmap); 6056 } else { 6057 PMAP_LOCK(src_pmap); 6058 PMAP_LOCK(dst_pmap); 6059 } 6060 sched_pin(); 6061 for (addr = src_addr; addr < end_addr; addr = nextva) { 6062 pt2_entry_t *src_pte2p, *dst_pte2p; 6063 vm_page_t dst_mpt2pg, src_mpt2pg; 6064 pt1_entry_t src_pte1; 6065 u_int pte1_idx; 6066 6067 KASSERT(addr < VM_MAXUSER_ADDRESS, 6068 ("%s: invalid to pmap_copy page tables", __func__)); 6069 6070 nextva = pte1_trunc(addr + PTE1_SIZE); 6071 if (nextva < addr) 6072 nextva = end_addr; 6073 6074 pte1_idx = pte1_index(addr); 6075 src_pte1 = src_pmap->pm_pt1[pte1_idx]; 6076 if (pte1_is_section(src_pte1)) { 6077 if ((addr & PTE1_OFFSET) != 0 || 6078 (addr + PTE1_SIZE) > end_addr) 6079 continue; 6080 if (dst_pmap->pm_pt1[pte1_idx] == 0 && 6081 (!pte1_is_managed(src_pte1) || 6082 pmap_pv_insert_pte1(dst_pmap, addr, src_pte1, 6083 PMAP_ENTER_NORECLAIM))) { 6084 dst_pmap->pm_pt1[pte1_idx] = src_pte1 & 6085 ~PTE1_W; 6086 dst_pmap->pm_stats.resident_count += 6087 PTE1_SIZE / PAGE_SIZE; 6088 pmap_pte1_mappings++; 6089 } 6090 continue; 6091 } else if (!pte1_is_link(src_pte1)) 6092 continue; 6093 6094 src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); 6095 6096 /* 6097 * We leave PT2s to be linked from PT1 even if they are not 6098 * referenced until all PT2s in a page are without reference. 6099 * 6100 * QQQ: It could be changed ... 
6101 */ 6102 #if 0 /* single_pt2_link_is_cleared */ 6103 KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, 6104 ("%s: source page table page is unused", __func__)); 6105 #else 6106 if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) 6107 continue; 6108 #endif 6109 if (nextva > end_addr) 6110 nextva = end_addr; 6111 6112 src_pte2p = pt2map_entry(addr); 6113 while (addr < nextva) { 6114 pt2_entry_t temp_pte2; 6115 temp_pte2 = pte2_load(src_pte2p); 6116 /* 6117 * we only virtual copy managed pages 6118 */ 6119 if (pte2_is_managed(temp_pte2)) { 6120 dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, 6121 PMAP_ENTER_NOSLEEP); 6122 if (dst_mpt2pg == NULL) 6123 goto out; 6124 dst_pte2p = pmap_pte2_quick(dst_pmap, addr); 6125 if (!pte2_is_valid(pte2_load(dst_pte2p)) && 6126 pmap_try_insert_pv_entry(dst_pmap, addr, 6127 PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { 6128 /* 6129 * Clear the wired, modified, and 6130 * accessed (referenced) bits 6131 * during the copy. 6132 */ 6133 temp_pte2 &= ~(PTE2_W | PTE2_A); 6134 temp_pte2 |= PTE2_NM; 6135 pte2_store(dst_pte2p, temp_pte2); 6136 dst_pmap->pm_stats.resident_count++; 6137 } else { 6138 SLIST_INIT(&free); 6139 if (pmap_unwire_pt2(dst_pmap, addr, 6140 dst_mpt2pg, &free)) { 6141 pmap_tlb_flush(dst_pmap, addr); 6142 vm_page_free_pages_toq(&free, 6143 false); 6144 } 6145 goto out; 6146 } 6147 if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= 6148 pt2_wirecount_get(src_mpt2pg, pte1_idx)) 6149 break; 6150 } 6151 addr += PAGE_SIZE; 6152 src_pte2p++; 6153 } 6154 } 6155 out: 6156 sched_unpin(); 6157 rw_wunlock(&pvh_global_lock); 6158 PMAP_UNLOCK(src_pmap); 6159 PMAP_UNLOCK(dst_pmap); 6160 } 6161 6162 /* 6163 * Increase the starting virtual address of the given mapping if a 6164 * different alignment might result in more section mappings. 
6165 */ 6166 void 6167 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6168 vm_offset_t *addr, vm_size_t size) 6169 { 6170 vm_offset_t pte1_offset; 6171 6172 if (size < PTE1_SIZE) 6173 return; 6174 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6175 offset += ptoa(object->pg_color); 6176 pte1_offset = offset & PTE1_OFFSET; 6177 if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || 6178 (*addr & PTE1_OFFSET) == pte1_offset) 6179 return; 6180 if ((*addr & PTE1_OFFSET) < pte1_offset) 6181 *addr = pte1_trunc(*addr) + pte1_offset; 6182 else 6183 *addr = pte1_roundup(*addr) + pte1_offset; 6184 } 6185 6186 void 6187 pmap_activate(struct thread *td) 6188 { 6189 pmap_t pmap, oldpmap; 6190 u_int cpuid, ttb; 6191 6192 PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); 6193 6194 critical_enter(); 6195 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6196 oldpmap = PCPU_GET(curpmap); 6197 cpuid = PCPU_GET(cpuid); 6198 6199 #if defined(SMP) 6200 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6201 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6202 #else 6203 CPU_CLR(cpuid, &oldpmap->pm_active); 6204 CPU_SET(cpuid, &pmap->pm_active); 6205 #endif 6206 6207 ttb = pmap_ttb_get(pmap); 6208 6209 /* 6210 * pmap_activate is for the current thread on the current cpu 6211 */ 6212 td->td_pcb->pcb_pagedir = ttb; 6213 cp15_ttbr_set(ttb); 6214 PCPU_SET(curpmap, pmap); 6215 critical_exit(); 6216 } 6217 6218 /* 6219 * Perform the pmap work for mincore(2). If the page is not both referenced and 6220 * modified by this pmap, returns its physical address so that the caller can 6221 * find other mappings. 
6222 */ 6223 int 6224 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 6225 { 6226 pt1_entry_t *pte1p, pte1; 6227 pt2_entry_t *pte2p, pte2; 6228 vm_paddr_t pa; 6229 bool managed; 6230 int val; 6231 6232 PMAP_LOCK(pmap); 6233 pte1p = pmap_pte1(pmap, addr); 6234 pte1 = pte1_load(pte1p); 6235 if (pte1_is_section(pte1)) { 6236 pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); 6237 managed = pte1_is_managed(pte1); 6238 val = MINCORE_PSIND(1) | MINCORE_INCORE; 6239 if (pte1_is_dirty(pte1)) 6240 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6241 if (pte1 & PTE1_A) 6242 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6243 } else if (pte1_is_link(pte1)) { 6244 pte2p = pmap_pte2(pmap, addr); 6245 pte2 = pte2_load(pte2p); 6246 pmap_pte2_release(pte2p); 6247 pa = pte2_pa(pte2); 6248 managed = pte2_is_managed(pte2); 6249 val = MINCORE_INCORE; 6250 if (pte2_is_dirty(pte2)) 6251 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6252 if (pte2 & PTE2_A) 6253 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6254 } else { 6255 managed = false; 6256 val = 0; 6257 } 6258 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6259 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 6260 *pap = pa; 6261 } 6262 PMAP_UNLOCK(pmap); 6263 return (val); 6264 } 6265 6266 void 6267 pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) 6268 { 6269 vm_offset_t sva; 6270 uint32_t l2attr; 6271 6272 KASSERT((size & PAGE_MASK) == 0, 6273 ("%s: device mapping not page-sized", __func__)); 6274 6275 sva = va; 6276 l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); 6277 while (size != 0) { 6278 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); 6279 va += PAGE_SIZE; 6280 pa += PAGE_SIZE; 6281 size -= PAGE_SIZE; 6282 } 6283 tlb_flush_range(sva, va - sva); 6284 } 6285 6286 void 6287 pmap_kremove_device(vm_offset_t va, vm_size_t size) 6288 { 6289 vm_offset_t sva; 6290 6291 KASSERT((size & PAGE_MASK) == 0, 6292 ("%s: device mapping not 
page-sized", __func__)); 6293 6294 sva = va; 6295 while (size != 0) { 6296 pmap_kremove(va); 6297 va += PAGE_SIZE; 6298 size -= PAGE_SIZE; 6299 } 6300 tlb_flush_range(sva, va - sva); 6301 } 6302 6303 void 6304 pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) 6305 { 6306 6307 pcb->pcb_pagedir = pmap_ttb_get(pmap); 6308 } 6309 6310 /* 6311 * Clean L1 data cache range by physical address. 6312 * The range must be within a single page. 6313 */ 6314 static void 6315 pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) 6316 { 6317 pt2_entry_t *cmap2_pte2p; 6318 struct pcpu *pc; 6319 6320 KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, 6321 ("%s: not on single page", __func__)); 6322 6323 sched_pin(); 6324 pc = get_pcpu(); 6325 cmap2_pte2p = pc->pc_cmap2_pte2p; 6326 mtx_lock(&pc->pc_cmap_lock); 6327 if (pte2_load(cmap2_pte2p) != 0) 6328 panic("%s: CMAP2 busy", __func__); 6329 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); 6330 dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size); 6331 pte2_clear(cmap2_pte2p); 6332 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6333 sched_unpin(); 6334 mtx_unlock(&pc->pc_cmap_lock); 6335 } 6336 6337 /* 6338 * Sync instruction cache range which is not mapped yet. 6339 */ 6340 void 6341 cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) 6342 { 6343 uint32_t len, offset; 6344 vm_page_t m; 6345 6346 /* Write back d-cache on given address range. */ 6347 offset = pa & PAGE_MASK; 6348 for ( ; size != 0; size -= len, pa += len, offset = 0) { 6349 len = min(PAGE_SIZE - offset, size); 6350 m = PHYS_TO_VM_PAGE(pa); 6351 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", 6352 __func__, pa)); 6353 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); 6354 } 6355 /* 6356 * I-cache is VIPT. Only way how to flush all virtual mappings 6357 * on given physical address is to invalidate all i-cache. 
6358 */ 6359 icache_inv_all(); 6360 } 6361 6362 void 6363 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size) 6364 { 6365 6366 /* Write back d-cache on given address range. */ 6367 if (va >= VM_MIN_KERNEL_ADDRESS) { 6368 dcache_wb_pou(va, size); 6369 } else { 6370 uint32_t len, offset; 6371 vm_paddr_t pa; 6372 vm_page_t m; 6373 6374 offset = va & PAGE_MASK; 6375 for ( ; size != 0; size -= len, va += len, offset = 0) { 6376 pa = pmap_extract(pmap, va); /* offset is preserved */ 6377 len = min(PAGE_SIZE - offset, size); 6378 m = PHYS_TO_VM_PAGE(pa); 6379 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", 6380 __func__, pa)); 6381 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); 6382 } 6383 } 6384 /* 6385 * I-cache is VIPT. Only way how to flush all virtual mappings 6386 * on given physical address is to invalidate all i-cache. 6387 */ 6388 icache_inv_all(); 6389 } 6390 6391 /* 6392 * The implementation of pmap_fault() uses IN_RANGE2() macro which 6393 * depends on the fact that given range size is a power of 2. 6394 */ 6395 CTASSERT(powerof2(NB_IN_PT1)); 6396 CTASSERT(powerof2(PT2MAP_SIZE)); 6397 6398 #define IN_RANGE2(addr, start, size) \ 6399 ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) 6400 6401 /* 6402 * Handle access and R/W emulation faults. 6403 */ 6404 int 6405 pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode) 6406 { 6407 pt1_entry_t *pte1p, pte1; 6408 pt2_entry_t *pte2p, pte2; 6409 6410 if (pmap == NULL) 6411 pmap = kernel_pmap; 6412 6413 /* 6414 * In kernel, we should never get abort with FAR which is in range of 6415 * pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here 6416 * and print out a useful abort message and even get to the debugger 6417 * otherwise it likely ends with never ending loop of aborts. 6418 */ 6419 if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { 6420 /* 6421 * All L1 tables should always be mapped and present. 
6422 * However, we check only current one herein. For user mode, 6423 * only permission abort from malicious user is not fatal. 6424 * And alignment abort as it may have higher priority. 6425 */ 6426 if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) { 6427 CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", 6428 __func__, pmap, pmap->pm_pt1, far); 6429 panic("%s: pm_pt1 abort", __func__); 6430 } 6431 return (KERN_INVALID_ADDRESS); 6432 } 6433 if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { 6434 /* 6435 * PT2MAP should be always mapped and present in current 6436 * L1 table. However, only existing L2 tables are mapped 6437 * in PT2MAP. For user mode, only L2 translation abort and 6438 * permission abort from malicious user is not fatal. 6439 * And alignment abort as it may have higher priority. 6440 */ 6441 if (!usermode || (idx != FAULT_ALIGN && 6442 idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { 6443 CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", 6444 __func__, pmap, PT2MAP, far); 6445 panic("%s: PT2MAP abort", __func__); 6446 } 6447 return (KERN_INVALID_ADDRESS); 6448 } 6449 6450 /* 6451 * A pmap lock is used below for handling of access and R/W emulation 6452 * aborts. They were handled by atomic operations before so some 6453 * analysis of new situation is needed to answer the following question: 6454 * Is it safe to use the lock even for these aborts? 6455 * 6456 * There may happen two cases in general: 6457 * 6458 * (1) Aborts while the pmap lock is locked already - this should not 6459 * happen as pmap lock is not recursive. However, under pmap lock only 6460 * internal kernel data should be accessed and such data should be 6461 * mapped with A bit set and NM bit cleared. If double abort happens, 6462 * then a mapping of data which has caused it must be fixed. Further, 6463 * all new mappings are always made with A bit set and the bit can be 6464 * cleared only on managed mappings. 
6465 * 6466 * (2) Aborts while another lock(s) is/are locked - this already can 6467 * happen. However, there is no difference here if it's either access or 6468 * R/W emulation abort, or if it's some other abort. 6469 */ 6470 6471 PMAP_LOCK(pmap); 6472 #ifdef INVARIANTS 6473 pte1 = pte1_load(pmap_pte1(pmap, far)); 6474 if (pte1_is_link(pte1)) { 6475 /* 6476 * Check in advance that associated L2 page table is mapped into 6477 * PT2MAP space. Note that faulty access to not mapped L2 page 6478 * table is caught in more general check above where "far" is 6479 * checked that it does not lay in PT2MAP space. Note also that 6480 * L1 page table and PT2TAB always exist and are mapped. 6481 */ 6482 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); 6483 if (!pte2_is_valid(pte2)) 6484 panic("%s: missing L2 page table (%p, %#x)", 6485 __func__, pmap, far); 6486 } 6487 #endif 6488 #ifdef SMP 6489 /* 6490 * Special treatment is due to break-before-make approach done when 6491 * pte1 is updated for userland mapping during section promotion or 6492 * demotion. If not caught here, pmap_enter() can find a section 6493 * mapping on faulting address. That is not allowed. 6494 */ 6495 if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) { 6496 PMAP_UNLOCK(pmap); 6497 return (KERN_SUCCESS); 6498 } 6499 #endif 6500 /* 6501 * Accesss bits for page and section. Note that the entry 6502 * is not in TLB yet, so TLB flush is not necessary. 6503 * 6504 * QQQ: This is hardware emulation, we do not call userret() 6505 * for aborts from user mode. 6506 */ 6507 if (idx == FAULT_ACCESS_L2) { 6508 pte1 = pte1_load(pmap_pte1(pmap, far)); 6509 if (pte1_is_link(pte1)) { 6510 /* L2 page table should exist and be mapped. */ 6511 pte2p = pt2map_entry(far); 6512 pte2 = pte2_load(pte2p); 6513 if (pte2_is_valid(pte2)) { 6514 pte2_store(pte2p, pte2 | PTE2_A); 6515 PMAP_UNLOCK(pmap); 6516 return (KERN_SUCCESS); 6517 } 6518 } else { 6519 /* 6520 * We got L2 access fault but PTE1 is not a link. 
6521 * Probably some race happened, do nothing. 6522 */ 6523 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L2 - pmap %#x far %#x", 6524 __func__, pmap, far); 6525 PMAP_UNLOCK(pmap); 6526 return (KERN_SUCCESS); 6527 } 6528 } 6529 if (idx == FAULT_ACCESS_L1) { 6530 pte1p = pmap_pte1(pmap, far); 6531 pte1 = pte1_load(pte1p); 6532 if (pte1_is_section(pte1)) { 6533 pte1_store(pte1p, pte1 | PTE1_A); 6534 PMAP_UNLOCK(pmap); 6535 return (KERN_SUCCESS); 6536 } else { 6537 /* 6538 * We got L1 access fault but PTE1 is not section 6539 * mapping. Probably some race happened, do nothing. 6540 */ 6541 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L1 - pmap %#x far %#x", 6542 __func__, pmap, far); 6543 PMAP_UNLOCK(pmap); 6544 return (KERN_SUCCESS); 6545 } 6546 } 6547 6548 /* 6549 * Handle modify bits for page and section. Note that the modify 6550 * bit is emulated by software. So PTEx_RO is software read only 6551 * bit and PTEx_NM flag is real hardware read only bit. 6552 * 6553 * QQQ: This is hardware emulation, we do not call userret() 6554 * for aborts from user mode. 6555 */ 6556 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { 6557 pte1 = pte1_load(pmap_pte1(pmap, far)); 6558 if (pte1_is_link(pte1)) { 6559 /* L2 page table should exist and be mapped. */ 6560 pte2p = pt2map_entry(far); 6561 pte2 = pte2_load(pte2p); 6562 if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && 6563 (pte2 & PTE2_NM)) { 6564 pte2_store(pte2p, pte2 & ~PTE2_NM); 6565 tlb_flush(trunc_page(far)); 6566 PMAP_UNLOCK(pmap); 6567 return (KERN_SUCCESS); 6568 } 6569 } else { 6570 /* 6571 * We got L2 permission fault but PTE1 is not a link. 6572 * Probably some race happened, do nothing. 
6573 */ 6574 CTR3(KTR_PMAP, "%s: FAULT_PERM_L2 - pmap %#x far %#x", 6575 __func__, pmap, far); 6576 PMAP_UNLOCK(pmap); 6577 return (KERN_SUCCESS); 6578 } 6579 } 6580 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { 6581 pte1p = pmap_pte1(pmap, far); 6582 pte1 = pte1_load(pte1p); 6583 if (pte1_is_section(pte1)) { 6584 if (!(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) { 6585 pte1_store(pte1p, pte1 & ~PTE1_NM); 6586 tlb_flush(pte1_trunc(far)); 6587 PMAP_UNLOCK(pmap); 6588 return (KERN_SUCCESS); 6589 } 6590 } else { 6591 /* 6592 * We got L1 permission fault but PTE1 is not section 6593 * mapping. Probably some race happened, do nothing. 6594 */ 6595 CTR3(KTR_PMAP, "%s: FAULT_PERM_L1 - pmap %#x far %#x", 6596 __func__, pmap, far); 6597 PMAP_UNLOCK(pmap); 6598 return (KERN_SUCCESS); 6599 } 6600 } 6601 6602 /* 6603 * QQQ: The previous code, mainly fast handling of access and 6604 * modify bits aborts, could be moved to ASM. Now we are 6605 * starting to deal with not fast aborts. 6606 */ 6607 PMAP_UNLOCK(pmap); 6608 return (KERN_FAILURE); 6609 } 6610 6611 #if defined(PMAP_DEBUG) 6612 /* 6613 * Reusing of KVA used in pmap_zero_page function !!! 
6614 */ 6615 static void 6616 pmap_zero_page_check(vm_page_t m) 6617 { 6618 pt2_entry_t *cmap2_pte2p; 6619 uint32_t *p, *end; 6620 struct pcpu *pc; 6621 6622 sched_pin(); 6623 pc = get_pcpu(); 6624 cmap2_pte2p = pc->pc_cmap2_pte2p; 6625 mtx_lock(&pc->pc_cmap_lock); 6626 if (pte2_load(cmap2_pte2p) != 0) 6627 panic("%s: CMAP2 busy", __func__); 6628 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6629 vm_page_pte2_attr(m))); 6630 end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); 6631 for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) 6632 if (*p != 0) 6633 panic("%s: page %p not zero, va: %p", __func__, m, 6634 pc->pc_cmap2_addr); 6635 pte2_clear(cmap2_pte2p); 6636 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6637 sched_unpin(); 6638 mtx_unlock(&pc->pc_cmap_lock); 6639 } 6640 6641 int 6642 pmap_pid_dump(int pid) 6643 { 6644 pmap_t pmap; 6645 struct proc *p; 6646 int npte2 = 0; 6647 int i, j, index; 6648 6649 sx_slock(&allproc_lock); 6650 FOREACH_PROC_IN_SYSTEM(p) { 6651 if (p->p_pid != pid || p->p_vmspace == NULL) 6652 continue; 6653 index = 0; 6654 pmap = vmspace_pmap(p->p_vmspace); 6655 for (i = 0; i < NPTE1_IN_PT1; i++) { 6656 pt1_entry_t pte1; 6657 pt2_entry_t *pte2p, pte2; 6658 vm_offset_t base, va; 6659 vm_paddr_t pa; 6660 vm_page_t m; 6661 6662 base = i << PTE1_SHIFT; 6663 pte1 = pte1_load(&pmap->pm_pt1[i]); 6664 6665 if (pte1_is_section(pte1)) { 6666 /* 6667 * QQQ: Do something here! 
6668 */ 6669 } else if (pte1_is_link(pte1)) { 6670 for (j = 0; j < NPTE2_IN_PT2; j++) { 6671 va = base + (j << PAGE_SHIFT); 6672 if (va >= VM_MIN_KERNEL_ADDRESS) { 6673 if (index) { 6674 index = 0; 6675 printf("\n"); 6676 } 6677 sx_sunlock(&allproc_lock); 6678 return (npte2); 6679 } 6680 pte2p = pmap_pte2(pmap, va); 6681 pte2 = pte2_load(pte2p); 6682 pmap_pte2_release(pte2p); 6683 if (!pte2_is_valid(pte2)) 6684 continue; 6685 6686 pa = pte2_pa(pte2); 6687 m = PHYS_TO_VM_PAGE(pa); 6688 printf("va: 0x%x, pa: 0x%x, w: %d, " 6689 "f: 0x%x", va, pa, 6690 m->ref_count, m->flags); 6691 npte2++; 6692 index++; 6693 if (index >= 2) { 6694 index = 0; 6695 printf("\n"); 6696 } else { 6697 printf(" "); 6698 } 6699 } 6700 } 6701 } 6702 } 6703 sx_sunlock(&allproc_lock); 6704 return (npte2); 6705 } 6706 6707 #endif 6708 6709 #ifdef DDB 6710 static pt2_entry_t * 6711 pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) 6712 { 6713 pt1_entry_t pte1; 6714 vm_paddr_t pt2pg_pa; 6715 6716 pte1 = pte1_load(pmap_pte1(pmap, va)); 6717 if (!pte1_is_link(pte1)) 6718 return (NULL); 6719 6720 if (pmap_is_current(pmap)) 6721 return (pt2map_entry(va)); 6722 6723 /* Note that L2 page table size is not equal to PAGE_SIZE. 
*/ 6724 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 6725 if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { 6726 pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); 6727 #ifdef SMP 6728 PMAP3cpu = PCPU_GET(cpuid); 6729 #endif 6730 tlb_flush_local((vm_offset_t)PADDR3); 6731 } 6732 #ifdef SMP 6733 else if (PMAP3cpu != PCPU_GET(cpuid)) { 6734 PMAP3cpu = PCPU_GET(cpuid); 6735 tlb_flush_local((vm_offset_t)PADDR3); 6736 } 6737 #endif 6738 return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 6739 } 6740 6741 static void 6742 dump_pmap(pmap_t pmap) 6743 { 6744 6745 printf("pmap %p\n", pmap); 6746 printf(" pm_pt1: %p\n", pmap->pm_pt1); 6747 printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); 6748 printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); 6749 } 6750 6751 DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) 6752 { 6753 6754 pmap_t pmap; 6755 LIST_FOREACH(pmap, &allpmaps, pm_list) { 6756 dump_pmap(pmap); 6757 } 6758 } 6759 6760 static int 6761 pte2_class(pt2_entry_t pte2) 6762 { 6763 int cls; 6764 6765 cls = (pte2 >> 2) & 0x03; 6766 cls |= (pte2 >> 4) & 0x04; 6767 return (cls); 6768 } 6769 6770 static void 6771 dump_section(pmap_t pmap, uint32_t pte1_idx) 6772 { 6773 } 6774 6775 static void 6776 dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) 6777 { 6778 uint32_t i; 6779 vm_offset_t va; 6780 pt2_entry_t *pte2p, pte2; 6781 vm_page_t m; 6782 6783 va = pte1_idx << PTE1_SHIFT; 6784 pte2p = pmap_pte2_ddb(pmap, va); 6785 for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { 6786 pte2 = pte2_load(pte2p); 6787 if (pte2 == 0) 6788 continue; 6789 if (!pte2_is_valid(pte2)) { 6790 printf(" 0x%08X: 0x%08X", va, pte2); 6791 if (!invalid_ok) 6792 printf(" - not valid !!!"); 6793 printf("\n"); 6794 continue; 6795 } 6796 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 6797 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, 6798 pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); 6799 if (m != NULL) { 6800 printf(" v:%d w:%d f:0x%04X\n", m->valid, 6801 m->ref_count, m->flags); 
6802 } else { 6803 printf("\n"); 6804 } 6805 } 6806 } 6807 6808 static __inline boolean_t 6809 is_pv_chunk_space(vm_offset_t va) 6810 { 6811 6812 if ((((vm_offset_t)pv_chunkbase) <= va) && 6813 (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) 6814 return (TRUE); 6815 return (FALSE); 6816 } 6817 6818 DB_SHOW_COMMAND(pmap, pmap_pmap_print) 6819 { 6820 /* XXX convert args. */ 6821 pmap_t pmap = (pmap_t)addr; 6822 pt1_entry_t pte1; 6823 pt2_entry_t pte2; 6824 vm_offset_t va, eva; 6825 vm_page_t m; 6826 uint32_t i; 6827 boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; 6828 6829 if (have_addr) { 6830 pmap_t pm; 6831 6832 LIST_FOREACH(pm, &allpmaps, pm_list) 6833 if (pm == pmap) break; 6834 if (pm == NULL) { 6835 printf("given pmap %p is not in allpmaps list\n", pmap); 6836 return; 6837 } 6838 } else 6839 pmap = PCPU_GET(curpmap); 6840 6841 eva = (modif[0] == 'u') ? VM_MAXUSER_ADDRESS : 0xFFFFFFFF; 6842 dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ 6843 6844 printf("pmap: 0x%08X\n", (uint32_t)pmap); 6845 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6846 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6847 6848 for(i = 0; i < NPTE1_IN_PT1; i++) { 6849 pte1 = pte1_load(&pmap->pm_pt1[i]); 6850 if (pte1 == 0) 6851 continue; 6852 va = i << PTE1_SHIFT; 6853 if (va >= eva) 6854 break; 6855 6856 if (pte1_is_section(pte1)) { 6857 printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, 6858 !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); 6859 dump_section(pmap, i); 6860 } else if (pte1_is_link(pte1)) { 6861 dump_link_ok = TRUE; 6862 invalid_ok = FALSE; 6863 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6864 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 6865 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", 6866 va, pte1, pte2, m); 6867 if (is_pv_chunk_space(va)) { 6868 printf(" - pv_chunk space"); 6869 if (dump_pv_chunk) 6870 invalid_ok = TRUE; 6871 else 6872 dump_link_ok = FALSE; 6873 } 6874 else if (m != NULL) 6875 printf(" w:%d w2:%u", m->ref_count, 6876 
pt2_wirecount_get(m, pte1_index(va))); 6877 if (pte2 == 0) 6878 printf(" !!! pt2tab entry is ZERO"); 6879 else if (pte2_pa(pte1) != pte2_pa(pte2)) 6880 printf(" !!! pt2tab entry is DIFFERENT - m: %p", 6881 PHYS_TO_VM_PAGE(pte2_pa(pte2))); 6882 printf("\n"); 6883 if (dump_link_ok) 6884 dump_link(pmap, i, invalid_ok); 6885 } else 6886 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6887 } 6888 } 6889 6890 static void 6891 dump_pt2tab(pmap_t pmap) 6892 { 6893 uint32_t i; 6894 pt2_entry_t pte2; 6895 vm_offset_t va; 6896 vm_paddr_t pa; 6897 vm_page_t m; 6898 6899 printf("PT2TAB:\n"); 6900 for (i = 0; i < PT2TAB_ENTRIES; i++) { 6901 pte2 = pte2_load(&pmap->pm_pt2tab[i]); 6902 if (!pte2_is_valid(pte2)) 6903 continue; 6904 va = i << PT2TAB_SHIFT; 6905 pa = pte2_pa(pte2); 6906 m = PHYS_TO_VM_PAGE(pa); 6907 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, 6908 pte2_class(pte2), !!(pte2 & PTE2_S), m); 6909 if (m != NULL) 6910 printf(" , w: %d, f: 0x%04X pidx: %lld", 6911 m->ref_count, m->flags, m->pindex); 6912 printf("\n"); 6913 } 6914 } 6915 6916 DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) 6917 { 6918 /* XXX convert args. 
*/ 6919 pmap_t pmap = (pmap_t)addr; 6920 pt1_entry_t pte1; 6921 pt2_entry_t pte2; 6922 vm_offset_t va; 6923 uint32_t i, start; 6924 6925 if (have_addr) { 6926 printf("supported only on current pmap\n"); 6927 return; 6928 } 6929 6930 pmap = PCPU_GET(curpmap); 6931 printf("curpmap: 0x%08X\n", (uint32_t)pmap); 6932 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6933 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6934 6935 start = pte1_index((vm_offset_t)PT2MAP); 6936 for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { 6937 pte1 = pte1_load(&pmap->pm_pt1[i]); 6938 if (pte1 == 0) 6939 continue; 6940 va = i << PTE1_SHIFT; 6941 if (pte1_is_section(pte1)) { 6942 printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, 6943 !!(pte1 & PTE1_S)); 6944 dump_section(pmap, i); 6945 } else if (pte1_is_link(pte1)) { 6946 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6947 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, 6948 pte1, pte2); 6949 if (pte2 == 0) 6950 printf(" !!! pt2tab entry is ZERO\n"); 6951 } else 6952 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6953 } 6954 dump_pt2tab(pmap); 6955 } 6956 #endif 6957