1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * Copyright (c) 1994 John S. Dyson 6 * Copyright (c) 1994 David Greenman 7 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 8 * Copyright (c) 2014-2016 Svatopluk Kraus <skra@FreeBSD.org> 9 * Copyright (c) 2014-2016 Michal Meloun <mmel@FreeBSD.org> 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 41 */ 42 /*- 43 * Copyright (c) 2003 Networks Associates Technology, Inc. 44 * All rights reserved. 45 * 46 * This software was developed for the FreeBSD Project by Jake Burkholder, 47 * Safeport Network Services, and Network Associates Laboratories, the 48 * Security Research Division of Network Associates, Inc. under 49 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 50 * CHATS research program. 51 * 52 * Redistribution and use in source and binary forms, with or without 53 * modification, are permitted provided that the following conditions 54 * are met: 55 * 1. Redistributions of source code must retain the above copyright 56 * notice, this list of conditions and the following disclaimer. 57 * 2. Redistributions in binary form must reproduce the above copyright 58 * notice, this list of conditions and the following disclaimer in the 59 * documentation and/or other materials provided with the distribution. 60 * 61 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 64 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 71 * SUCH DAMAGE. 72 */ 73 74 #include <sys/cdefs.h> 75 __FBSDID("$FreeBSD$"); 76 77 /* 78 * Manages physical address maps. 79 * 80 * Since the information managed by this module is 81 * also stored by the logical address mapping module, 82 * this module may throw away valid virtual-to-physical 83 * mappings at almost any time. However, invalidations 84 * of virtual-to-physical mappings must be done as 85 * requested. 86 * 87 * In order to cope with hardware architectures which 88 * make virtual-to-physical map invalidates expensive, 89 * this module may delay invalidate or reduced protection 90 * operations until such time as they are actually 91 * necessary. This module is given full information as 92 * to which processors are currently using which maps, 93 * and to when physical maps must be made correct. 
94 */ 95 96 #include "opt_vm.h" 97 #include "opt_pmap.h" 98 #include "opt_ddb.h" 99 100 #include <sys/param.h> 101 #include <sys/systm.h> 102 #include <sys/kernel.h> 103 #include <sys/ktr.h> 104 #include <sys/lock.h> 105 #include <sys/proc.h> 106 #include <sys/rwlock.h> 107 #include <sys/malloc.h> 108 #include <sys/vmmeter.h> 109 #include <sys/malloc.h> 110 #include <sys/mman.h> 111 #include <sys/sf_buf.h> 112 #include <sys/smp.h> 113 #include <sys/sched.h> 114 #include <sys/sysctl.h> 115 116 #ifdef DDB 117 #include <ddb/ddb.h> 118 #endif 119 120 #include <vm/vm.h> 121 #include <vm/uma.h> 122 #include <vm/pmap.h> 123 #include <vm/vm_param.h> 124 #include <vm/vm_kern.h> 125 #include <vm/vm_object.h> 126 #include <vm/vm_map.h> 127 #include <vm/vm_page.h> 128 #include <vm/vm_pageout.h> 129 #include <vm/vm_phys.h> 130 #include <vm/vm_extern.h> 131 #include <vm/vm_reserv.h> 132 #include <sys/lock.h> 133 #include <sys/mutex.h> 134 135 #include <machine/md_var.h> 136 #include <machine/pmap_var.h> 137 #include <machine/cpu.h> 138 #include <machine/pcb.h> 139 #include <machine/sf_buf.h> 140 #ifdef SMP 141 #include <machine/smp.h> 142 #endif 143 #ifndef PMAP_SHPGPERPROC 144 #define PMAP_SHPGPERPROC 200 145 #endif 146 147 #ifndef DIAGNOSTIC 148 #define PMAP_INLINE __inline 149 #else 150 #define PMAP_INLINE 151 #endif 152 153 #ifdef PMAP_DEBUG 154 static void pmap_zero_page_check(vm_page_t m); 155 void pmap_debug(int level); 156 int pmap_pid_dump(int pid); 157 158 #define PDEBUG(_lev_,_stat_) \ 159 if (pmap_debug_level >= (_lev_)) \ 160 ((_stat_)) 161 #define dprintf printf 162 int pmap_debug_level = 1; 163 #else /* PMAP_DEBUG */ 164 #define PDEBUG(_lev_,_stat_) /* Nothing */ 165 #define dprintf(x, arg...) 166 #endif /* PMAP_DEBUG */ 167 168 /* 169 * Level 2 page tables map definion ('max' is excluded). 
 */

#define PT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define PT2V_MAX_ADDRESS	((vm_offset_t)PT2MAP + PT2MAP_SIZE)

/* User part of the PT2MAP window: it ends where the PT2s for KERNBASE begin. */
#define UPT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define UPT2V_MAX_ADDRESS \
    ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT)))

/*
 * Promotion to a 1MB (PTE1) page mapping requires that the corresponding
 * 4KB (PTE2) page mappings have identical settings for the following fields:
 */
#define PTE2_PROMOTE	(PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG |	\
			 PTE2_NX | PTE2_RO | PTE2_U | PTE2_W |		\
			 PTE2_ATTR_MASK)

#define PTE1_PROMOTE	(PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG |	\
			 PTE1_NX | PTE1_RO | PTE1_U | PTE1_W |		\
			 PTE1_ATTR_MASK)

/*
 * Translate the attribute bits of an L2 (PTE2) descriptor into the
 * equivalent L1 section (PTE1) bits.  Each source bit is tested and
 * mapped individually; bits not listed here are dropped.
 */
#define ATTR_TO_L1(l2_attr)	((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \
				 (((l2_attr) & L2_C)    ? L1_S_C    : 0) | \
				 (((l2_attr) & L2_B)    ? L1_S_B    : 0) | \
				 (((l2_attr) & PTE2_A)  ? PTE1_A    : 0) | \
				 (((l2_attr) & PTE2_NM) ? PTE1_NM   : 0) | \
				 (((l2_attr) & PTE2_S)  ? PTE1_S    : 0) | \
				 (((l2_attr) & PTE2_NG) ? PTE1_NG   : 0) | \
				 (((l2_attr) & PTE2_NX) ? PTE1_NX   : 0) | \
				 (((l2_attr) & PTE2_RO) ? PTE1_RO   : 0) | \
				 (((l2_attr) & PTE2_U)  ? PTE1_U    : 0) | \
				 (((l2_attr) & PTE2_W)  ? PTE1_W    : 0))

/*
 * Inverse of ATTR_TO_L1(): translate the attribute bits of an L1 section
 * (PTE1) descriptor into the equivalent L2 (PTE2) bits.
 */
#define ATTR_TO_L2(l1_attr)	((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \
				 (((l1_attr) & L1_S_C)    ? L2_C    : 0) | \
				 (((l1_attr) & L1_S_B)    ? L2_B    : 0) | \
				 (((l1_attr) & PTE1_A)    ? PTE2_A  : 0) | \
				 (((l1_attr) & PTE1_NM)   ? PTE2_NM : 0) | \
				 (((l1_attr) & PTE1_S)    ? PTE2_S  : 0) | \
				 (((l1_attr) & PTE1_NG)   ? PTE2_NG : 0) | \
				 (((l1_attr) & PTE1_NX)   ? PTE2_NX : 0) | \
				 (((l1_attr) & PTE1_RO)   ? PTE2_RO : 0) | \
				 (((l1_attr) & PTE1_U)    ? PTE2_U  : 0) | \
				 (((l1_attr) & PTE1_W)    ? PTE2_W  : 0))

/*
 * PTE2 descriptors creation macros.
 */
/* Attribute bits for default memory and for memory holding page tables. */
#define PTE2_ATTR_DEFAULT	vm_memattr_to_pte2(VM_MEMATTR_DEFAULT)
#define PTE2_ATTR_PT		vm_memattr_to_pte2(pt_memattr)

/* Kernel read/write mapping of a page table page (global / not global). */
#define PTE2_KPT(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT)
#define PTE2_KPT_NG(pa)	PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT)

/* Kernel read/write and kernel read-only mappings with default attributes. */
#define PTE2_KRW(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT)
#define PTE2_KRO(pa)	PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT)

/* PV statistics are compiled in unconditionally: PV_STATS is defined here. */
#define PV_STATS
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

/*
 * The boot_pt1 is used temporarily in the very early boot stage as the L1
 * page table. We can init many things with no memory allocation thanks to
 * its static allocation and this brings two main advantages:
 * (1) other cores can be started very simply,
 * (2) various boot loaders can be supported as its arguments can be processed
 *     in virtual address space and can be moved to safe location before
 *     first allocation happened.
 * The only disadvantage is that boot_pt1 is used only in the very early boot
 * stage. However, the table is uninitialized and so lies in bss. Therefore
 * kernel image size is not influenced.
 *
 * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and
 *      CPU suspend/resume game.
248 */ 249 extern pt1_entry_t boot_pt1[]; 250 251 vm_paddr_t base_pt1; 252 pt1_entry_t *kern_pt1; 253 pt2_entry_t *kern_pt2tab; 254 pt2_entry_t *PT2MAP; 255 256 static uint32_t ttb_flags; 257 static vm_memattr_t pt_memattr; 258 ttb_entry_t pmap_kern_ttb; 259 260 struct pmap kernel_pmap_store; 261 LIST_HEAD(pmaplist, pmap); 262 static struct pmaplist allpmaps; 263 static struct mtx allpmaps_lock; 264 265 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 266 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 267 268 static vm_offset_t kernel_vm_end_new; 269 vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; 270 vm_offset_t vm_max_kernel_address; 271 vm_paddr_t kernel_l1pa; 272 273 static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; 274 275 /* 276 * Data for the pv entry allocation mechanism 277 */ 278 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 279 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 280 static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ 281 static int shpgperproc = PMAP_SHPGPERPROC; 282 283 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 284 int pv_maxchunks; /* How many chunks we have KVA for */ 285 vm_offset_t pv_vafree; /* freelist stored in the PTE */ 286 287 vm_paddr_t first_managed_pa; 288 #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) 289 290 /* 291 * All those kernel PT submaps that BSD is so fond of 292 */ 293 caddr_t _tmppt = 0; 294 295 /* 296 * Crashdump maps. 
 */
static caddr_t crashdumpmap;

static pt2_entry_t *PMAP1 = NULL, *PMAP2;
static pt2_entry_t *PADDR1 = NULL, *PADDR2;
#ifdef DDB
static pt2_entry_t *PMAP3;
static pt2_entry_t *PADDR3;
static int PMAP3cpu __unused; /* for SMP only */
#endif
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
    &PMAP1changedcpu, 0,
    "Number of times pmap_pte2_quick changed CPU with same PMAP1");
#endif
/* Read-only debug counters exported under the debug sysctl tree. */
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
    &PMAP1changed, 0,
    "Number of times pmap_pte2_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
    &PMAP1unchanged, 0,
    "Number of times pmap_pte2_quick didn't change PMAP1");
static struct mtx PMAP2mutex;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static __inline void pt2_wirecount_init(vm_page_t m);
static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p,
    vm_offset_t va);
static int pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1,
    u_int flags, vm_page_t m);
void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size);

/*
 * Function to set the debug level of the pmap code.
 * Only built when PMAP_DEBUG is defined; stores 'level' into
 * pmap_debug_level and reports the new value through dprintf().
 */
#ifdef PMAP_DEBUG
void
pmap_debug(int level)
{

	pmap_debug_level = level;
	dprintf("pmap_debug: level=%d\n", pmap_debug_level);
}
#endif /* PMAP_DEBUG */

/*
 * This table must correspond with memory attribute configuration in vm.h.
 * First entry is used for normal system mapping.
 *
 * Device memory is always marked as shared.
 * Normal memory is shared only in SMP.
356 * Not outer shareable bits are not used yet. 357 * Class 6 cannot be used on ARM11. 358 */ 359 #define TEXDEF_TYPE_SHIFT 0 360 #define TEXDEF_TYPE_MASK 0x3 361 #define TEXDEF_INNER_SHIFT 2 362 #define TEXDEF_INNER_MASK 0x3 363 #define TEXDEF_OUTER_SHIFT 4 364 #define TEXDEF_OUTER_MASK 0x3 365 #define TEXDEF_NOS_SHIFT 6 366 #define TEXDEF_NOS_MASK 0x1 367 368 #define TEX(t, i, o, s) \ 369 ((t) << TEXDEF_TYPE_SHIFT) | \ 370 ((i) << TEXDEF_INNER_SHIFT) | \ 371 ((o) << TEXDEF_OUTER_SHIFT | \ 372 ((s) << TEXDEF_NOS_SHIFT)) 373 374 static uint32_t tex_class[8] = { 375 /* type inner cache outer cache */ 376 TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ 377 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ 378 TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ 379 TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ 380 TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ 381 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ 382 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ 383 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ 384 }; 385 #undef TEX 386 387 static uint32_t pte2_attr_tab[8] = { 388 PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ 389 PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ 390 PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ 391 PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ 392 PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 393 0, /* 5 - NOT USED YET */ 394 0, /* 6 - NOT USED YET */ 395 0 /* 7 - NOT USED YET */ 396 }; 397 CTASSERT(VM_MEMATTR_WB_WA == 0); 398 CTASSERT(VM_MEMATTR_NOCACHE == 1); 399 CTASSERT(VM_MEMATTR_DEVICE == 2); 400 CTASSERT(VM_MEMATTR_SO == 3); 401 CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); 402 #define VM_MEMATTR_END (VM_MEMATTR_WRITE_THROUGH + 1) 403 404 boolean_t 405 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 406 { 407 408 return (mode >= 0 && mode < VM_MEMATTR_END); 409 } 410 411 static inline uint32_t 412 vm_memattr_to_pte2(vm_memattr_t ma) 413 { 414 415 
KASSERT((u_int)ma < VM_MEMATTR_END, 416 ("%s: bad vm_memattr_t %d", __func__, ma)); 417 return (pte2_attr_tab[(u_int)ma]); 418 } 419 420 static inline uint32_t 421 vm_page_pte2_attr(vm_page_t m) 422 { 423 424 return (vm_memattr_to_pte2(m->md.pat_mode)); 425 } 426 427 /* 428 * Convert TEX definition entry to TTB flags. 429 */ 430 static uint32_t 431 encode_ttb_flags(int idx) 432 { 433 uint32_t inner, outer, nos, reg; 434 435 inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & 436 TEXDEF_INNER_MASK; 437 outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & 438 TEXDEF_OUTER_MASK; 439 nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & 440 TEXDEF_NOS_MASK; 441 442 reg = nos << 5; 443 reg |= outer << 3; 444 if (cpuinfo.coherent_walk) 445 reg |= (inner & 0x1) << 6; 446 reg |= (inner & 0x2) >> 1; 447 #ifdef SMP 448 ARM_SMP_UP( 449 reg |= 1 << 1, 450 ); 451 #endif 452 return reg; 453 } 454 455 /* 456 * Set TEX remapping registers in current CPU. 457 */ 458 void 459 pmap_set_tex(void) 460 { 461 uint32_t prrr, nmrr; 462 uint32_t type, inner, outer, nos; 463 int i; 464 465 #ifdef PMAP_PTE_NOCACHE 466 /* XXX fixme */ 467 if (cpuinfo.coherent_walk) { 468 pt_memattr = VM_MEMATTR_WB_WA; 469 ttb_flags = encode_ttb_flags(0); 470 } 471 else { 472 pt_memattr = VM_MEMATTR_NOCACHE; 473 ttb_flags = encode_ttb_flags(1); 474 } 475 #else 476 pt_memattr = VM_MEMATTR_WB_WA; 477 ttb_flags = encode_ttb_flags(0); 478 #endif 479 480 prrr = 0; 481 nmrr = 0; 482 483 /* Build remapping register from TEX classes. */ 484 for (i = 0; i < 8; i++) { 485 type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & 486 TEXDEF_TYPE_MASK; 487 inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & 488 TEXDEF_INNER_MASK; 489 outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & 490 TEXDEF_OUTER_MASK; 491 nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & 492 TEXDEF_NOS_MASK; 493 494 prrr |= type << (i * 2); 495 prrr |= nos << (i + 24); 496 nmrr |= inner << (i * 2); 497 nmrr |= outer << (i * 2 + 16); 498 } 499 /* Add shareable bits for device memory. 
*/ 500 prrr |= PRRR_DS0 | PRRR_DS1; 501 502 /* Add shareable bits for normal memory in SMP case. */ 503 #ifdef SMP 504 ARM_SMP_UP( 505 prrr |= PRRR_NS1, 506 ); 507 #endif 508 cp15_prrr_set(prrr); 509 cp15_nmrr_set(nmrr); 510 511 /* Caches are disabled, so full TLB flush should be enough. */ 512 tlb_flush_all_local(); 513 } 514 515 /* 516 * Remap one vm_meattr class to another one. This can be useful as 517 * workaround for SOC errata, e.g. if devices must be accessed using 518 * SO memory class. 519 * 520 * !!! Please note that this function is absolutely last resort thing. 521 * It should not be used under normal circumstances. !!! 522 * 523 * Usage rules: 524 * - it shall be called after pmap_bootstrap_prepare() and before 525 * cpu_mp_start() (thus only on boot CPU). In practice, it's expected 526 * to be called from platform_attach() or platform_late_init(). 527 * 528 * - if remapping doesn't change caching mode, or until uncached class 529 * is remapped to any kind of cached one, then no other restriction exists. 530 * 531 * - if pmap_remap_vm_attr() changes caching mode, but both (original and 532 * remapped) remain cached, then caller is resposible for calling 533 * of dcache_wbinv_poc_all(). 534 * 535 * - remapping of any kind of cached class to uncached is not permitted. 536 */ 537 void 538 pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr) 539 { 540 int old_idx, new_idx; 541 542 /* Map VM memattrs to indexes to tex_class table. */ 543 old_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)old_attr]); 544 new_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)new_attr]); 545 546 /* Replace TEX attribute and apply it. */ 547 tex_class[old_idx] = tex_class[new_idx]; 548 pmap_set_tex(); 549 } 550 551 /* 552 * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words, 553 * KERNBASE is mapped by first L2 page table in L2 page table page. It 554 * meets same constrain due to PT2MAP being placed just under KERNBASE. 
 */
CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0);
/* The gap between user space and KERNBASE must be able to hold PT2MAP. */
CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE);

/*
 * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general.
 * For now, anyhow, the following check must be fulfilled.
 */
CTASSERT(PAGE_SIZE == PTE2_SIZE);
/*
 * We don't want to mess up MI code with all MMU and PMAP definitions,
 * so some things, which depend on other ones, are defined independently.
 * Now, it is time to check that we don't screw up something.
 */
CTASSERT(PDRSHIFT == PTE1_SHIFT);
/*
 * Check L1 and L2 page table entries definitions consistency.
 */
CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1));
CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2));
/*
 * Check L2 page tables page consistency.
 */
CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2));
CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG);
/*
 * Check PT2TAB consistency.
 * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG.
 * This should be done without remainder.
 */
CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG));

/*
 *	A PT2MAP magic.
 *
 *  All level 2 page tables (PT2s) are mapped contiguously and accordingly
 *  into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can
 *  be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page
 *  must be used together, but not necessarily at once. The first PT2 in a
 *  page must map things on correctly aligned address and the others must
 *  follow in right order.
 */
/* PT2TAB size in bytes, in PT2-sized units, and in pages. */
#define NB_IN_PT2TAB	(PT2TAB_ENTRIES * sizeof(pt2_entry_t))
#define NPT2_IN_PT2TAB	(NB_IN_PT2TAB / NB_IN_PT2)
#define NPG_IN_PT2TAB	(NB_IN_PT2TAB / PAGE_SIZE)

/*
 * Check PT2TAB consistency.
 * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2.
604 * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE. 605 * The both should be done without remainder. 606 */ 607 CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2)); 608 CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE)); 609 /* 610 * The implementation was made general, however, with the assumption 611 * bellow in mind. In case of another value of NPG_IN_PT2TAB, 612 * the code should be once more rechecked. 613 */ 614 CTASSERT(NPG_IN_PT2TAB == 1); 615 616 /* 617 * Get offset of PT2 in a page 618 * associated with given PT1 index. 619 */ 620 static __inline u_int 621 page_pt2off(u_int pt1_idx) 622 { 623 624 return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2); 625 } 626 627 /* 628 * Get physical address of PT2 629 * associated with given PT2s page and PT1 index. 630 */ 631 static __inline vm_paddr_t 632 page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx) 633 { 634 635 return (pgpa + page_pt2off(pt1_idx)); 636 } 637 638 /* 639 * Get first entry of PT2 640 * associated with given PT2s page and PT1 index. 641 */ 642 static __inline pt2_entry_t * 643 page_pt2(vm_offset_t pgva, u_int pt1_idx) 644 { 645 646 return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx))); 647 } 648 649 /* 650 * Get virtual address of PT2s page (mapped in PT2MAP) 651 * which holds PT2 which holds entry which maps given virtual address. 
652 */ 653 static __inline vm_offset_t 654 pt2map_pt2pg(vm_offset_t va) 655 { 656 657 va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); 658 return ((vm_offset_t)pt2map_entry(va)); 659 } 660 661 /***************************************************************************** 662 * 663 * THREE pmap initialization milestones exist: 664 * 665 * locore.S 666 * -> fundamental init (including MMU) in ASM 667 * 668 * initarm() 669 * -> fundamental init continues in C 670 * -> first available physical address is known 671 * 672 * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) 673 * -> basic (safe) interface for physical address allocation is made 674 * -> basic (safe) interface for virtual mapping is made 675 * -> limited not SMP coherent work is possible 676 * 677 * -> more fundamental init continues in C 678 * -> locks and some more things are available 679 * -> all fundamental allocations and mappings are done 680 * 681 * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) 682 * -> phys_avail[] and virtual_avail is set 683 * -> control is passed to vm subsystem 684 * -> physical and virtual address allocation are off limit 685 * -> low level mapping functions, some SMP coherent, 686 * are available, which cannot be used before vm subsystem 687 * is being inited 688 * 689 * mi_startup() 690 * -> vm subsystem is being inited 691 * 692 * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) 693 * -> pmap is fully inited 694 * 695 *****************************************************************************/ 696 697 /***************************************************************************** 698 * 699 * PMAP first stage initialization and utility functions 700 * for pre-bootstrap epoch. 
701 * 702 * After pmap_bootstrap_prepare() is called, the following functions 703 * can be used: 704 * 705 * (1) strictly only for this stage functions for physical page allocations, 706 * virtual space allocations, and mappings: 707 * 708 * vm_paddr_t pmap_preboot_get_pages(u_int num); 709 * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); 710 * vm_offset_t pmap_preboot_reserve_pages(u_int num); 711 * vm_offset_t pmap_preboot_get_vpages(u_int num); 712 * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 713 * vm_prot_t prot, vm_memattr_t attr); 714 * 715 * (2) for all stages: 716 * 717 * vm_paddr_t pmap_kextract(vm_offset_t va); 718 * 719 * NOTE: This is not SMP coherent stage. 720 * 721 *****************************************************************************/ 722 723 #define KERNEL_P2V(pa) \ 724 ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) 725 #define KERNEL_V2P(va) \ 726 ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) 727 728 static vm_paddr_t last_paddr; 729 730 /* 731 * Pre-bootstrap epoch page allocator. 732 */ 733 vm_paddr_t 734 pmap_preboot_get_pages(u_int num) 735 { 736 vm_paddr_t ret; 737 738 ret = last_paddr; 739 last_paddr += num * PAGE_SIZE; 740 741 return (ret); 742 } 743 744 /* 745 * The fundamental initialization of PMAP stuff. 746 * 747 * Some things already happened in locore.S and some things could happen 748 * before pmap_bootstrap_prepare() is called, so let's recall what is done: 749 * 1. Caches are disabled. 750 * 2. We are running on virtual addresses already with 'boot_pt1' 751 * as L1 page table. 752 * 3. So far, all virtual addresses can be converted to physical ones and 753 * vice versa by the following macros: 754 * KERNEL_P2V(pa) .... physical to virtual ones, 755 * KERNEL_V2P(va) .... virtual to physical ones. 756 * 757 * What is done herein: 758 * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. 759 * 2. PT2MAP magic is brought to live. 
 *  3. Basic preboot functions for page allocations and mappings can be used.
 *  4. Everything is prepared for L1 cache enabling.
 *
 *  Variations:
 *  1. To use second TTB register, so kernel and users page tables will be
 *     separated. This way process forking - pmap_pinit() - could be faster,
 *     it saves physical pages and KVA per a process, and it's simple change.
 *     However, it will lead, due to hardware matter, to the following:
 *     (a) 2G space for kernel and 2G space for users.
 *     (b) 1G space for kernel in low addresses and 3G for users above it.
 *     A question is: Is the case (b) really an option? Note that case (b)
 *     does save neither physical memory and KVA.
 *
 *  'last' is the first physical address not yet in use; allocations made
 *  here start at 'last' rounded up to a section (PTE1_SIZE) boundary.
 */
void
pmap_bootstrap_prepare(vm_paddr_t last)
{
	vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size;
	vm_offset_t pt2pg_va;
	pt1_entry_t *pte1p;
	pt2_entry_t *pte2p;
	u_int i;
	uint32_t l1_attr;

	/*
	 * Now, we are going to make real kernel mapping. Note that we are
	 * already running on some mapping made in locore.S and we expect
	 * that it's large enough to ensure nofault access to physical memory
	 * allocated herein before switch.
	 *
	 * As kernel image and everything needed before are and will be mapped
	 * by section mappings, we align last physical address to PTE1_SIZE.
	 */
	last_paddr = pte1_roundup(last);

	/*
	 * Allocate and zero page(s) for kernel L1 page table.
	 *
	 * Note that it's first allocation on space which was PTE1_SIZE
	 * aligned and as such base_pt1 is aligned to NB_IN_PT1 too.
	 */
	base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1);
	kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1);
	bzero((void*)kern_pt1, NB_IN_PT1);
	pte1_sync_range(kern_pt1, NB_IN_PT1);

	/* Allocate and zero page(s) for kernel PT2TAB. */
	pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB);
	kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa);
	bzero(kern_pt2tab, NB_IN_PT2TAB);
	pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB);

	/* Allocate and zero page(s) for kernel L2 page tables. */
	pt2pg_pa = pmap_preboot_get_pages(NKPT2PG);
	pt2pg_va = KERNEL_P2V(pt2pg_pa);
	size = NKPT2PG * PAGE_SIZE;
	bzero((void*)pt2pg_va, size);
	pte2_sync_range((pt2_entry_t *)pt2pg_va, size);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated pages for kernel L2 page tables so that vm_page
	 * structures representing these pages will be created. The vm_page
	 * structures are required for promotion of the corresponding kernel
	 * virtual addresses to section mappings.
	 *
	 * pmap_preboot_get_pages(0) allocates nothing; it only returns the
	 * current allocation mark, which serves as the segment end here.
	 */
	vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0));

	/*
	 * Insert allocated L2 page table pages to PT2TAB and make
	 * link to all PT2s in L1 page table. See how kernel_vm_end
	 * is initialized.
	 *
	 * We play simple and safe. So every KVA will have underlying
	 * L2 page table, even kernel image mapped by sections.
	 */
	pte2p = kern_pt2tab_entry(KERNBASE);
	for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE)
		pt2tab_store(pte2p++, PTE2_KPT(pa));

	pte1p = kern_pte1(KERNBASE);
	for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2)
		pte1_store(pte1p++, PTE1_LINK(pa));

	/*
	 * Make section mappings for kernel.  These overwrite the L1 links
	 * stored just above for the range up to 'last'.
	 */
	l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT);
	pte1p = kern_pte1(KERNBASE);
	for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE)
		pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr));

	/*
	 * Get free and aligned space for PT2MAP and make L1 page table links
	 * to L2 page tables held in PT2TAB.
	 *
	 * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t
	 * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus
	 * each entry in PT2TAB maps all PT2s in a page. This implies that
	 * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE.
	 */
	PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE);
	pte1p = kern_pte1((vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) {
		pte1_store(pte1p++, PTE1_LINK(pa));
	}

	/*
	 * Store PT2TAB in PT2TAB itself, i.e. self reference mapping.
	 * Each pmap will hold own PT2TAB, so the mapping should be not global.
	 */
	pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) {
		pt2tab_store(pte2p++, PTE2_KPT_NG(pa));
	}

	/*
	 * Choose correct L2 page table and make mappings for allocations
	 * made herein which replaces temporary locore.S mappings after a while.
	 * Note that PT2MAP cannot be used until we switch to kern_pt1.
	 *
	 * Note, that these allocations started aligned on 1M section and
	 * kernel PT1 was allocated first. Making of mappings must follow
	 * order of physical allocations as we've used KERNEL_P2V() macro
	 * for virtual addresses resolution.
	 */
	pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1);
	pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p)));

	pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1));

	/* Make mapping for kernel L1 page table. */
	for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE)
		pte2_store(pte2p++, PTE2_KPT(pa));

	/* Make mapping for kernel PT2TAB. */
	for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE)
		pte2_store(pte2p++, PTE2_KPT(pa));

	/* Finally, switch from 'boot_pt1' to 'kern_pt1'. */
	pmap_kern_ttb = base_pt1 | ttb_flags;
	cpuinfo_reinit_mmu(pmap_kern_ttb);
	/*
	 * Initialize the first available KVA. As kernel image is mapped by
	 * sections, we are leaving some gap behind.
	 */
	virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE;
}

/*
 *  Setup L2 page table page for given KVA.
 *  Used in pre-bootstrap epoch.
 *
 *  Note that we have allocated NKPT2PG pages for L2 page tables in advance
 *  and used them for mapping KVA starting from KERNBASE. However, this is not
 *  enough. Vectors and devices need L2 page tables too. Note that they are
 *  even above VM_MAX_KERNEL_ADDRESS.
 *
 *  Returns the physical address of the page of PT2s covering 'va',
 *  allocating and zeroing a new page if PT2TAB has no valid entry yet.
 */
static __inline vm_paddr_t
pmap_preboot_pt2pg_setup(vm_offset_t va)
{
	pt2_entry_t *pte2p, pte2;
	vm_paddr_t pt2pg_pa;

	/* Get associated entry in PT2TAB. */
	pte2p = kern_pt2tab_entry(va);

	/* Just return, if PT2s page exists already. */
	pte2 = pt2tab_load(pte2p);
	if (pte2_is_valid(pte2))
		return (pte2_pa(pte2));

	/* Below VM_MAX_KERNEL_ADDRESS the preallocated pages must suffice. */
	KASSERT(va >= VM_MAX_KERNEL_ADDRESS,
	    ("%s: NKPT2PG too small", __func__));

	/*
	 * Allocate page for PT2s and insert it to PT2TAB.
	 * In other words, map it into PT2MAP space.
	 */
	pt2pg_pa = pmap_preboot_get_pages(1);
	pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa));

	/* Zero all PT2s in allocated page. */
	bzero((void*)pt2map_pt2pg(va), PAGE_SIZE);
	pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE);

	return (pt2pg_pa);
}

/*
 *  Setup L2 page table for given KVA.
 *  Used in pre-bootstrap epoch.
 *
 *  Makes sure the page of PT2s for 'va' exists and links the PT2
 *  covering 'va' into the kernel L1 page table.
 */
static void
pmap_preboot_pt2_setup(vm_offset_t va)
{
	pt1_entry_t *pte1p;
	vm_paddr_t pt2pg_pa, pt2_pa;

	/* Setup PT2's page. */
	pt2pg_pa = pmap_preboot_pt2pg_setup(va);
	pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va));

	/* Insert PT2 to PT1. */
	pte1p = kern_pte1(va);
	pte1_store(pte1p, PTE1_LINK(pt2_pa));
}

/*
 *  Get L2 page entry associated with given KVA.
 *  Used in pre-bootstrap epoch.
968 */ 969 static __inline pt2_entry_t* 970 pmap_preboot_vtopte2(vm_offset_t va) 971 { 972 pt1_entry_t *pte1p; 973 974 /* Setup PT2 if needed. */ 975 pte1p = kern_pte1(va); 976 if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ 977 pmap_preboot_pt2_setup(va); 978 979 return (pt2map_entry(va)); 980 } 981 982 /* 983 * Pre-bootstrap epoch page(s) mapping(s). 984 */ 985 void 986 pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) 987 { 988 u_int i; 989 pt2_entry_t *pte2p; 990 991 /* Map all the pages. */ 992 for (i = 0; i < num; i++) { 993 pte2p = pmap_preboot_vtopte2(va); 994 pte2_store(pte2p, PTE2_KRW(pa)); 995 va += PAGE_SIZE; 996 pa += PAGE_SIZE; 997 } 998 } 999 1000 /* 1001 * Pre-bootstrap epoch virtual space alocator. 1002 */ 1003 vm_offset_t 1004 pmap_preboot_reserve_pages(u_int num) 1005 { 1006 u_int i; 1007 vm_offset_t start, va; 1008 pt2_entry_t *pte2p; 1009 1010 /* Allocate virtual space. */ 1011 start = va = virtual_avail; 1012 virtual_avail += num * PAGE_SIZE; 1013 1014 /* Zero the mapping. */ 1015 for (i = 0; i < num; i++) { 1016 pte2p = pmap_preboot_vtopte2(va); 1017 pte2_store(pte2p, 0); 1018 va += PAGE_SIZE; 1019 } 1020 1021 return (start); 1022 } 1023 1024 /* 1025 * Pre-bootstrap epoch page(s) allocation and mapping(s). 1026 */ 1027 vm_offset_t 1028 pmap_preboot_get_vpages(u_int num) 1029 { 1030 vm_paddr_t pa; 1031 vm_offset_t va; 1032 1033 /* Allocate physical page(s). */ 1034 pa = pmap_preboot_get_pages(num); 1035 1036 /* Allocate virtual space. */ 1037 va = virtual_avail; 1038 virtual_avail += num * PAGE_SIZE; 1039 1040 /* Map and zero all. */ 1041 pmap_preboot_map_pages(pa, va, num); 1042 bzero((void *)va, num * PAGE_SIZE); 1043 1044 return (va); 1045 } 1046 1047 /* 1048 * Pre-bootstrap epoch page mapping(s) with attributes. 
1049 */ 1050 void 1051 pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 1052 vm_prot_t prot, vm_memattr_t attr) 1053 { 1054 u_int num; 1055 u_int l1_attr, l1_prot, l2_prot, l2_attr; 1056 pt1_entry_t *pte1p; 1057 pt2_entry_t *pte2p; 1058 1059 l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; 1060 l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1061 l2_attr = vm_memattr_to_pte2(attr); 1062 l1_prot = ATTR_TO_L1(l2_prot); 1063 l1_attr = ATTR_TO_L1(l2_attr); 1064 1065 /* Map all the pages. */ 1066 num = round_page(size); 1067 while (num > 0) { 1068 if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { 1069 pte1p = kern_pte1(va); 1070 pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); 1071 va += PTE1_SIZE; 1072 pa += PTE1_SIZE; 1073 num -= PTE1_SIZE; 1074 } else { 1075 pte2p = pmap_preboot_vtopte2(va); 1076 pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); 1077 va += PAGE_SIZE; 1078 pa += PAGE_SIZE; 1079 num -= PAGE_SIZE; 1080 } 1081 } 1082 } 1083 1084 /* 1085 * Extract from the kernel page table the physical address 1086 * that is mapped by the given virtual address "va". 1087 */ 1088 vm_paddr_t 1089 pmap_kextract(vm_offset_t va) 1090 { 1091 vm_paddr_t pa; 1092 pt1_entry_t pte1; 1093 pt2_entry_t pte2; 1094 1095 pte1 = pte1_load(kern_pte1(va)); 1096 if (pte1_is_section(pte1)) { 1097 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1098 } else if (pte1_is_link(pte1)) { 1099 /* 1100 * We should beware of concurrent promotion that changes 1101 * pte1 at this point. However, it's not a problem as PT2 1102 * page is preserved by promotion in PT2TAB. So even if 1103 * it happens, using of PT2MAP is still safe. 1104 * 1105 * QQQ: However, concurrent removing is a problem which 1106 * ends in abort on PT2MAP space. Locking must be used 1107 * to deal with this. 
1108 */ 1109 pte2 = pte2_load(pt2map_entry(va)); 1110 pa = pte2_pa(pte2) | (va & PTE2_OFFSET); 1111 } 1112 else { 1113 panic("%s: va %#x pte1 %#x", __func__, va, pte1); 1114 } 1115 return (pa); 1116 } 1117 1118 /* 1119 * Extract from the kernel page table the physical address 1120 * that is mapped by the given virtual address "va". Also 1121 * return L2 page table entry which maps the address. 1122 * 1123 * This is only intended to be used for panic dumps. 1124 */ 1125 vm_paddr_t 1126 pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) 1127 { 1128 vm_paddr_t pa; 1129 pt1_entry_t pte1; 1130 pt2_entry_t pte2; 1131 1132 pte1 = pte1_load(kern_pte1(va)); 1133 if (pte1_is_section(pte1)) { 1134 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1135 pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; 1136 } else if (pte1_is_link(pte1)) { 1137 pte2 = pte2_load(pt2map_entry(va)); 1138 pa = pte2_pa(pte2); 1139 } else { 1140 pte2 = 0; 1141 pa = 0; 1142 } 1143 if (pte2p != NULL) 1144 *pte2p = pte2; 1145 return (pa); 1146 } 1147 1148 /***************************************************************************** 1149 * 1150 * PMAP second stage initialization and utility functions 1151 * for bootstrap epoch. 1152 * 1153 * After pmap_bootstrap() is called, the following functions for 1154 * mappings can be used: 1155 * 1156 * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); 1157 * void pmap_kremove(vm_offset_t va); 1158 * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, 1159 * int prot); 1160 * 1161 * NOTE: This is not SMP coherent stage. And physical page allocation is not 1162 * allowed during this stage. 1163 * 1164 *****************************************************************************/ 1165 1166 /* 1167 * Initialize kernel PMAP locks and lists, kernel_pmap itself, and 1168 * reserve various virtual spaces for temporary mappings. 
1169 */ 1170 void 1171 pmap_bootstrap(vm_offset_t firstaddr) 1172 { 1173 pt2_entry_t *unused __unused; 1174 struct pcpu *pc; 1175 1176 /* 1177 * Initialize the kernel pmap (which is statically allocated). 1178 */ 1179 PMAP_LOCK_INIT(kernel_pmap); 1180 kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ 1181 kernel_pmap->pm_pt1 = kern_pt1; 1182 kernel_pmap->pm_pt2tab = kern_pt2tab; 1183 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1184 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1185 1186 /* 1187 * Initialize the global pv list lock. 1188 */ 1189 rw_init(&pvh_global_lock, "pmap pv global"); 1190 1191 LIST_INIT(&allpmaps); 1192 1193 /* 1194 * Request a spin mutex so that changes to allpmaps cannot be 1195 * preempted by smp_rendezvous_cpus(). 1196 */ 1197 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 1198 mtx_lock_spin(&allpmaps_lock); 1199 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 1200 mtx_unlock_spin(&allpmaps_lock); 1201 1202 /* 1203 * Reserve some special page table entries/VA space for temporary 1204 * mapping of pages. 1205 */ 1206 #define SYSMAP(c, p, v, n) do { \ 1207 v = (c)pmap_preboot_reserve_pages(n); \ 1208 p = pt2map_entry((vm_offset_t)v); \ 1209 } while (0) 1210 1211 /* 1212 * Local CMAP1/CMAP2 are used for zeroing and copying pages. 1213 * Local CMAP2 is also used for data cache cleaning. 1214 */ 1215 pc = get_pcpu(); 1216 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1217 SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); 1218 SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); 1219 SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); 1220 1221 /* 1222 * Crashdump maps. 1223 */ 1224 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); 1225 1226 /* 1227 * _tmppt is used for reading arbitrary physical pages via /dev/mem. 1228 */ 1229 SYSMAP(caddr_t, unused, _tmppt, 1); 1230 1231 /* 1232 * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), 1233 * respectively. 
PADDR3 is used by pmap_pte2_ddb(). 1234 */ 1235 SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); 1236 SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); 1237 #ifdef DDB 1238 SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); 1239 #endif 1240 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 1241 1242 /* 1243 * Note that in very short time in initarm(), we are going to 1244 * initialize phys_avail[] array and no further page allocation 1245 * can happen after that until vm subsystem will be initialized. 1246 */ 1247 kernel_vm_end_new = kernel_vm_end; 1248 virtual_end = vm_max_kernel_address; 1249 } 1250 1251 static void 1252 pmap_init_reserved_pages(void) 1253 { 1254 struct pcpu *pc; 1255 vm_offset_t pages; 1256 int i; 1257 1258 CPU_FOREACH(i) { 1259 pc = pcpu_find(i); 1260 /* 1261 * Skip if the mapping has already been initialized, 1262 * i.e. this is the BSP. 1263 */ 1264 if (pc->pc_cmap1_addr != 0) 1265 continue; 1266 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1267 pages = kva_alloc(PAGE_SIZE * 3); 1268 if (pages == 0) 1269 panic("%s: unable to allocate KVA", __func__); 1270 pc->pc_cmap1_pte2p = pt2map_entry(pages); 1271 pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE); 1272 pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2)); 1273 pc->pc_cmap1_addr = (caddr_t)pages; 1274 pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE); 1275 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); 1276 } 1277 } 1278 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 1279 1280 /* 1281 * The function can already be use in second initialization stage. 1282 * As such, the function DOES NOT call pmap_growkernel() where PT2 1283 * allocation can happen. So if used, be sure that PT2 for given 1284 * virtual address is allocated already! 1285 * 1286 * Add a wired page to the kva. 1287 * Note: not SMP coherent. 
1288 */ 1289 static __inline void 1290 pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, 1291 uint32_t attr) 1292 { 1293 pt1_entry_t *pte1p; 1294 pt2_entry_t *pte2p; 1295 1296 pte1p = kern_pte1(va); 1297 if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ 1298 /* 1299 * This is a very low level function, so PT2 and particularly 1300 * PT2PG associated with given virtual address must be already 1301 * allocated. It's a pain mainly during pmap initialization 1302 * stage. However, called after pmap initialization with 1303 * virtual address not under kernel_vm_end will lead to 1304 * the same misery. 1305 */ 1306 if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) 1307 panic("%s: kernel PT2 not allocated!", __func__); 1308 } 1309 1310 pte2p = pt2map_entry(va); 1311 pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); 1312 } 1313 1314 PMAP_INLINE void 1315 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1316 { 1317 1318 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); 1319 } 1320 1321 /* 1322 * Remove a page from the kernel pagetables. 1323 * Note: not SMP coherent. 1324 */ 1325 PMAP_INLINE void 1326 pmap_kremove(vm_offset_t va) 1327 { 1328 pt1_entry_t *pte1p; 1329 pt2_entry_t *pte2p; 1330 1331 pte1p = kern_pte1(va); 1332 if (pte1_is_section(pte1_load(pte1p))) { 1333 pte1_clear(pte1p); 1334 } else { 1335 pte2p = pt2map_entry(va); 1336 pte2_clear(pte2p); 1337 } 1338 } 1339 1340 /* 1341 * Share new kernel PT2PG with all pmaps. 1342 * The caller is responsible for maintaining TLB consistency. 1343 */ 1344 static void 1345 pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) 1346 { 1347 pmap_t pmap; 1348 pt2_entry_t *pte2p; 1349 1350 mtx_lock_spin(&allpmaps_lock); 1351 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1352 pte2p = pmap_pt2tab_entry(pmap, va); 1353 pt2tab_store(pte2p, npte2); 1354 } 1355 mtx_unlock_spin(&allpmaps_lock); 1356 } 1357 1358 /* 1359 * Share new kernel PTE1 with all pmaps. 
1360 * The caller is responsible for maintaining TLB consistency. 1361 */ 1362 static void 1363 pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) 1364 { 1365 pmap_t pmap; 1366 pt1_entry_t *pte1p; 1367 1368 mtx_lock_spin(&allpmaps_lock); 1369 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1370 pte1p = pmap_pte1(pmap, va); 1371 pte1_store(pte1p, npte1); 1372 } 1373 mtx_unlock_spin(&allpmaps_lock); 1374 } 1375 1376 /* 1377 * Used to map a range of physical addresses into kernel 1378 * virtual address space. 1379 * 1380 * The value passed in '*virt' is a suggested virtual address for 1381 * the mapping. Architectures which can support a direct-mapped 1382 * physical to virtual region can return the appropriate address 1383 * within that region, leaving '*virt' unchanged. Other 1384 * architectures should map the pages starting at '*virt' and 1385 * update '*virt' with the first usable address after the mapped 1386 * region. 1387 * 1388 * NOTE: Read the comments above pmap_kenter_prot_attr() as 1389 * the function is used herein! 1390 */ 1391 vm_offset_t 1392 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1393 { 1394 vm_offset_t va, sva; 1395 vm_paddr_t pte1_offset; 1396 pt1_entry_t npte1; 1397 uint32_t l1prot, l2prot; 1398 uint32_t l1attr, l2attr; 1399 1400 PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," 1401 " prot = %d\n", __func__, *virt, start, end, end - start, prot)); 1402 1403 l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR; 1404 l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1405 l1prot = ATTR_TO_L1(l2prot); 1406 1407 l2attr = PTE2_ATTR_DEFAULT; 1408 l1attr = ATTR_TO_L1(l2attr); 1409 1410 va = *virt; 1411 /* 1412 * Does the physical address range's size and alignment permit at 1413 * least one section mapping to be created? 
1414 */ 1415 pte1_offset = start & PTE1_OFFSET; 1416 if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= 1417 PTE1_SIZE) { 1418 /* 1419 * Increase the starting virtual address so that its alignment 1420 * does not preclude the use of section mappings. 1421 */ 1422 if ((va & PTE1_OFFSET) < pte1_offset) 1423 va = pte1_trunc(va) + pte1_offset; 1424 else if ((va & PTE1_OFFSET) > pte1_offset) 1425 va = pte1_roundup(va) + pte1_offset; 1426 } 1427 sva = va; 1428 while (start < end) { 1429 if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { 1430 KASSERT((va & PTE1_OFFSET) == 0, 1431 ("%s: misaligned va %#x", __func__, va)); 1432 npte1 = PTE1_KERN(start, l1prot, l1attr); 1433 pmap_kenter_pte1(va, npte1); 1434 va += PTE1_SIZE; 1435 start += PTE1_SIZE; 1436 } else { 1437 pmap_kenter_prot_attr(va, start, l2prot, l2attr); 1438 va += PAGE_SIZE; 1439 start += PAGE_SIZE; 1440 } 1441 } 1442 tlb_flush_range(sva, va - sva); 1443 *virt = va; 1444 return (sva); 1445 } 1446 1447 /* 1448 * Make a temporary mapping for a physical address. 1449 * This is only intended to be used for panic dumps. 1450 */ 1451 void * 1452 pmap_kenter_temporary(vm_paddr_t pa, int i) 1453 { 1454 vm_offset_t va; 1455 1456 /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ 1457 1458 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 1459 pmap_kenter(va, pa); 1460 tlb_flush_local(va); 1461 return ((void *)crashdumpmap); 1462 } 1463 1464 /************************************* 1465 * 1466 * TLB & cache maintenance routines. 1467 * 1468 *************************************/ 1469 1470 /* 1471 * We inline these within pmap.c for speed. 
1472 */ 1473 PMAP_INLINE void 1474 pmap_tlb_flush(pmap_t pmap, vm_offset_t va) 1475 { 1476 1477 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1478 tlb_flush(va); 1479 } 1480 1481 PMAP_INLINE void 1482 pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) 1483 { 1484 1485 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1486 tlb_flush_range(sva, size); 1487 } 1488 1489 /* 1490 * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. 1491 * Requirements: 1492 * - Must deal with pages in order to ensure that none of the PTE2_* bits 1493 * are ever set, PTE2_V in particular. 1494 * - Assumes we can write to pte2s without pte2_store() atomic ops. 1495 * - Assumes nothing will ever test these addresses for 0 to indicate 1496 * no mapping instead of correctly checking PTE2_V. 1497 * - Assumes a vm_offset_t will fit in a pte2 (true for arm). 1498 * Because PTE2_V is never set, there can be no mappings to invalidate. 1499 */ 1500 static vm_offset_t 1501 pmap_pte2list_alloc(vm_offset_t *head) 1502 { 1503 pt2_entry_t *pte2p; 1504 vm_offset_t va; 1505 1506 va = *head; 1507 if (va == 0) 1508 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 1509 pte2p = pt2map_entry(va); 1510 *head = *pte2p; 1511 if (*head & PTE2_V) 1512 panic("%s: va with PTE2_V set!", __func__); 1513 *pte2p = 0; 1514 return (va); 1515 } 1516 1517 static void 1518 pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) 1519 { 1520 pt2_entry_t *pte2p; 1521 1522 if (va & PTE2_V) 1523 panic("%s: freeing va with PTE2_V set!", __func__); 1524 pte2p = pt2map_entry(va); 1525 *pte2p = *head; /* virtual! 
PTE2_V is 0 though */ 1526 *head = va; 1527 } 1528 1529 static void 1530 pmap_pte2list_init(vm_offset_t *head, void *base, int npages) 1531 { 1532 int i; 1533 vm_offset_t va; 1534 1535 *head = 0; 1536 for (i = npages - 1; i >= 0; i--) { 1537 va = (vm_offset_t)base + i * PAGE_SIZE; 1538 pmap_pte2list_free(head, va); 1539 } 1540 } 1541 1542 /***************************************************************************** 1543 * 1544 * PMAP third and final stage initialization. 1545 * 1546 * After pmap_init() is called, PMAP subsystem is fully initialized. 1547 * 1548 *****************************************************************************/ 1549 1550 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1551 "VM/pmap parameters"); 1552 1553 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 1554 "Max number of PV entries"); 1555 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 1556 "Page share factor per proc"); 1557 1558 static u_long nkpt2pg = NKPT2PG; 1559 SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, 1560 &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); 1561 1562 static int sp_enabled = 1; 1563 SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 1564 &sp_enabled, 0, "Are large page mappings enabled?"); 1565 1566 bool 1567 pmap_ps_enabled(pmap_t pmap __unused) 1568 { 1569 1570 return (sp_enabled != 0); 1571 } 1572 1573 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1574 "1MB page mapping counters"); 1575 1576 static u_long pmap_pte1_demotions; 1577 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, 1578 &pmap_pte1_demotions, 0, "1MB page demotions"); 1579 1580 static u_long pmap_pte1_mappings; 1581 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, 1582 &pmap_pte1_mappings, 0, "1MB page mappings"); 1583 1584 static u_long pmap_pte1_p_failures; 1585 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, 1586 
&pmap_pte1_p_failures, 0, "1MB page promotion failures"); 1587 1588 static u_long pmap_pte1_promotions; 1589 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, 1590 &pmap_pte1_promotions, 0, "1MB page promotions"); 1591 1592 static u_long pmap_pte1_kern_demotions; 1593 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, 1594 &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); 1595 1596 static u_long pmap_pte1_kern_promotions; 1597 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, 1598 &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); 1599 1600 static __inline ttb_entry_t 1601 pmap_ttb_get(pmap_t pmap) 1602 { 1603 1604 return (vtophys(pmap->pm_pt1) | ttb_flags); 1605 } 1606 1607 /* 1608 * Initialize a vm_page's machine-dependent fields. 1609 * 1610 * Variations: 1611 * 1. Pages for L2 page tables are always not managed. So, pv_list and 1612 * pt2_wirecount can share same physical space. However, proper 1613 * initialization on a page alloc for page tables and reinitialization 1614 * on the page free must be ensured. 1615 */ 1616 void 1617 pmap_page_init(vm_page_t m) 1618 { 1619 1620 TAILQ_INIT(&m->md.pv_list); 1621 pt2_wirecount_init(m); 1622 m->md.pat_mode = VM_MEMATTR_DEFAULT; 1623 } 1624 1625 /* 1626 * Virtualization for faster way how to zero whole page. 1627 */ 1628 static __inline void 1629 pagezero(void *page) 1630 { 1631 1632 bzero(page, PAGE_SIZE); 1633 } 1634 1635 /* 1636 * Zero L2 page table page. 1637 * Use same KVA as in pmap_zero_page(). 1638 */ 1639 static __inline vm_paddr_t 1640 pmap_pt2pg_zero(vm_page_t m) 1641 { 1642 pt2_entry_t *cmap2_pte2p; 1643 vm_paddr_t pa; 1644 struct pcpu *pc; 1645 1646 pa = VM_PAGE_TO_PHYS(m); 1647 1648 /* 1649 * XXX: For now, we map whole page even if it's already zero, 1650 * to sync it even if the sync is only DSB. 
1651 */ 1652 sched_pin(); 1653 pc = get_pcpu(); 1654 cmap2_pte2p = pc->pc_cmap2_pte2p; 1655 mtx_lock(&pc->pc_cmap_lock); 1656 if (pte2_load(cmap2_pte2p) != 0) 1657 panic("%s: CMAP2 busy", __func__); 1658 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 1659 vm_page_pte2_attr(m))); 1660 /* Even VM_ALLOC_ZERO request is only advisory. */ 1661 if ((m->flags & PG_ZERO) == 0) 1662 pagezero(pc->pc_cmap2_addr); 1663 pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); 1664 pte2_clear(cmap2_pte2p); 1665 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 1666 1667 /* 1668 * Unpin the thread before releasing the lock. Otherwise the thread 1669 * could be rescheduled while still bound to the current CPU, only 1670 * to unpin itself immediately upon resuming execution. 1671 */ 1672 sched_unpin(); 1673 mtx_unlock(&pc->pc_cmap_lock); 1674 1675 return (pa); 1676 } 1677 1678 /* 1679 * Init just allocated page as L2 page table(s) holder 1680 * and return its physical address. 1681 */ 1682 static __inline vm_paddr_t 1683 pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) 1684 { 1685 vm_paddr_t pa; 1686 pt2_entry_t *pte2p; 1687 1688 /* Check page attributes. */ 1689 if (m->md.pat_mode != pt_memattr) 1690 pmap_page_set_memattr(m, pt_memattr); 1691 1692 /* Zero page and init wire counts. */ 1693 pa = pmap_pt2pg_zero(m); 1694 pt2_wirecount_init(m); 1695 1696 /* 1697 * Map page to PT2MAP address space for given pmap. 1698 * Note that PT2MAP space is shared with all pmaps. 1699 */ 1700 if (pmap == kernel_pmap) 1701 pmap_kenter_pt2tab(va, PTE2_KPT(pa)); 1702 else { 1703 pte2p = pmap_pt2tab_entry(pmap, va); 1704 pt2tab_store(pte2p, PTE2_KPT_NG(pa)); 1705 } 1706 1707 return (pa); 1708 } 1709 1710 /* 1711 * Initialize the pmap module. 1712 * Called by vm_init, to initialize any structures that the pmap 1713 * system needs to map virtual memory. 
1714 */ 1715 void 1716 pmap_init(void) 1717 { 1718 vm_size_t s; 1719 pt2_entry_t *pte2p, pte2; 1720 u_int i, pte1_idx, pv_npg; 1721 1722 PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); 1723 1724 /* 1725 * Initialize the vm page array entries for kernel pmap's 1726 * L2 page table pages allocated in advance. 1727 */ 1728 pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); 1729 pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); 1730 for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { 1731 vm_paddr_t pa; 1732 vm_page_t m; 1733 1734 pte2 = pte2_load(pte2p); 1735 KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); 1736 1737 pa = pte2_pa(pte2); 1738 m = PHYS_TO_VM_PAGE(pa); 1739 KASSERT(m >= vm_page_array && 1740 m < &vm_page_array[vm_page_array_size], 1741 ("%s: L2 page table page is out of range", __func__)); 1742 1743 m->pindex = pte1_idx; 1744 m->phys_addr = pa; 1745 pte1_idx += NPT2_IN_PG; 1746 } 1747 1748 /* 1749 * Initialize the address space (zone) for the pv entries. Set a 1750 * high water mark so that the system can recover from excessive 1751 * numbers of pv entries. 1752 */ 1753 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1754 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 1755 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1756 pv_entry_max = roundup(pv_entry_max, _NPCPV); 1757 pv_entry_high_water = 9 * (pv_entry_max / 10); 1758 1759 /* 1760 * Are large page mappings enabled? 1761 */ 1762 TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); 1763 if (sp_enabled) { 1764 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1765 ("%s: can't assign to pagesizes[1]", __func__)); 1766 pagesizes[1] = PTE1_SIZE; 1767 } 1768 1769 /* 1770 * Calculate the size of the pv head table for sections. 1771 * Handle the possibility that "vm_phys_segs[...].end" is zero. 1772 * Note that the table is only for sections which could be promoted. 
1773 */ 1774 first_managed_pa = pte1_trunc(vm_phys_segs[0].start); 1775 pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) 1776 - first_managed_pa) / PTE1_SIZE + 1; 1777 1778 /* 1779 * Allocate memory for the pv head table for sections. 1780 */ 1781 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1782 s = round_page(s); 1783 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 1784 for (i = 0; i < pv_npg; i++) 1785 TAILQ_INIT(&pv_table[i].pv_list); 1786 1787 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 1788 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 1789 if (pv_chunkbase == NULL) 1790 panic("%s: not enough kvm for pv chunks", __func__); 1791 pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 1792 } 1793 1794 /* 1795 * Add a list of wired pages to the kva 1796 * this routine is only used for temporary 1797 * kernel mappings that do not need to have 1798 * page modification or references recorded. 1799 * Note that old mappings are simply written 1800 * over. The page *must* be wired. 1801 * Note: SMP coherent. Uses a ranged shootdown IPI. 1802 */ 1803 void 1804 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1805 { 1806 u_int anychanged; 1807 pt2_entry_t *epte2p, *pte2p, pte2; 1808 vm_page_t m; 1809 vm_paddr_t pa; 1810 1811 anychanged = 0; 1812 pte2p = pt2map_entry(sva); 1813 epte2p = pte2p + count; 1814 while (pte2p < epte2p) { 1815 m = *ma++; 1816 pa = VM_PAGE_TO_PHYS(m); 1817 pte2 = pte2_load(pte2p); 1818 if ((pte2_pa(pte2) != pa) || 1819 (pte2_attr(pte2) != vm_page_pte2_attr(m))) { 1820 anychanged++; 1821 pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, 1822 vm_page_pte2_attr(m))); 1823 } 1824 pte2p++; 1825 } 1826 if (__predict_false(anychanged)) 1827 tlb_flush_range(sva, count * PAGE_SIZE); 1828 } 1829 1830 /* 1831 * This routine tears out page mappings from the 1832 * kernel -- it is meant only for temporary mappings. 1833 * Note: SMP coherent. Uses a ranged shootdown IPI. 
1834 */ 1835 void 1836 pmap_qremove(vm_offset_t sva, int count) 1837 { 1838 vm_offset_t va; 1839 1840 va = sva; 1841 while (count-- > 0) { 1842 pmap_kremove(va); 1843 va += PAGE_SIZE; 1844 } 1845 tlb_flush_range(sva, va - sva); 1846 } 1847 1848 /* 1849 * Are we current address space or kernel? 1850 */ 1851 static __inline int 1852 pmap_is_current(pmap_t pmap) 1853 { 1854 1855 return (pmap == kernel_pmap || 1856 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); 1857 } 1858 1859 /* 1860 * If the given pmap is not the current or kernel pmap, the returned 1861 * pte2 must be released by passing it to pmap_pte2_release(). 1862 */ 1863 static pt2_entry_t * 1864 pmap_pte2(pmap_t pmap, vm_offset_t va) 1865 { 1866 pt1_entry_t pte1; 1867 vm_paddr_t pt2pg_pa; 1868 1869 pte1 = pte1_load(pmap_pte1(pmap, va)); 1870 if (pte1_is_section(pte1)) 1871 panic("%s: attempt to map PTE1", __func__); 1872 if (pte1_is_link(pte1)) { 1873 /* Are we current address space or kernel? */ 1874 if (pmap_is_current(pmap)) 1875 return (pt2map_entry(va)); 1876 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1877 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1878 mtx_lock(&PMAP2mutex); 1879 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 1880 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 1881 tlb_flush((vm_offset_t)PADDR2); 1882 } 1883 return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1884 } 1885 return (NULL); 1886 } 1887 1888 /* 1889 * Releases a pte2 that was obtained from pmap_pte2(). 1890 * Be prepared for the pte2p being NULL. 1891 */ 1892 static __inline void 1893 pmap_pte2_release(pt2_entry_t *pte2p) 1894 { 1895 1896 if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { 1897 mtx_unlock(&PMAP2mutex); 1898 } 1899 } 1900 1901 /* 1902 * Super fast pmap_pte2 routine best used when scanning 1903 * the pv lists. This eliminates many coarse-grained 1904 * invltlb calls. Note that many of the pv list 1905 * scans are across different pmaps. 
It is very wasteful 1906 * to do an entire tlb flush for checking a single mapping. 1907 * 1908 * If the given pmap is not the current pmap, pvh_global_lock 1909 * must be held and curthread pinned to a CPU. 1910 */ 1911 static pt2_entry_t * 1912 pmap_pte2_quick(pmap_t pmap, vm_offset_t va) 1913 { 1914 pt1_entry_t pte1; 1915 vm_paddr_t pt2pg_pa; 1916 1917 pte1 = pte1_load(pmap_pte1(pmap, va)); 1918 if (pte1_is_section(pte1)) 1919 panic("%s: attempt to map PTE1", __func__); 1920 if (pte1_is_link(pte1)) { 1921 /* Are we current address space or kernel? */ 1922 if (pmap_is_current(pmap)) 1923 return (pt2map_entry(va)); 1924 rw_assert(&pvh_global_lock, RA_WLOCKED); 1925 KASSERT(curthread->td_pinned > 0, 1926 ("%s: curthread not pinned", __func__)); 1927 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1928 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1929 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 1930 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 1931 #ifdef SMP 1932 PMAP1cpu = PCPU_GET(cpuid); 1933 #endif 1934 tlb_flush_local((vm_offset_t)PADDR1); 1935 PMAP1changed++; 1936 } else 1937 #ifdef SMP 1938 if (PMAP1cpu != PCPU_GET(cpuid)) { 1939 PMAP1cpu = PCPU_GET(cpuid); 1940 tlb_flush_local((vm_offset_t)PADDR1); 1941 PMAP1changedcpu++; 1942 } else 1943 #endif 1944 PMAP1unchanged++; 1945 return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1946 } 1947 return (NULL); 1948 } 1949 1950 /* 1951 * Routine: pmap_extract 1952 * Function: 1953 * Extract the physical page address associated 1954 * with the given map/virtual_address pair. 
1955 */ 1956 vm_paddr_t 1957 pmap_extract(pmap_t pmap, vm_offset_t va) 1958 { 1959 vm_paddr_t pa; 1960 pt1_entry_t pte1; 1961 pt2_entry_t *pte2p; 1962 1963 PMAP_LOCK(pmap); 1964 pte1 = pte1_load(pmap_pte1(pmap, va)); 1965 if (pte1_is_section(pte1)) 1966 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1967 else if (pte1_is_link(pte1)) { 1968 pte2p = pmap_pte2(pmap, va); 1969 pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); 1970 pmap_pte2_release(pte2p); 1971 } else 1972 pa = 0; 1973 PMAP_UNLOCK(pmap); 1974 return (pa); 1975 } 1976 1977 /* 1978 * Routine: pmap_extract_and_hold 1979 * Function: 1980 * Atomically extract and hold the physical page 1981 * with the given pmap and virtual address pair 1982 * if that mapping permits the given protection. 1983 */ 1984 vm_page_t 1985 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1986 { 1987 vm_paddr_t pa; 1988 pt1_entry_t pte1; 1989 pt2_entry_t pte2, *pte2p; 1990 vm_page_t m; 1991 1992 m = NULL; 1993 PMAP_LOCK(pmap); 1994 pte1 = pte1_load(pmap_pte1(pmap, va)); 1995 if (pte1_is_section(pte1)) { 1996 if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { 1997 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1998 m = PHYS_TO_VM_PAGE(pa); 1999 if (!vm_page_wire_mapped(m)) 2000 m = NULL; 2001 } 2002 } else if (pte1_is_link(pte1)) { 2003 pte2p = pmap_pte2(pmap, va); 2004 pte2 = pte2_load(pte2p); 2005 pmap_pte2_release(pte2p); 2006 if (pte2_is_valid(pte2) && 2007 (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { 2008 pa = pte2_pa(pte2); 2009 m = PHYS_TO_VM_PAGE(pa); 2010 if (!vm_page_wire_mapped(m)) 2011 m = NULL; 2012 } 2013 } 2014 PMAP_UNLOCK(pmap); 2015 return (m); 2016 } 2017 2018 /* 2019 * Grow the number of kernel L2 page table entries, if needed. 
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_page_t m;
	vm_paddr_t pt2pg_pa, pt2_pa;
	pt1_entry_t pte1;
	pt2_entry_t pte2;

	PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr));
	/*
	 * All the time kernel_vm_end is first KVA for which underlying
	 * L2 page table is either not allocated or linked from L1 page table
	 * (not considering sections). Except for two possible cases:
	 *
	 *   (1) in the very beginning as long as pmap_growkernel() was
	 *       not called, it could be first unused KVA (which is not
	 *       rounded up to PTE1_SIZE),
	 *
	 *   (2) when all KVA space is mapped and vm_map_max(kernel_map)
	 *       address is not rounded up to PTE1_SIZE. (For example,
	 *       it could be 0xFFFFFFFF.)
	 */
	kernel_vm_end = pte1_roundup(kernel_vm_end);
	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	addr = roundup2(addr, PTE1_SIZE);
	/*
	 * NOTE(review): comparing "addr - 1" rather than "addr" looks
	 * intended to also catch a rounded-up address that wrapped to 0
	 * -- confirm.
	 */
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		pte1 = pte1_load(kern_pte1(kernel_vm_end));
		if (pte1_is_valid(pte1)) {
			/* This 1 MB slot is already backed; skip it. */
			kernel_vm_end += PTE1_SIZE;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		/*
		 * kernel_vm_end_new is used in pmap_pinit() when kernel
		 * mappings are entered to new pmap all at once to avoid race
		 * between pmap_kenter_pte1() and kernel_vm_end increase.
		 * The same applies to pmap_kenter_pt2tab().
		 */
		kernel_vm_end_new = kernel_vm_end + PTE1_SIZE;

		pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end));
		if (!pte2_is_valid(pte2)) {
			/*
			 * Install new PT2s page into kernel PT2TAB.
			 */
			m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (m == NULL)
				panic("%s: no memory to grow kernel", __func__);
			m->pindex = pte1_index(kernel_vm_end) & ~PT2PG_MASK;

			/*
			 * QQQ: To link all new L2 page tables from L1 page
			 *      table now and so pmap_kenter_pte1() them
			 *      at once together with pmap_kenter_pt2tab()
			 *      could be nice speed up. However,
			 *      pmap_growkernel() does not happen so often...
			 * QQQ: The other TTBR is another option.
			 */
			pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end,
			    m);
		} else
			pt2pg_pa = pte2_pa(pte2);

		/* Link the L2 table for this slot from the kernel L1 table. */
		pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end));
		pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa));

		kernel_vm_end = kernel_vm_end_new;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
}

/*
 * Sysctl handler for vm.kvm_size: reports
 * vm_max_kernel_address - KERNBASE.
 */
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = vm_max_kernel_address - KERNBASE;

	return (sysctl_handle_long(oidp, &ksize, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size,
    CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_size, "IU",
    "Size of KVM");

/*
 * Sysctl handler for vm.kvm_free: reports
 * vm_max_kernel_address - kernel_vm_end.
 */
static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = vm_max_kernel_address - kernel_vm_end;

	return (sysctl_handle_long(oidp, &kfree, 0, req));
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free,
    CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_free, "IU",
    "Amount of KVM free");

/***********************************************
 *
 *  Pmap allocation/deallocation routines.
 *
 ***********************************************/

/*
 *	Initialize the pmap for the swapper process.
 */
void
pmap_pinit0(pmap_t pmap)
{
	PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap));

	PMAP_LOCK_INIT(pmap);

	/*
	 * Kernel page table directory and pmap stuff around is already
	 * initialized, we are using it right now and here. So, finish
	 * only PMAP structures initialization for process0 ...
	 *
	 * Since the L1 page table and PT2TAB is shared with the kernel pmap,
	 * which is already included in the list "allpmaps", this pmap does
	 * not need to be inserted into that list.
	 */
	pmap->pm_pt1 = kern_pt1;
	pmap->pm_pt2tab = kern_pt2tab;
	CPU_ZERO(&pmap->pm_active);
	PCPU_SET(curpmap, pmap);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	CPU_SET(0, &pmap->pm_active);
}

/*
 * Copy the L1 entries covering <sva, eva> (both ends inclusive) from
 * the source L1 table to the same indices of the destination L1 table.
 * Only a plain bcopy() is done here; no sync is performed.
 */
static __inline void
pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva,
    vm_offset_t eva)
{
	u_int idx, count;

	idx = pte1_index(sva);
	/* "count" is a byte count, not a number of entries. */
	count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t);
	bcopy(spte1p + idx, dpte1p + idx, count);
}

/*
 * Copy the PT2TAB entries covering <sva, eva> (both ends inclusive) from
 * the source PT2TAB to the same indices of the destination PT2TAB.
 * Only a plain bcopy() is done here; no sync is performed.
 */
static __inline void
pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva,
    vm_offset_t eva)
{
	u_int idx, count;

	idx = pt2tab_index(sva);
	/* "count" is a byte count, not a number of entries. */
	count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t);
	bcopy(spte2p + idx, dpte2p + idx, count);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 *
 * Returns 1 on success and 0 when the L1 page table or PT2TAB could
 * not be allocated.
 */
int
pmap_pinit(pmap_t pmap)
{
	pt1_entry_t *pte1p;
	pt2_entry_t *pte2p;
	vm_paddr_t pa, pt2tab_pa;
	u_int i;

	PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap,
	    pmap->pm_pt1));

	/*
	 * No need to allocate L2 page table space yet but we do need
	 * a valid L1 page table and PT2TAB table.
	 *
	 * Install shared kernel mappings to these tables. It's a little
	 * tricky as some parts of KVA are reserved for vectors, devices,
	 * and whatever else. These parts are supposed to be above
	 * vm_max_kernel_address. Thus two regions should be installed:
	 *
	 *   (1) <KERNBASE, kernel_vm_end),
	 *   (2) <vm_max_kernel_address, 0xFFFFFFFF>.
	 *
	 * QQQ: The second region should be stable enough to be installed
	 *      only once in time when the tables are allocated.
	 * QQQ: Maybe copy of both regions at once could be faster ...
	 * QQQ: Maybe the other TTBR is an option.
	 *
	 * Finally, install own PT2TAB table to these tables.
	 */

	/* Tables left over from a previous use of this pmap are reused. */
	if (pmap->pm_pt1 == NULL) {
		pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(NB_IN_PT1,
		    M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr);
		if (pmap->pm_pt1 == NULL)
			return (0);
	}
	if (pmap->pm_pt2tab == NULL) {
		/*
		 * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page
		 *      only, what should be the only size for 32 bit systems,
		 *      then we could allocate it with vm_page_alloc() and all
		 *      the stuff needed as other L2 page table pages.
		 *      (2) Note that a process PT2TAB is special L2 page table
		 *      page. Its mapping in kernel_arena is permanent and can
		 *      be used no matter which process is current. Its mapping
		 *      in PT2MAP can be used only for current process.
		 */
		pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(NB_IN_PT2TAB,
		    M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr);
		if (pmap->pm_pt2tab == NULL) {
			/*
			 * QQQ: As struct pmap is allocated from UMA with
			 *      UMA_ZONE_NOFREE flag, it's important to leave
			 *      no allocation in pmap if initialization failed.
			 */
			kmem_free((vm_offset_t)pmap->pm_pt1, NB_IN_PT1);
			pmap->pm_pt1 = NULL;
			return (0);
		}
		/*
		 * QQQ: Each L2 page table page vm_page_t has pindex set to
		 *      pte1 index of virtual address mapped by this page.
		 *      It's not valid for non kernel PT2TABs themselves.
		 *      The pindex of these pages can not be altered because
		 *      of the way how they are allocated now. However, it
		 *      should not be a problem.
		 */
	}

	mtx_lock_spin(&allpmaps_lock);
	/*
	 * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(),
	 * kernel_vm_end_new is used here instead of kernel_vm_end.
	 */
	pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE,
	    kernel_vm_end_new - 1);
	pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address,
	    0xFFFFFFFF);
	pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE,
	    kernel_vm_end_new - 1);
	pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address,
	    0xFFFFFFFF);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	/*
	 * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself.
	 * I.e. self reference mapping.  The PT2TAB is private, however mapped
	 * into shared PT2MAP space, so the mapping should be not global.
	 */
	pt2tab_pa = vtophys(pmap->pm_pt2tab);
	pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) {
		pt2tab_store(pte2p++, PTE2_KPT_NG(pa));
	}

	/* Insert PT2MAP PT2s into pmap PT1. */
	pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) {
		pte1_store(pte1p++, PTE1_LINK(pa));
	}

	/*
	 * Now synchronize new mapping which was made above.
	 */
	pte1_sync_range(pmap->pm_pt1, NB_IN_PT1);
	pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB);

	CPU_ZERO(&pmap->pm_active);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);

	return (1);
}

#ifdef INVARIANTS
/*
 * Check that every PT2TAB entry below pt2tab_index(VM_MAXUSER_ADDRESS)
 * is zero. Used only by assertions.
 */
static boolean_t
pt2tab_user_is_empty(pt2_entry_t *tab)
{
	u_int i, end;

	end = pt2tab_index(VM_MAXUSER_ADDRESS);
	for (i = 0; i < end; i++)
		if (tab[i] != 0) return (FALSE);
	return (TRUE);
}
#endif
/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
#ifdef INVARIANTS
	vm_offset_t start, end;
#endif
	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("%s: pmap resident count %ld != 0", __func__,
	    pmap->pm_stats.resident_count));
	KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab),
	    ("%s: has allocated user PT2(s)", __func__));
	KASSERT(CPU_EMPTY(&pmap->pm_active),
	    ("%s: pmap %p is active on some CPU(s)", __func__, pmap));

	mtx_lock_spin(&allpmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

#ifdef INVARIANTS
	/*
	 * Debug kernels wipe the entries for <KERNBASE, 0xFFFFFFFF> from
	 * both tables of the released pmap.
	 */
	start = pte1_index(KERNBASE) * sizeof(pt1_entry_t);
	end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t);
	bzero((char *)pmap->pm_pt1 + start, end - start);

	start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t);
	end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t);
	bzero((char *)pmap->pm_pt2tab + start, end - start);
#endif
	/*
	 * We are leaving PT1 and PT2TAB allocated on released pmap,
	 * so hopefully UMA vmspace_zone will always be inited with
	 * UMA_ZONE_NOFREE flag.
	 */
}

/*********************************************************
 *
 *  L2 table pages and their pages management routines.
 *
 *********************************************************/

/*
 *  Virtual interface for L2 page table wire counting.
 *
 *  Each L2 page table in a page has own counter which counts a number of
 *  valid mappings in a table. Global page counter counts mappings in all
 *  tables in a page plus a single itself mapping in PT2TAB.
 *
 *  During a promotion we leave the associated L2 page table counter
 *  untouched, so the table (strictly speaking a page which holds it)
 *  is never freed if promoted.
 *
 *  If a page m->ref_count == 1 then no valid mappings exist in any L2 page
 *  table in the page and the page itself is only mapped in PT2TAB.
 */

/* Zero the per-table wire counters of an L2 page table page. */
static __inline void
pt2_wirecount_init(vm_page_t m)
{
	u_int i;

	/*
	 * Note: A page m is allocated with VM_ALLOC_WIRED flag and
	 *       m->ref_count should be already set correctly.
	 *       So, there is no need to set it again herein.
	 */
	for (i = 0; i < NPT2_IN_PG; i++)
		m->md.pt2_wirecount[i] = 0;
}

/*
 * Add one reference to both the page counter and the counter of the
 * L2 page table selected by pte1_idx.
 */
static __inline void
pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx)
{

	/*
	 * Note: A just modified pte2 (i.e. already allocated)
	 *       is acquiring one extra reference which must be
	 *       explicitly cleared. It influences the KASSERTs herein.
	 *       All L2 page tables in a page always belong to the same
	 *       pmap, so we allow only one extra reference for the page.
	 */
	KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1),
	    ("%s: PT2 is overflowing ...", __func__));
	KASSERT(m->ref_count <= (NPTE2_IN_PG + 1),
	    ("%s: PT2PG is overflowing ...", __func__));

	m->ref_count++;
	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++;
}

/*
 * Drop one reference from both the page counter and the counter of the
 * L2 page table selected by pte1_idx.
 */
static __inline void
pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx)
{

	KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0,
	    ("%s: PT2 is underflowing ...", __func__));
	KASSERT(m->ref_count > 1,
	    ("%s: PT2PG is underflowing ...", __func__));

	m->ref_count--;
	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--;
}

/*
 * Set the counter of the L2 page table selected by pte1_idx to "count"
 * and adjust the page counter by the same difference.
 */
static __inline void
pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count)
{

	KASSERT(count <= NPTE2_IN_PT2,
	    ("%s: invalid count %u", __func__, count));
	KASSERT(m->ref_count >  m->md.pt2_wirecount[pte1_idx & PT2PG_MASK],
	    ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->ref_count,
	    m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]));

	m->ref_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK];
	m->ref_count += count;
	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count;

	KASSERT(m->ref_count <= (NPTE2_IN_PG + 1),
	    ("%s: PT2PG is overflowed (%u) ...", __func__, m->ref_count));
}

/* Return the counter of the L2 page table selected by pte1_idx. */
static __inline uint32_t
pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx)
{

	return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]);
}

/* Is the counter of the L2 page table covering va zero? */
static __inline boolean_t
pt2_is_empty(vm_page_t m, vm_offset_t va)
{

	return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0);
}

/* Does the counter of the L2 page table covering va equal NPTE2_IN_PT2? */
static __inline boolean_t
pt2_is_full(vm_page_t m, vm_offset_t va)
{

	return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] ==
	    NPTE2_IN_PT2);
}

/* Is the page counter down to its single PT2TAB reference? */
static __inline boolean_t
pt2pg_is_empty(vm_page_t m)
{

	return (m->ref_count == 1);
}

/*
 *  This routine is called if the L2 page table
 *  is not mapped correctly.
 *
 *  Returns the L2 page table page, or NULL when a new page was needed
 *  and could not be allocated. Unless PMAP_ENTER_NOSLEEP is set, the
 *  pmap lock and pvh_global_lock are dropped around vm_wait() before
 *  NULL is returned.
 */
static vm_page_t
_pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags)
{
	uint32_t pte1_idx;
	pt1_entry_t *pte1p;
	pt2_entry_t pte2;
	vm_page_t m;
	vm_paddr_t pt2pg_pa, pt2_pa;

	pte1_idx = pte1_index(va);
	pte1p = pmap->pm_pt1 + pte1_idx;

	KASSERT(pte1_load(pte1p) == 0,
	    ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx,
	    pte1_load(pte1p)));

	pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va));
	if (!pte2_is_valid(pte2)) {
		/*
		 * Install new PT2s page into pmap PT2TAB.
		 */
		m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
		if (m == NULL) {
			if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
				PMAP_UNLOCK(pmap);
				rw_wunlock(&pvh_global_lock);
				vm_wait(NULL);
				rw_wlock(&pvh_global_lock);
				PMAP_LOCK(pmap);
			}

			/*
			 * Indicate the need to retry.  While waiting,
			 * the L2 page table page may have been allocated.
			 */
			return (NULL);
		}
		m->pindex = pte1_idx & ~PT2PG_MASK;
		pmap->pm_stats.resident_count++;
		pt2pg_pa = pmap_pt2pg_init(pmap, va, m);
	} else {
		/* The page holding this L2 table is already in PT2TAB. */
		pt2pg_pa = pte2_pa(pte2);
		m = PHYS_TO_VM_PAGE(pt2pg_pa);
	}

	pt2_wirecount_inc(m, pte1_idx);
	pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx);
	pte1_store(pte1p, PTE1_LINK(pt2_pa));

	return (m);
}

/*
 * Return the L2 page table page for va with one more reference taken,
 * demoting a section mapping and allocating the table when needed.
 * Returns NULL only when allocation failed and PMAP_ENTER_NOSLEEP is
 * set; otherwise the lookup is retried.
 */
static vm_page_t
pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags)
{
	u_int pte1_idx;
	pt1_entry_t *pte1p, pte1;
	vm_page_t m;

	pte1_idx = pte1_index(va);
retry:
	pte1p = pmap->pm_pt1 + pte1_idx;
	pte1 = pte1_load(pte1p);

	/*
	 * This supports switching from a 1MB page to a
	 * normal 4K page.
	 */
	if (pte1_is_section(pte1)) {
		(void)pmap_demote_pte1(pmap, pte1p, va);
		/*
		 * Reload pte1 after demotion.
		 *
		 * Note: Demotion can even fail as either PT2 is not found for
		 *       the virtual address or PT2PG can not be allocated.
		 */
		pte1 = pte1_load(pte1p);
	}

	/*
	 * If the L2 page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (pte1_is_link(pte1)) {
		m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
		pt2_wirecount_inc(m, pte1_idx);
	} else {
		/*
		 * Here if the PT2 isn't mapped, or if it has
		 * been deallocated.
		 */
		m = _pmap_allocpte2(pmap, va, flags);
		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
			goto retry;
	}

	return (m);
}

/*
 * Schedule the specified unused L2 page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free)
{

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
#ifdef PMAP_DEBUG
	pmap_zero_page_check(m);
#endif
	m->flags |= PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Unwire L2 page tables page.
 *
 * The page must hold no valid mappings (see the KASSERTs). All L1 links
 * to the tables in the page and its PT2TAB entry are removed, the page
 * counter is reset to 0 and the pmap resident count is decremented.
 */
static void
pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pt1_entry_t *pte1p, opte1 __unused;
	pt2_entry_t *pte2p;
	uint32_t i;

	KASSERT(pt2pg_is_empty(m),
	    ("%s: pmap %p PT2PG %p wired", __func__, pmap, m));

	/*
	 * Unmap all L2 page tables in the page from L1 page table.
	 *
	 * QQQ: Individual L2 page tables (except the last one) can be unmapped
	 * earlier. However, we are doing that this way.
	 */
	KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK),
	    ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m));
	pte1p = pmap->pm_pt1 + m->pindex;
	for (i = 0; i < NPT2_IN_PG; i++, pte1p++) {
		KASSERT(m->md.pt2_wirecount[i] == 0,
		    ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m));
		opte1 = pte1_load(pte1p);
		if (pte1_is_link(opte1)) {
			pte1_clear(pte1p);
			/*
			 * Flush intermediate TLB cache.
			 */
			pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT);
		}
#ifdef INVARIANTS
		else
			KASSERT((opte1 == 0) || pte1_is_section(opte1),
			    ("%s: pmap %p va %#x bad pte1 %x at %u", __func__,
			    pmap, va, opte1, i));
#endif
	}

	/*
	 * Unmap the page from PT2TAB.
	 */
	pte2p = pmap_pt2tab_entry(pmap, va);
	(void)pt2tab_load_clear(pte2p);
	pmap_tlb_flush(pmap, pt2map_pt2pg(va));

	m->ref_count = 0;
	pmap->pm_stats.resident_count--;

	/*
	 * This barrier is so that the ordinary store unmapping
	 * the L2 page table page is globally performed before TLB shoot-
	 * down is begun.
	 */
	wmb();
	vm_wire_sub(1);
}

/*
 * Decrements a L2 page table page's wire count, which is used to record the
 * number of valid page table entries within the page.  If the wire count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static __inline boolean_t
pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
	pt2_wirecount_dec(m, pte1_index(va));
	if (pt2pg_is_empty(m)) {
		/*
		 * QQQ: Wire count is zero, so whole page should be zero and
		 *      we can set PG_ZERO flag to it.
		 *      Note that when promotion is enabled, it takes some
		 *      more efforts. See pmap_unwire_pt2_all() below.
		 */
		pmap_unwire_pt2pg(pmap, va, m);
		pmap_add_delayed_free_list(m, free);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * Drop a L2 page table page's wire count at once, which is used to record
 * the number of valid L2 page table entries within the page. If the wire
 * count drops to zero, then the L2 page table page is unmapped.
 */
static __inline void
pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free)
{
	u_int pte1_idx = pte1_index(va);

	KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK),
		("%s: PT2 page's pindex is wrong", __func__));
	KASSERT(m->ref_count > pt2_wirecount_get(m, pte1_idx),
	    ("%s: bad pt2 wire count %u > %u", __func__, m->ref_count,
	    pt2_wirecount_get(m, pte1_idx)));

	/*
	 * It's possible that the L2 page table was never used.
	 * It happened in case that a section was created without promotion.
	 */
	if (pt2_is_full(m, va)) {
		pt2_wirecount_set(m, pte1_idx, 0);

		/*
		 * QQQ: We clear L2 page table now, so when L2 page table page
		 *      is going to be freed, we can set it PG_ZERO flag ...
		 *      This function is called only on section mappings, so
		 *      hopefully it's not too big an overhead.
		 *
		 * XXX: If pmap is current, existing PT2MAP mapping could be
		 *      used for zeroing.
		 */
		pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2);
	}
#ifdef INVARIANTS
	else
		KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)",
		    __func__, pt2_wirecount_get(m, pte1_idx)));
#endif
	if (pt2pg_is_empty(m)) {
		pmap_unwire_pt2pg(pmap, va, m);
		pmap_add_delayed_free_list(m, free);
	}
}

/*
 * After removing a L2 page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static boolean_t
pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free)
{
	pt1_entry_t pte1;
	vm_page_t mpte;

	/* Addresses at or above VM_MAXUSER_ADDRESS are not counted here. */
	if (va >= VM_MAXUSER_ADDRESS)
		return (FALSE);
	pte1 = pte1_load(pmap_pte1(pmap, va));
	mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
	return (pmap_unwire_pt2(pmap, va, mpte, free));
}

/*************************************
 *
 *  Page management routines.
 *
 *************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 11);
CTASSERT(_NPCPV == 336);

/*
 * Return the pv chunk holding the given pv entry. A chunk is exactly
 * one page in size (see the CTASSERT above), so masking off the page
 * offset of the entry's address yields the chunk address.
 */
static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */

/*
 * Bitmap value of a completely free chunk, one word per pc_map[] index:
 * ten full 32-bit words plus 16 bits, i.e. 336 entries (_NPCPV).
 */
static const uint32_t pc_freemask[_NPCM] = {
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE10
};

SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
    "Current number of pv entries");

#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
    "Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
    "Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail,
    0, "Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
    "Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs,
    0, "Current number of pv entry allocs");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
    "Current number of spare pv entries");
#endif

/*
 *  Is given page managed?
 *
 *  A physical address without a vm_page is reported as not managed.
 */
static __inline bool
is_managed(vm_paddr_t pa)
{
	vm_page_t m;

	m = PHYS_TO_VM_PAGE(pa);
	if (m == NULL)
		return (false);
	return ((m->oflags & VPO_UNMANAGED) == 0);
}

/* Is the page frame referenced by the given L1 entry managed? */
static __inline bool
pte1_is_managed(pt1_entry_t pte1)
{

	return (is_managed(pte1_pa(pte1)));
}

/* Is the page frame referenced by the given L2 entry managed? */
static __inline bool
pte2_is_managed(pt2_entry_t pte2)
{

	return (is_managed(pte2_pa(pte2)));
}

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
2823 */ 2824 static vm_page_t 2825 pmap_pv_reclaim(pmap_t locked_pmap) 2826 { 2827 struct pch newtail; 2828 struct pv_chunk *pc; 2829 struct md_page *pvh; 2830 pt1_entry_t *pte1p; 2831 pmap_t pmap; 2832 pt2_entry_t *pte2p, tpte2; 2833 pv_entry_t pv; 2834 vm_offset_t va; 2835 vm_page_t m, m_pc; 2836 struct spglist free; 2837 uint32_t inuse; 2838 int bit, field, freed; 2839 2840 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2841 pmap = NULL; 2842 m_pc = NULL; 2843 SLIST_INIT(&free); 2844 TAILQ_INIT(&newtail); 2845 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2846 SLIST_EMPTY(&free))) { 2847 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2848 if (pmap != pc->pc_pmap) { 2849 if (pmap != NULL) { 2850 if (pmap != locked_pmap) 2851 PMAP_UNLOCK(pmap); 2852 } 2853 pmap = pc->pc_pmap; 2854 /* Avoid deadlock and lock recursion. */ 2855 if (pmap > locked_pmap) 2856 PMAP_LOCK(pmap); 2857 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2858 pmap = NULL; 2859 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2860 continue; 2861 } 2862 } 2863 2864 /* 2865 * Destroy every non-wired, 4 KB page mapping in the chunk. 
2866 */ 2867 freed = 0; 2868 for (field = 0; field < _NPCM; field++) { 2869 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2870 inuse != 0; inuse &= ~(1UL << bit)) { 2871 bit = ffs(inuse) - 1; 2872 pv = &pc->pc_pventry[field * 32 + bit]; 2873 va = pv->pv_va; 2874 pte1p = pmap_pte1(pmap, va); 2875 if (pte1_is_section(pte1_load(pte1p))) 2876 continue; 2877 pte2p = pmap_pte2(pmap, va); 2878 tpte2 = pte2_load(pte2p); 2879 if ((tpte2 & PTE2_W) == 0) 2880 tpte2 = pte2_load_clear(pte2p); 2881 pmap_pte2_release(pte2p); 2882 if ((tpte2 & PTE2_W) != 0) 2883 continue; 2884 KASSERT(tpte2 != 0, 2885 ("pmap_pv_reclaim: pmap %p va %#x zero pte", 2886 pmap, va)); 2887 pmap_tlb_flush(pmap, va); 2888 m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); 2889 if (pte2_is_dirty(tpte2)) 2890 vm_page_dirty(m); 2891 if ((tpte2 & PTE2_A) != 0) 2892 vm_page_aflag_set(m, PGA_REFERENCED); 2893 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2894 if (TAILQ_EMPTY(&m->md.pv_list) && 2895 (m->flags & PG_FICTITIOUS) == 0) { 2896 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2897 if (TAILQ_EMPTY(&pvh->pv_list)) { 2898 vm_page_aflag_clear(m, 2899 PGA_WRITEABLE); 2900 } 2901 } 2902 pc->pc_map[field] |= 1UL << bit; 2903 pmap_unuse_pt2(pmap, va, &free); 2904 freed++; 2905 } 2906 } 2907 if (freed == 0) { 2908 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2909 continue; 2910 } 2911 /* Every freed mapping is for a 4 KB page. */ 2912 pmap->pm_stats.resident_count -= freed; 2913 PV_STAT(pv_entry_frees += freed); 2914 PV_STAT(pv_entry_spare += freed); 2915 pv_entry_count -= freed; 2916 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2917 for (field = 0; field < _NPCM; field++) 2918 if (pc->pc_map[field] != pc_freemask[field]) { 2919 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2920 pc_list); 2921 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2922 2923 /* 2924 * One freed pv entry in locked_pmap is 2925 * sufficient. 
2926 */ 2927 if (pmap == locked_pmap) 2928 goto out; 2929 break; 2930 } 2931 if (field == _NPCM) { 2932 PV_STAT(pv_entry_spare -= _NPCPV); 2933 PV_STAT(pc_chunk_count--); 2934 PV_STAT(pc_chunk_frees++); 2935 /* Entire chunk is free; return it. */ 2936 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2937 pmap_qremove((vm_offset_t)pc, 1); 2938 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2939 break; 2940 } 2941 } 2942 out: 2943 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2944 if (pmap != NULL) { 2945 if (pmap != locked_pmap) 2946 PMAP_UNLOCK(pmap); 2947 } 2948 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2949 m_pc = SLIST_FIRST(&free); 2950 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2951 /* Recycle a freed page table page. */ 2952 m_pc->ref_count = 1; 2953 vm_wire_add(1); 2954 } 2955 vm_page_free_pages_toq(&free, false); 2956 return (m_pc); 2957 } 2958 2959 static void 2960 free_pv_chunk(struct pv_chunk *pc) 2961 { 2962 vm_page_t m; 2963 2964 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2965 PV_STAT(pv_entry_spare -= _NPCPV); 2966 PV_STAT(pc_chunk_count--); 2967 PV_STAT(pc_chunk_frees++); 2968 /* entire chunk is free, return it */ 2969 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2970 pmap_qremove((vm_offset_t)pc, 1); 2971 vm_page_unwire_noq(m); 2972 vm_page_free(m); 2973 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2974 } 2975 2976 /* 2977 * Free the pv_entry back to the free list. 
2978 */ 2979 static void 2980 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2981 { 2982 struct pv_chunk *pc; 2983 int idx, field, bit; 2984 2985 rw_assert(&pvh_global_lock, RA_WLOCKED); 2986 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2987 PV_STAT(pv_entry_frees++); 2988 PV_STAT(pv_entry_spare++); 2989 pv_entry_count--; 2990 pc = pv_to_chunk(pv); 2991 idx = pv - &pc->pc_pventry[0]; 2992 field = idx / 32; 2993 bit = idx % 32; 2994 pc->pc_map[field] |= 1ul << bit; 2995 for (idx = 0; idx < _NPCM; idx++) 2996 if (pc->pc_map[idx] != pc_freemask[idx]) { 2997 /* 2998 * 98% of the time, pc is already at the head of the 2999 * list. If it isn't already, move it to the head. 3000 */ 3001 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 3002 pc)) { 3003 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3004 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 3005 pc_list); 3006 } 3007 return; 3008 } 3009 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3010 free_pv_chunk(pc); 3011 } 3012 3013 /* 3014 * Get a new pv_entry, allocating a block from the system 3015 * when needed. 
 *
 * The pvh global lock (write) and the pmap lock must be held; both are
 * asserted.  The chunk at the head of the pmap's chunk list is searched
 * first.  If it has no free slot, a new chunk page is allocated and mapped.
 * When neither chunk KVA nor a page is available, the function returns NULL
 * if "try" is set; otherwise it calls pmap_pv_reclaim() and, if that yields
 * no page, restarts the search.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, boolean_t try)
{
	/* Rate limit for the warning below: at most once per 60 seconds. */
	static const struct timeval printinterval = { 60, 0 };
	static struct timeval lastprint;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(pv_entry_allocs++);
	/* Counted up front; undone below if a "try" allocation fails. */
	pv_entry_count++;
	if (pv_entry_count > pv_entry_high_water)
		if (ratecheck(&lastprint, &printinterval))
			printf("Approaching the limit on PV entries, consider "
			    "increasing either the vm.pmap.shpgperproc or the "
			    "vm.pmap.pv_entries tunable.\n");
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		/* Look for the first map word with a set (free) bit. */
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffs(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 32 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			for (field = 0; field < _NPCM; field++)
				if (pc->pc_map[field] != 0) {
					PV_STAT(pv_entry_spare--);
					return (pv);	/* not full, return */
				}
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
			PV_STAT(pv_entry_spare--);
			return (pv);
		}
	}
	/*
	 * Access to the pte2list "pv_vafree" is synchronized by the pvh
	 * global lock.  If "pv_vafree" is currently non-empty, it will
	 * remain non-empty until pmap_pte2list_alloc() completes.
	 */
	if (pv_vafree == 0 ||
	    (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) {
		if (try) {
			pv_entry_count--;
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = pmap_pv_reclaim(pmap);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(pc_chunk_count++);
	PV_STAT(pc_chunk_allocs++);
	/* Take a KVA slot for the chunk and map the page there. */
	pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree);
	pmap_qenter((vm_offset_t)pc, &m, 1);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
	for (field = 1; field < _NPCM; field++)
		pc->pc_map[field] = pc_freemask[field];
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	/* Entry 0 is the one handed out (its bit was cleared above). */
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(pv_entry_spare += _NPCPV - 1);
	return (pv);
}

/*
 * Create a pv entry for page at pa for
 * (pmap, va).
 *
 * The entry is appended to the pv list of page "m".  get_pv_entry() is
 * called with "try" FALSE, in which case it has no NULL return path, so
 * the result is used unchecked.
 */
static void
pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pv = get_pv_entry(pmap, FALSE);
	pv->pv_va = va;
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
}

/*
 * Unlink the pv entry for (pmap, va) from the pv list of "pvh" and return
 * it.  The entry itself is not freed.  Callers check the result against
 * NULL to detect that no matching entry was on the list.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			break;
		}
	}
	return (pv);
}

/*
 * Unlink the pv entry for (pmap, va) from the pv list of "pvh" and free
 * it.  The entry is expected to exist (checked by KASSERT).
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

/*
 * Free the pv entry of the mapping (pmap, va) of page "m".  When this
 * leaves the page's own pv list empty, the page is not fictitious and the
 * pv list returned by pa_to_pvh() for its physical address is empty too,
 * PGA_WRITEABLE is cleared on the page.
 */
static void
pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
{
	struct md_page *pvh;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	pmap_pvh_free(&m->md, pmap, va);
	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		if (TAILQ_EMPTY(&pvh->pv_list))
			vm_page_aflag_clear(m, PGA_WRITEABLE);
	}
}

/*
 * Convert the pv bookkeeping of a 1MB mapping at (pmap, va), backed by the
 * 1MB-aligned physical address "pa", into per-4KB-page pv entries.
 */
static void
pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT((pa & PTE1_OFFSET) == 0,
	    ("pmap_pv_demote_pte1: pa is not 1mpage aligned"));

	/*
	 * Transfer the 1mpage's pv entry for this mapping to the first
	 * page's pv list.
	 */
	pvh = pa_to_pvh(pa);
	va = pte1_trunc(va);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	/* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */
	va_last = va + PTE1_SIZE - PAGE_SIZE;
	do {
		/* Step to the next page first; page 0 was handled above. */
		m++;
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("pmap_pv_demote_pte1: page %p is not managed", m));
		va += PAGE_SIZE;
		pmap_insert_entry(pmap, va, m);
	} while (va < va_last);
}

#if VM_NRESERVLEVEL > 0
/*
 * Convert the per-4KB-page pv entries of the 1MB region at (pmap, va),
 * backed by the 1MB-aligned physical address "pa", into a single pv entry
 * on the 1mpage's pv list.
 */
static void
pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT((pa & PTE1_OFFSET) == 0,
	    ("pmap_pv_promote_pte1: pa is not 1mpage aligned"));

	/*
	 * Transfer the first page's pv entry for this mapping to the
	 * 1mpage's pv list.  Aside from avoiding the cost of a call
	 * to get_pv_entry(), a transfer avoids the possibility that
	 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim()
	 * removes one of the mappings that is being promoted.
	 */
	m = PHYS_TO_VM_PAGE(pa);
	va = pte1_trunc(va);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	/* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */
	va_last = va + PTE1_SIZE - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
		pmap_pvh_free(&m->md, pmap, va);
	} while (va < va_last);
}
#endif

/*
 * Conditionally create a pv entry.
 *
 * Returns TRUE after appending a new entry for (pmap, va) to the pv list
 * of page "m".  Returns FALSE, without reclaiming, when the pv entry count
 * has reached the high water mark or get_pv_entry() fails in "try" mode.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pv_entry_count < pv_entry_high_water &&
	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * Create the pv entries for each of the pages within a section.
 *
 * A single pv entry for (pmap, va) is appended to the pv list returned by
 * pa_to_pvh() for the physical address of "pte1".  Returns true on success.
 * Returns false when no pv entry could be obtained; that can only happen
 * with PMAP_ENTER_NORECLAIM set in "flags" (the pv entry count is at the
 * high water mark, or get_pv_entry() fails in "try" mode).
 */
static bool
pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags)
{
	struct md_page *pvh;
	pv_entry_t pv;
	bool noreclaim;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0;
	if ((noreclaim && pv_entry_count >= pv_entry_high_water) ||
	    (pv = get_pv_entry(pmap, noreclaim)) == NULL)
		return (false);
	pv->pv_va = va;
	pvh = pa_to_pvh(pte1_pa(pte1));
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	return (true);
}

/*
 * Flush the TLB entries made obsolete by replacing the pte1 that covers
 * "va" with "npte1".  The kind of the new entry selects how much is flushed.
 */
static inline void
pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1)
{

	/* Kill all the small mappings or the big one only. */
	if (pte1_is_section(npte1))
		pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE);
	else
		pmap_tlb_flush(pmap, pte1_trunc(va));
}

/*
 *  Update kernel pte1 on all pmaps.
 *
 *  The following function is called only on one cpu with disabled interrupts.
 *  In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way
 *  nobody can invoke explicit hardware table walk during the update of pte1.
 *  Unsolicited hardware table walk can still happen, invoked by speculative
 *  data or instruction prefetch or even by speculative hardware table walk.
 *
 *  The break-before-make approach should be implemented here. However, it's
 *  not so easy to do that for kernel mappings as it would be unhappy to unmap
 *  itself unexpectedly but voluntarily.
 */
static void
pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1)
{
	pmap_t pmap;
	pt1_entry_t *pte1p;

	/*
	 * Get current pmap. Interrupts should be disabled here
	 * so PCPU_GET() is done atomically.
	 */
	pmap = PCPU_GET(curpmap);
	if (pmap == NULL)
		pmap = kernel_pmap;

	/*
	 * (1) Change pte1 on current pmap.
	 * (2) Flush all obsolete TLB entries on current CPU.
	 * (3) Change pte1 on all pmaps.
	 * (4) Flush all obsolete TLB entries on all CPUs in SMP case.
	 */

	pte1p = pmap_pte1(pmap, va);
	pte1_store(pte1p, npte1);

	/* Kill all the small mappings or the big one only. */
	if (pte1_is_section(npte1)) {
		/* A section entry is being installed: count a promotion. */
		pmap_pte1_kern_promotions++;
		tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE);
	} else {
		pmap_pte1_kern_demotions++;
		tlb_flush_local(pte1_trunc(va));
	}

	/*
	 * In SMP case, this function is called when all cpus are at smp
	 * rendezvous, so there is no need to use 'allpmaps_lock' lock here.
	 * In UP case, the function is called with this lock locked.
	 */
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		pte1p = pmap_pte1(pmap, va);
		pte1_store(pte1p, npte1);
	}

#ifdef SMP
	/* Kill all the small mappings or the big one only. */
	if (pte1_is_section(npte1))
		tlb_flush_range(pte1_trunc(va), PTE1_SIZE);
	else
		tlb_flush(pte1_trunc(va));
#endif
}

#ifdef SMP
/*
 * Argument block handed to pmap_update_pte1_action().
 */
struct pte1_action {
	vm_offset_t va;		/* address whose pte1 is changed */
	pt1_entry_t npte1;	/* new pte1 value */
	u_int update;		/* CPU that updates the PTE1 */
};

/*
 * Action callback: only the CPU recorded in "update" performs the kernel
 * pte1 update; on every other CPU the call does nothing.
 */
static void
pmap_update_pte1_action(void *arg)
{
	struct pte1_action *act = arg;

	if (act->update == PCPU_GET(cpuid))
		pmap_update_pte1_kernel(act->va, act->npte1);
}

/*
 *  Change pte1 on current pmap.
 *  Note that kernel pte1 must be changed on all pmaps.
 *
 *  According to the architecture reference manual published by ARM,
 *  the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA.
 *  According to this manual, UNPREDICTABLE behaviours must never happen in
 *  a viable system.
 *  In contrast, on x86 processors, it is not specified which
 *  TLB entry mapping the virtual address will be used, but the MMU doesn't
 *  generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone
 *  Black).
 *
 *  It's a problem when either promotion or demotion is being done. The pte1
 *  update and appropriate TLB flush must be done atomically in general.
 *
 *  SMP variant: kernel mappings are updated by pmap_update_pte1_kernel(),
 *  run through smp_rendezvous_cpus() on the calling CPU only; user mappings
 *  are changed in place with interrupts disabled.
 */
static void
pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va,
    pt1_entry_t npte1)
{

	if (pmap == kernel_pmap) {
		struct pte1_action act;

		/* Stay on this CPU so that act.update remains valid. */
		sched_pin();
		act.va = va;
		act.npte1 = npte1;
		act.update = PCPU_GET(cpuid);
		smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier,
		    pmap_update_pte1_action, NULL, &act);
		sched_unpin();
	} else {
		register_t cspr;

		/*
		 * Use break-before-make approach for changing userland
		 * mappings. It can cause L1 translation aborts on other
		 * cores in SMP case. So, special treatment is implemented
		 * in pmap_fault(). To reduce the likelihood that another core
		 * will be affected by the broken mapping, disable interrupts
		 * until the mapping change is completed.
		 */
		cspr = disable_interrupts(PSR_I | PSR_F);
		pte1_clear(pte1p);
		pmap_tlb_flush_pte1(pmap, va, npte1);
		pte1_store(pte1p, npte1);
		restore_interrupts(cspr);
	}
}
#else
/*
 * UP variant of pmap_change_pte1(): kernel mappings are updated by
 * pmap_update_pte1_kernel() under the 'allpmaps_lock' spin mutex; user
 * mappings are changed in place with interrupts disabled.
 */
static void
pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va,
    pt1_entry_t npte1)
{

	if (pmap == kernel_pmap) {
		mtx_lock_spin(&allpmaps_lock);
		pmap_update_pte1_kernel(va, npte1);
		mtx_unlock_spin(&allpmaps_lock);
	} else {
		register_t cspr;

		/*
		 * Use break-before-make approach for changing userland
		 * mappings. It's absolutely safe in UP case when interrupts
		 * are disabled.
		 */
		cspr = disable_interrupts(PSR_I | PSR_F);
		pte1_clear(pte1p);
		pmap_tlb_flush_pte1(pmap, va, npte1);
		pte1_store(pte1p, npte1);
		restore_interrupts(cspr);
	}
}
#endif

#if VM_NRESERVLEVEL > 0
/*
 * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are
 * within a single page table page (PT2) to a single 1MB page mapping.
 * For promotion to occur, two conditions must be met: (1) the 4KB page
 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
 * mappings must have identical characteristics.
 *
 * Managed (PG_MANAGED) mappings within the kernel address space are not
 * promoted.  The reason is that kernel PTE1s are replicated in each pmap but
 * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only
 * read the PTE1 from the kernel pmap.
 *
 * The function returns nothing.  Every rejected attempt increments
 * pmap_pte1_p_failures and a successful one increments pmap_pte1_promotions.
 * As a side effect, PTE2_RO is set on examined entries that have PTE2_NM set
 * and PTE2_RO clear, even if the promotion is abandoned afterwards.
 */
static void
pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va)
{
	pt1_entry_t npte1;
	pt2_entry_t *fpte2p, fpte2, fpte2_fav;
	pt2_entry_t *pte2p, pte2;
	vm_offset_t pteva __unused;
	vm_page_t m __unused;

	PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__,
	    pmap, va, pte1_load(pte1p), pte1p));

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is
	 * either invalid, unused, or does not map the first 4KB physical page
	 * within a 1MB page.
	 */
	fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va));
	fpte2 = pte2_load(fpte2p);
	if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) !=
	    (PTE2_A | PTE2_V)) {
		pmap_pte1_p_failures++;
		CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p",
		    __func__, va, pmap);
		return;
	}
	if (pte2_is_managed(fpte2) && pmap == kernel_pmap) {
		pmap_pte1_p_failures++;
		CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p",
		    __func__, va, pmap);
		return;
	}
	if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) {
		/*
		 * When page is not modified, PTE2_RO can be set without
		 * a TLB invalidation.
		 */
		fpte2 |= PTE2_RO;
		pte2_store(fpte2p, fpte2);
	}

	/*
	 * Examine each of the other PTE2s in the specified PT2. Abort if this
	 * PTE2 maps an unexpected 4KB physical page or does not have identical
	 * characteristics to the first PTE2.
	 *
	 * fpte2_fav holds the frame, PTE2_A and PTE2_V bits expected in the
	 * entry under examination; it is stepped down by PTE2_SIZE per entry.
	 */
	fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V));
	fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */
	for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) {
		pte2 = pte2_load(pte2p);
		if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) {
			pmap_pte1_p_failures++;
			CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p",
			    __func__, va, pmap);
			return;
		}
		if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) {
			/*
			 * When page is not modified, PTE2_RO can be set
			 * without a TLB invalidation. See note above.
			 */
			pte2 |= PTE2_RO;
			pte2_store(pte2p, pte2);
			/* pteva is computed for the trace record only. */
			pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET &
			    PTE2_FRAME);
			CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p",
			    __func__, pteva, pmap);
		}
		if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) {
			pmap_pte1_p_failures++;
			CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p",
			    __func__, va, pmap);
			return;
		}

		fpte2_fav -= PTE2_SIZE;
	}
	/*
	 * The page table page in its current state will stay in PT2TAB
	 * until the PTE1 mapping the section is demoted by pmap_demote_pte1()
	 * or destroyed by pmap_remove_pte1().
	 *
	 * Note that L2 page table size is not equal to PAGE_SIZE.
	 */
	m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p))));
	KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size],
	    ("%s: PT2 page is out of range", __func__));
	KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK),
	    ("%s: PT2 page's pindex is wrong", __func__));

	/*
	 * Get pte1 from pte2 format.
	 */
	npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V;

	/*
	 * Promote the pv entries.
	 */
	if (pte2_is_managed(fpte2))
		pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1));

	/*
	 * Promote the mappings.
	 */
	pmap_change_pte1(pmap, pte1p, va, npte1);

	pmap_pte1_promotions++;
	CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p",
	    __func__, va, pmap);

	PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n",
	    __func__, pmap, va, npte1, pte1_load(pte1p), pte1p));
}
#endif /* VM_NRESERVLEVEL > 0 */

/*
 *  Zero L2 page table page.
3554 */ 3555 static __inline void 3556 pmap_clear_pt2(pt2_entry_t *fpte2p) 3557 { 3558 pt2_entry_t *pte2p; 3559 3560 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) 3561 pte2_clear(pte2p); 3562 3563 } 3564 3565 /* 3566 * Removes a 1MB page mapping from the kernel pmap. 3567 */ 3568 static void 3569 pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3570 { 3571 vm_page_t m; 3572 uint32_t pte1_idx; 3573 pt2_entry_t *fpte2p; 3574 vm_paddr_t pt2_pa; 3575 3576 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3577 m = pmap_pt2_page(pmap, va); 3578 if (m == NULL) 3579 /* 3580 * QQQ: Is this function called only on promoted pte1? 3581 * We certainly do section mappings directly 3582 * (without promotion) in kernel !!! 3583 */ 3584 panic("%s: missing pt2 page", __func__); 3585 3586 pte1_idx = pte1_index(va); 3587 3588 /* 3589 * Initialize the L2 page table. 3590 */ 3591 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3592 pmap_clear_pt2(fpte2p); 3593 3594 /* 3595 * Remove the mapping. 3596 */ 3597 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); 3598 pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); 3599 3600 /* 3601 * QQQ: We do not need to invalidate PT2MAP mapping 3602 * as we did not change it. I.e. the L2 page table page 3603 * was and still is mapped the same way. 3604 */ 3605 } 3606 3607 /* 3608 * Do the things to unmap a section in a process 3609 */ 3610 static void 3611 pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 3612 struct spglist *free) 3613 { 3614 pt1_entry_t opte1; 3615 struct md_page *pvh; 3616 vm_offset_t eva, va; 3617 vm_page_t m; 3618 3619 PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, 3620 pte1_load(pte1p), pte1p)); 3621 3622 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3623 KASSERT((sva & PTE1_OFFSET) == 0, 3624 ("%s: sva is not 1mpage aligned", __func__)); 3625 3626 /* 3627 * Clear and invalidate the mapping. It should occupy one and only TLB 3628 * entry. 
So, pmap_tlb_flush() called with aligned address should be 3629 * sufficient. 3630 */ 3631 opte1 = pte1_load_clear(pte1p); 3632 pmap_tlb_flush(pmap, sva); 3633 3634 if (pte1_is_wired(opte1)) 3635 pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; 3636 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 3637 if (pte1_is_managed(opte1)) { 3638 pvh = pa_to_pvh(pte1_pa(opte1)); 3639 pmap_pvh_free(pvh, pmap, sva); 3640 eva = sva + PTE1_SIZE; 3641 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 3642 va < eva; va += PAGE_SIZE, m++) { 3643 if (pte1_is_dirty(opte1)) 3644 vm_page_dirty(m); 3645 if (opte1 & PTE1_A) 3646 vm_page_aflag_set(m, PGA_REFERENCED); 3647 if (TAILQ_EMPTY(&m->md.pv_list) && 3648 TAILQ_EMPTY(&pvh->pv_list)) 3649 vm_page_aflag_clear(m, PGA_WRITEABLE); 3650 } 3651 } 3652 if (pmap == kernel_pmap) { 3653 /* 3654 * L2 page table(s) can't be removed from kernel map as 3655 * kernel counts on it (stuff around pmap_growkernel()). 3656 */ 3657 pmap_remove_kernel_pte1(pmap, pte1p, sva); 3658 } else { 3659 /* 3660 * Get associated L2 page table page. 3661 * It's possible that the page was never allocated. 3662 */ 3663 m = pmap_pt2_page(pmap, sva); 3664 if (m != NULL) 3665 pmap_unwire_pt2_all(pmap, sva, m, free); 3666 } 3667 } 3668 3669 /* 3670 * Fills L2 page table page with mappings to consecutive physical pages. 3671 */ 3672 static __inline void 3673 pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) 3674 { 3675 pt2_entry_t *pte2p; 3676 3677 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { 3678 pte2_store(pte2p, npte2); 3679 npte2 += PTE2_SIZE; 3680 } 3681 } 3682 3683 /* 3684 * Tries to demote a 1MB page mapping. If demotion fails, the 3685 * 1MB page mapping is invalidated. 
3686 */ 3687 static boolean_t 3688 pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3689 { 3690 pt1_entry_t opte1, npte1; 3691 pt2_entry_t *fpte2p, npte2; 3692 vm_paddr_t pt2pg_pa, pt2_pa; 3693 vm_page_t m; 3694 struct spglist free; 3695 uint32_t pte1_idx, isnew = 0; 3696 3697 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3698 pmap, va, pte1_load(pte1p), pte1p)); 3699 3700 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3701 3702 opte1 = pte1_load(pte1p); 3703 KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); 3704 3705 if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { 3706 KASSERT(!pte1_is_wired(opte1), 3707 ("%s: PT2 page for a wired mapping is missing", __func__)); 3708 3709 /* 3710 * Invalidate the 1MB page mapping and return 3711 * "failure" if the mapping was never accessed or the 3712 * allocation of the new page table page fails. 3713 */ 3714 if ((opte1 & PTE1_A) == 0 || 3715 (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { 3716 SLIST_INIT(&free); 3717 pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); 3718 vm_page_free_pages_toq(&free, false); 3719 CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", 3720 __func__, va, pmap); 3721 return (FALSE); 3722 } 3723 m->pindex = pte1_index(va) & ~PT2PG_MASK; 3724 if (va < VM_MAXUSER_ADDRESS) 3725 pmap->pm_stats.resident_count++; 3726 3727 isnew = 1; 3728 3729 /* 3730 * We init all L2 page tables in the page even if 3731 * we are going to change everything for one L2 page 3732 * table in a while. 3733 */ 3734 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 3735 } else { 3736 if (va < VM_MAXUSER_ADDRESS) { 3737 if (pt2_is_empty(m, va)) 3738 isnew = 1; /* Demoting section w/o promotion. 
*/ 3739 #ifdef INVARIANTS 3740 else 3741 KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" 3742 " count %u", __func__, 3743 pt2_wirecount_get(m, pte1_index(va)))); 3744 #endif 3745 } 3746 } 3747 3748 pt2pg_pa = VM_PAGE_TO_PHYS(m); 3749 pte1_idx = pte1_index(va); 3750 /* 3751 * If the pmap is current, then the PT2MAP can provide access to 3752 * the page table page (promoted L2 page tables are not unmapped). 3753 * Otherwise, temporarily map the L2 page table page (m) into 3754 * the kernel's address space at either PADDR1 or PADDR2. 3755 * 3756 * Note that L2 page table size is not equal to PAGE_SIZE. 3757 */ 3758 if (pmap_is_current(pmap)) 3759 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3760 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 3761 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 3762 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 3763 #ifdef SMP 3764 PMAP1cpu = PCPU_GET(cpuid); 3765 #endif 3766 tlb_flush_local((vm_offset_t)PADDR1); 3767 PMAP1changed++; 3768 } else 3769 #ifdef SMP 3770 if (PMAP1cpu != PCPU_GET(cpuid)) { 3771 PMAP1cpu = PCPU_GET(cpuid); 3772 tlb_flush_local((vm_offset_t)PADDR1); 3773 PMAP1changedcpu++; 3774 } else 3775 #endif 3776 PMAP1unchanged++; 3777 fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); 3778 } else { 3779 mtx_lock(&PMAP2mutex); 3780 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 3781 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 3782 tlb_flush((vm_offset_t)PADDR2); 3783 } 3784 fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); 3785 } 3786 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 3787 npte1 = PTE1_LINK(pt2_pa); 3788 3789 KASSERT((opte1 & PTE1_A) != 0, 3790 ("%s: opte1 is missing PTE1_A", __func__)); 3791 KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, 3792 ("%s: opte1 has PTE1_NM", __func__)); 3793 3794 /* 3795 * Get pte2 from pte1 format. 3796 */ 3797 npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; 3798 3799 /* 3800 * If the L2 page table page is new, initialize it. 
If the mapping 3801 * has changed attributes, update the page table entries. 3802 */ 3803 if (isnew != 0) { 3804 pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); 3805 pmap_fill_pt2(fpte2p, npte2); 3806 } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != 3807 (npte2 & PTE2_PROMOTE)) 3808 pmap_fill_pt2(fpte2p, npte2); 3809 3810 KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), 3811 ("%s: fpte2p and npte2 map different physical addresses", 3812 __func__)); 3813 3814 if (fpte2p == PADDR2) 3815 mtx_unlock(&PMAP2mutex); 3816 3817 /* 3818 * Demote the mapping. This pmap is locked. The old PTE1 has 3819 * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also 3820 * has not PTE1_NM set. Thus, there is no danger of a race with 3821 * another processor changing the setting of PTE1_A and/or PTE1_NM 3822 * between the read above and the store below. 3823 */ 3824 pmap_change_pte1(pmap, pte1p, va, npte1); 3825 3826 /* 3827 * Demote the pv entry. This depends on the earlier demotion 3828 * of the mapping. Specifically, the (re)creation of a per- 3829 * page pv entry might trigger the execution of pmap_pv_reclaim(), 3830 * which might reclaim a newly (re)created per-page pv entry 3831 * and destroy the associated mapping. In order to destroy 3832 * the mapping, the PTE1 must have already changed from mapping 3833 * the 1mpage to referencing the page table page. 3834 */ 3835 if (pte1_is_managed(opte1)) 3836 pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); 3837 3838 pmap_pte1_demotions++; 3839 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3840 __func__, va, pmap); 3841 3842 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3843 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3844 return (TRUE); 3845 } 3846 3847 /* 3848 * Insert the given physical page (p) at 3849 * the specified virtual address (v) in the 3850 * target physical map with the protection requested. 
3851 * 3852 * If specified, the page will be wired down, meaning 3853 * that the related pte can not be reclaimed. 3854 * 3855 * NB: This is the only routine which MAY NOT lazy-evaluate 3856 * or lose information. That is, this routine must actually 3857 * insert this page into the given map NOW. 3858 */ 3859 int 3860 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3861 u_int flags, int8_t psind) 3862 { 3863 pt1_entry_t *pte1p; 3864 pt2_entry_t *pte2p; 3865 pt2_entry_t npte2, opte2; 3866 pv_entry_t pv; 3867 vm_paddr_t opa, pa; 3868 vm_page_t mpte2, om; 3869 int rv; 3870 3871 va = trunc_page(va); 3872 KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); 3873 KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, 3874 ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, 3875 va)); 3876 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 3877 ("%s: managed mapping within the clean submap", __func__)); 3878 if ((m->oflags & VPO_UNMANAGED) == 0) 3879 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3880 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 3881 ("%s: flags %u has reserved bits set", __func__, flags)); 3882 pa = VM_PAGE_TO_PHYS(m); 3883 npte2 = PTE2(pa, PTE2_A, vm_page_pte2_attr(m)); 3884 if ((flags & VM_PROT_WRITE) == 0) 3885 npte2 |= PTE2_NM; 3886 if ((prot & VM_PROT_WRITE) == 0) 3887 npte2 |= PTE2_RO; 3888 KASSERT((npte2 & (PTE2_NM | PTE2_RO)) != PTE2_RO, 3889 ("%s: flags includes VM_PROT_WRITE but prot doesn't", __func__)); 3890 if ((prot & VM_PROT_EXECUTE) == 0) 3891 npte2 |= PTE2_NX; 3892 if ((flags & PMAP_ENTER_WIRED) != 0) 3893 npte2 |= PTE2_W; 3894 if (va < VM_MAXUSER_ADDRESS) 3895 npte2 |= PTE2_U; 3896 if (pmap != kernel_pmap) 3897 npte2 |= PTE2_NG; 3898 3899 rw_wlock(&pvh_global_lock); 3900 PMAP_LOCK(pmap); 3901 sched_pin(); 3902 if (psind == 1) { 3903 /* Assert the required virtual and physical alignment. 
*/ 3904 KASSERT((va & PTE1_OFFSET) == 0, 3905 ("%s: va unaligned", __func__)); 3906 KASSERT(m->psind > 0, ("%s: m->psind < psind", __func__)); 3907 rv = pmap_enter_pte1(pmap, va, PTE1_PA(pa) | ATTR_TO_L1(npte2) | 3908 PTE1_V, flags, m); 3909 goto out; 3910 } 3911 3912 /* 3913 * In the case that a page table page is not 3914 * resident, we are creating it here. 3915 */ 3916 if (va < VM_MAXUSER_ADDRESS) { 3917 mpte2 = pmap_allocpte2(pmap, va, flags); 3918 if (mpte2 == NULL) { 3919 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3920 ("pmap_allocpte2 failed with sleep allowed")); 3921 rv = KERN_RESOURCE_SHORTAGE; 3922 goto out; 3923 } 3924 } else 3925 mpte2 = NULL; 3926 pte1p = pmap_pte1(pmap, va); 3927 if (pte1_is_section(pte1_load(pte1p))) 3928 panic("%s: attempted on 1MB page", __func__); 3929 pte2p = pmap_pte2_quick(pmap, va); 3930 if (pte2p == NULL) 3931 panic("%s: invalid L1 page table entry va=%#x", __func__, va); 3932 3933 om = NULL; 3934 opte2 = pte2_load(pte2p); 3935 opa = pte2_pa(opte2); 3936 /* 3937 * Mapping has not changed, must be protection or wiring change. 3938 */ 3939 if (pte2_is_valid(opte2) && (opa == pa)) { 3940 /* 3941 * Wiring change, just update stats. We don't worry about 3942 * wiring PT2 pages as they remain resident as long as there 3943 * are valid mappings in them. Hence, if a user page is wired, 3944 * the PT2 page will be also. 3945 */ 3946 if (pte2_is_wired(npte2) && !pte2_is_wired(opte2)) 3947 pmap->pm_stats.wired_count++; 3948 else if (!pte2_is_wired(npte2) && pte2_is_wired(opte2)) 3949 pmap->pm_stats.wired_count--; 3950 3951 /* 3952 * Remove extra pte2 reference 3953 */ 3954 if (mpte2) 3955 pt2_wirecount_dec(mpte2, pte1_index(va)); 3956 if ((m->oflags & VPO_UNMANAGED) == 0) 3957 om = m; 3958 goto validate; 3959 } 3960 3961 /* 3962 * QQQ: We think that changing physical address on writeable mapping 3963 * is not safe. Well, maybe on kernel address space with correct 3964 * locking, it can make a sense. 
However, we have no idea why 3965 * anyone should do that on user address space. Are we wrong? 3966 */ 3967 KASSERT((opa == 0) || (opa == pa) || 3968 !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), 3969 ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", 3970 __func__, pmap, va, opte2, opa, pa, flags, prot)); 3971 3972 pv = NULL; 3973 3974 /* 3975 * Mapping has changed, invalidate old range and fall through to 3976 * handle validating new mapping. 3977 */ 3978 if (opa) { 3979 if (pte2_is_wired(opte2)) 3980 pmap->pm_stats.wired_count--; 3981 om = PHYS_TO_VM_PAGE(opa); 3982 if (om != NULL && (om->oflags & VPO_UNMANAGED) != 0) 3983 om = NULL; 3984 if (om != NULL) 3985 pv = pmap_pvh_remove(&om->md, pmap, va); 3986 3987 /* 3988 * Remove extra pte2 reference 3989 */ 3990 if (mpte2 != NULL) 3991 pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); 3992 } else 3993 pmap->pm_stats.resident_count++; 3994 3995 /* 3996 * Enter on the PV list if part of our managed memory. 3997 */ 3998 if ((m->oflags & VPO_UNMANAGED) == 0) { 3999 if (pv == NULL) { 4000 pv = get_pv_entry(pmap, FALSE); 4001 pv->pv_va = va; 4002 } 4003 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4004 } else if (pv != NULL) 4005 free_pv_entry(pmap, pv); 4006 4007 /* 4008 * Increment counters 4009 */ 4010 if (pte2_is_wired(npte2)) 4011 pmap->pm_stats.wired_count++; 4012 4013 validate: 4014 /* 4015 * Now validate mapping with desired protection/wiring. 4016 */ 4017 if (prot & VM_PROT_WRITE) { 4018 if ((m->oflags & VPO_UNMANAGED) == 0) 4019 vm_page_aflag_set(m, PGA_WRITEABLE); 4020 } 4021 4022 /* 4023 * If the mapping or permission bits are different, we need 4024 * to update the pte2. 4025 * 4026 * QQQ: Think again and again what to do 4027 * if the mapping is going to be changed! 4028 */ 4029 if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { 4030 /* 4031 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4032 * is set. 
Do it now, before the mapping is stored and made 4033 * valid for hardware table walk. If done later, there is a race 4034 * for other threads of current process in lazy loading case. 4035 * Don't do it for kernel memory which is mapped with exec 4036 * permission even if the memory isn't going to hold executable 4037 * code. The only time when icache sync is needed is after 4038 * kernel module is loaded and the relocation info is processed. 4039 * And it's done in elf_cpu_load_file(). 4040 * 4041 * QQQ: (1) Does it exist any better way where 4042 * or how to sync icache? 4043 * (2) Now, we do it on a page basis. 4044 */ 4045 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4046 m->md.pat_mode == VM_MEMATTR_WB_WA && 4047 (opa != pa || (opte2 & PTE2_NX))) 4048 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4049 4050 if (opte2 & PTE2_V) { 4051 /* Change mapping with break-before-make approach. */ 4052 opte2 = pte2_load_clear(pte2p); 4053 pmap_tlb_flush(pmap, va); 4054 pte2_store(pte2p, npte2); 4055 if (om != NULL) { 4056 KASSERT((om->oflags & VPO_UNMANAGED) == 0, 4057 ("%s: om %p unmanaged", __func__, om)); 4058 if ((opte2 & PTE2_A) != 0) 4059 vm_page_aflag_set(om, PGA_REFERENCED); 4060 if (pte2_is_dirty(opte2)) 4061 vm_page_dirty(om); 4062 if (TAILQ_EMPTY(&om->md.pv_list) && 4063 ((om->flags & PG_FICTITIOUS) != 0 || 4064 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4065 vm_page_aflag_clear(om, PGA_WRITEABLE); 4066 } 4067 } else 4068 pte2_store(pte2p, npte2); 4069 } 4070 #if 0 4071 else { 4072 /* 4073 * QQQ: In time when both access and not mofified bits are 4074 * emulated by software, this should not happen. Some 4075 * analysis is need, if this really happen. Missing 4076 * tlb flush somewhere could be the reason. 
*/
		panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap,
		    va, opte2, npte2);
	}
#endif

#if VM_NRESERVLEVEL > 0
	/*
	 * If both the L2 page table page and the reservation are fully
	 * populated, then attempt promotion.
	 */
	if ((mpte2 == NULL || pt2_is_full(mpte2, va)) &&
	    sp_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
	    vm_reserv_level_iffullpop(m) == 0)
		pmap_promote_pte1(pmap, pte1p, va);
#endif

	rv = KERN_SUCCESS;
out:
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * Do the things to unmap a page in a process.
 *
 * Clears the L2 entry at pte2p, flushes the TLB for va and adjusts the
 * pmap's wired and resident counters.  For a managed mapping, the dirty
 * and referenced state found in the old entry is passed on to the
 * vm_page before the mapping is dropped through pmap_remove_entry().
 *
 * The pvh global lock must be write-locked and the pmap lock owned; both
 * are asserted on entry.
 *
 * Returns whatever pmap_unuse_pt2() returns.  The callers in this file
 * stop scanning the current L2 page table when that value is non-zero.
 * NOTE(review): presumably non-zero means the L2 page table page was
 * released -- confirm against pmap_unuse_pt2().
 */
static int
pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va,
    struct spglist *free)
{
	pt2_entry_t opte2;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/* Clear and invalidate the mapping. */
	opte2 = pte2_load_clear(pte2p);
	pmap_tlb_flush(pmap, va);

	/* The entry is already cleared here; the check is on the old value. */
	KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x",
	    __func__, pmap, va, opte2));

	if (opte2 & PTE2_W)
		pmap->pm_stats.wired_count -= 1;
	pmap->pm_stats.resident_count -= 1;
	if (pte2_is_managed(opte2)) {
		/* Hand the software dirty/referenced state to the vm_page. */
		m = PHYS_TO_VM_PAGE(pte2_pa(opte2));
		if (pte2_is_dirty(opte2))
			vm_page_dirty(m);
		if (opte2 & PTE2_A)
			vm_page_aflag_set(m, PGA_REFERENCED);
		pmap_remove_entry(pmap, m, va);
	}
	return (pmap_unuse_pt2(pmap, va, free));
}

/*
 * Remove a single page from a process address space.
*/
/*
 * The caller must hold the pvh global lock for writing, have the current
 * thread pinned and own the pmap lock; all three are asserted.  Nothing is
 * done when no L2 entry can be obtained for va or the entry is not valid.
 * The return value of pmap_remove_pte2() is not used here.
 */
static void
pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
{
	pt2_entry_t *pte2p;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT(curthread->td_pinned > 0,
	    ("%s: curthread not pinned", __func__));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL ||
	    !pte2_is_valid(pte2_load(pte2p)))
		return;
	pmap_remove_pte2(pmap, pte2p, va, free);
}

/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 *
 *	The range is walked one L1 entry (PTE1_SIZE of address space) at a
 *	time.  A section mapping that lies entirely inside the range is
 *	removed as a whole; a partially covered section is demoted first and
 *	then handled page by page.  Pages collected on the local "free" list
 *	are released only after all locks have been dropped.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t nextva;
	pt1_entry_t *pte1p, pte1;
	pt2_entry_t *pte2p, pte2;
	struct spglist free;

	/*
	 * Perform an unsynchronized read. This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	SLIST_INIT(&free);

	rw_wlock(&pvh_global_lock);
	sched_pin();
	PMAP_LOCK(pmap);

	/*
	 * Special handling of removing one page. A very common
	 * operation and easy to short circuit some code.
	 */
	if (sva + PAGE_SIZE == eva) {
		pte1 = pte1_load(pmap_pte1(pmap, sva));
		if (pte1_is_link(pte1)) {
			pmap_remove_page(pmap, sva, &free);
			goto out;
		}
	}

	for (; sva < eva; sva = nextva) {
		/*
		 * Calculate address for next L2 page table.
		 */
		nextva = pte1_trunc(sva + PTE1_SIZE);
		/* The addition wrapped around the top of the address space. */
		if (nextva < sva)
			nextva = eva;
		/* Nothing is left to remove once the pmap is empty. */
		if (pmap->pm_stats.resident_count == 0)
			break;

		pte1p = pmap_pte1(pmap, sva);
		pte1 = pte1_load(pte1p);

		/*
		 * Weed out invalid mappings. Note: we assume that the L1 page
		 * table is always allocated, and in kernel virtual.
		 */
		if (pte1 == 0)
			continue;

		if (pte1_is_section(pte1)) {
			/*
			 * Are we removing the entire large page? If not,
			 * demote the mapping and fall through.
			 */
			if (sva + PTE1_SIZE == nextva && eva >= nextva) {
				pmap_remove_pte1(pmap, pte1p, sva, &free);
				continue;
			} else if (!pmap_demote_pte1(pmap, pte1p, sva)) {
				/* The large page mapping was destroyed. */
				continue;
			}
#ifdef INVARIANTS
			else {
				/*
				 * Update pte1 after demotion.  The reloaded
				 * value is only consumed by the KASSERT
				 * below, hence the INVARIANTS guard.
				 */
				pte1 = pte1_load(pte1p);
			}
#endif
		}

		KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p"
		    " is not link", __func__, pmap, sva, pte1, pte1p));

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current L2 page table page, or to the end of the
		 * range being removed.
		 */
		if (nextva > eva)
			nextva = eva;

		/*
		 * A non-zero return from pmap_remove_pte2() ends the scan of
		 * this L2 page table early; the outer loop then resumes at
		 * nextva.
		 */
		for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva;
		    pte2p++, sva += PAGE_SIZE) {
			pte2 = pte2_load(pte2p);
			if (!pte2_is_valid(pte2))
				continue;
			if (pmap_remove_pte2(pmap, pte2p, sva, &free))
				break;
		}
	}
out:
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, false);
}

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
*/

/*
 * The page must be managed (asserted).  Work is done in two passes under
 * the write-locked pvh global lock with the thread pinned: first every
 * 1 MB mapping found on the page's pa_to_pvh() list is demoted, then each
 * 4 KB mapping on the page's own pv list is torn down.  Pages gathered on
 * the local "free" list are released after the global lock is dropped.
 */
void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	pt2_entry_t *pte2p, opte2;
	pt1_entry_t *pte1p;
	vm_offset_t va;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("%s: page %p is not managed", __func__, m));
	SLIST_INIT(&free);
	rw_wlock(&pvh_global_lock);
	sched_pin();
	/* The 1 MB pv list is not consulted for fictitious pages. */
	if ((m->flags & PG_FICTITIOUS) != 0)
		goto small_mappings;
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	/*
	 * NOTE(review): the loop terminates only if pmap_demote_pte1() takes
	 * the entry off pvh->pv_list whether or not it succeeds -- confirm.
	 */
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte1p = pmap_pte1(pmap, va);
		(void)pmap_demote_pte1(pmap, pte1p, va);
		PMAP_UNLOCK(pmap);
	}
small_mappings:
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pmap->pm_stats.resident_count--;
		pte1p = pmap_pte1(pmap, pv->pv_va);
		KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found "
		    "a 1mpage in page %p's pv list", __func__, m));
		pte2p = pmap_pte2_quick(pmap, pv->pv_va);
		/* Clear the entry first, then invalidate the TLB for it. */
		opte2 = pte2_load_clear(pte2p);
		pmap_tlb_flush(pmap, pv->pv_va);
		KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2",
		    __func__, pmap, pv->pv_va));
		if (pte2_is_wired(opte2))
			pmap->pm_stats.wired_count--;
		if (opte2 & PTE2_A)
			vm_page_aflag_set(m, PGA_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (pte2_is_dirty(opte2))
			vm_page_dirty(m);
		pmap_unuse_pt2(pmap, pv->pv_va, &free);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	/* No mappings remain, so the page cannot be write-mapped any more. */
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	vm_page_free_pages_toq(&free, false);
}

/*
 * Just subroutine for pmap_remove_pages() to reasonably satisfy
 * good coding style, a.k.a.
80 character line width limit hell.
 */
/*
 * Bookkeeping for one 1 MB section mapping whose L1 entry value is pte1
 * and whose PV entry is pv: dirty state is pushed to every 4 KB page of
 * the section, the resident count is reduced by a section's worth of
 * pages, pv is unlinked from the section's pv list, and the L2 page table
 * page kept for the address (if any) is unwired.
 */
static __inline void
pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv,
    struct spglist *free)
{
	struct md_page *section_pvh;
	vm_paddr_t section_pa;
	vm_page_t first, end, pg, pt2pg;

	section_pa = pte1_pa(pte1);
	first = PHYS_TO_VM_PAGE(section_pa);

	KASSERT(first->phys_addr == section_pa,
	    ("%s: vm_page_t %p addr mismatch %#x %#x",
	    __func__, first, first->phys_addr, section_pa));
	KASSERT((first->flags & PG_FICTITIOUS) != 0 ||
	    first < &vm_page_array[vm_page_array_size],
	    ("%s: bad pte1 %#x", __func__, pte1));

	/* One past the last vm_page backing the section. */
	end = &first[PTE1_SIZE / PAGE_SIZE];

	if (pte1_is_dirty(pte1)) {
		pg = first;
		while (pg < end) {
			vm_page_dirty(pg);
			pg++;
		}
	}

	pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE;

	section_pvh = pa_to_pvh(section_pa);
	TAILQ_REMOVE(&section_pvh->pv_list, pv, pv_next);
	if (TAILQ_EMPTY(&section_pvh->pv_list)) {
		/*
		 * No section mapping is left; pages that have no 4 KB
		 * mapping either lose PGA_WRITEABLE.
		 */
		for (pg = first; pg < end; pg++) {
			if (!TAILQ_EMPTY(&pg->md.pv_list))
				continue;
			vm_page_aflag_clear(pg, PGA_WRITEABLE);
		}
	}

	pt2pg = pmap_pt2_page(pmap, pv->pv_va);
	if (pt2pg != NULL)
		pmap_unwire_pt2_all(pmap, pv->pv_va, pt2pg, free);
}

/*
 * Just subroutine for pmap_remove_pages() to reasonably satisfy
 * good coding style, a.k.a. 80 character line width limit hell.
*/
/*
 * Bookkeeping for one 4 KB mapping whose L2 entry value is pte2 and whose
 * PV entry is pv: dirty state is pushed to the vm_page, the resident count
 * is decremented, pv is unlinked from the page's pv list and the L2 page
 * table reference for pv->pv_va is dropped through pmap_unuse_pt2().
 */
static __inline void
pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv,
    struct spglist *free)
{
	vm_paddr_t pa;
	vm_page_t m;
	struct md_page *pvh;

	pa = pte2_pa(pte2);
	m = PHYS_TO_VM_PAGE(pa);

	KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x",
	    __func__, m, m->phys_addr, pa));
	KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
	    m < &vm_page_array[vm_page_array_size],
	    ("%s: bad pte2 %#x", __func__, pte2));

	if (pte2_is_dirty(pte2))
		vm_page_dirty(m);

	pmap->pm_stats.resident_count--;
	TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
	/*
	 * PGA_WRITEABLE is cleared only when neither a 4 KB nor a 1 MB
	 * mapping of the page remains.  Fictitious pages are left alone.
	 */
	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(pa);
		if (TAILQ_EMPTY(&pvh->pv_list))
			vm_page_aflag_clear(m, PGA_WRITEABLE);
	}
	pmap_unuse_pt2(pmap, pv->pv_va, free);
}

/*
 * Remove all pages from specified address space this aids process
 * exit speeds.  Also, this code is special cased for current process
 * only, but can have the more generic (and slightly slower) mode enabled.
 * This is much faster than pmap_remove in the case of running down
 * an entire address space.
 *
 * The walk is driven by the pmap's PV chunks rather than by the page
 * tables: every in-use PV entry of every chunk is visited, and wired
 * mappings are skipped.  A chunk is freed only when none of its entries
 * had to be skipped.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pt1_entry_t *pte1p, pte1;
	pt2_entry_t *pte2p, pte2;
	pv_entry_t pv;
	struct pv_chunk *pc, *npc;
	struct spglist free;
	int field, idx;
	int32_t bit;
	uint32_t inuse, bitmask;
	boolean_t allfree;

	/*
	 * Assert that the given pmap is only active on the current
	 * CPU.  Unfortunately, we cannot block another CPU from
	 * activating the pmap while this function is executing.
	 */
	KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace),
	    ("%s: non-current pmap %p", __func__, pmap));
#if defined(SMP) && defined(INVARIANTS)
	{
		cpuset_t other_cpus;

		sched_pin();
		other_cpus = pmap->pm_active;
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
		sched_unpin();
		KASSERT(CPU_EMPTY(&other_cpus),
		    ("%s: pmap %p active on other cpus", __func__, pmap));
	}
#endif
	SLIST_INIT(&free);
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	sched_pin();
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p",
		    __func__, pmap, pc->pc_pmap));
		allfree = TRUE;
		for (field = 0; field < _NPCM; field++) {
			/* A clear bit in pc_map marks an entry in use. */
			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
			while (inuse != 0) {
				bit = ffs(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * 32 + bit;
				pv = &pc->pc_pventry[idx];
				/*
				 * Cleared up front so that the "continue"
				 * statements below still make progress.
				 */
				inuse &= ~bitmask;

				/*
				 * Note that we cannot remove wired pages
				 * from a process' mapping at this time
				 */
				pte1p = pmap_pte1(pmap, pv->pv_va);
				pte1 = pte1_load(pte1p);
				if (pte1_is_section(pte1)) {
					if (pte1_is_wired(pte1)) {
						allfree = FALSE;
						continue;
					}
					pte1_clear(pte1p);
					pmap_remove_pte1_quick(pmap, pte1, pv,
					    &free);
				}
				else if (pte1_is_link(pte1)) {
					/*
					 * pt2map_entry() is usable because
					 * pmap is the current pmap (asserted
					 * at the top of the function).
					 */
					pte2p = pt2map_entry(pv->pv_va);
					pte2 = pte2_load(pte2p);

					if (!pte2_is_valid(pte2)) {
						printf("%s: pmap %p va %#x "
						    "pte2 %#x\n", __func__,
						    pmap, pv->pv_va, pte2);
						panic("bad pte2");
					}

					if (pte2_is_wired(pte2)) {
						allfree = FALSE;
						continue;
					}
					pte2_clear(pte2p);
					pmap_remove_pte2_quick(pmap, pte2, pv,
					    &free);
				} else {
					printf("%s: pmap %p va %#x pte1 %#x\n",
					    __func__, pmap, pv->pv_va, pte1);
					panic("bad pte1");
				}

				/* Mark free */
				PV_STAT(pv_entry_frees++);
				PV_STAT(pv_entry_spare++);
				pv_entry_count--;
				pc->pc_map[field] |= bitmask;
			}
		}
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	/* One flush at the end instead of one per removed mapping. */
	tlb_flush_all_ng_local();
	sched_unpin();
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, false);
}

/*
 * This code makes some *MAJOR* assumptions:
 * 1. Current pmap & pmap exists.
 * 2. Not wired.
 * 3. Read access.
 * 4. No L2 page table pages.
 * but is *MUCH* faster than pmap_enter...
 *
 * Tries to enter a read-only 4 KB mapping of page m at va without
 * sleeping.  mpt2pg is the L2 page table page returned by the previous
 * call (or NULL); it is reused when it also covers va.
 *
 * Returns the L2 page table page used for the new mapping so the caller
 * can pass it to the next call.  Returns NULL when nothing was entered
 * (the address is covered by a section, is already mapped, or an L2 page
 * table or PV entry could not be obtained) and also for any va at or
 * above VM_MAXUSER_ADDRESS.
 */
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpt2pg)
{
	pt2_entry_t *pte2p, pte2;
	vm_paddr_t pa;
	struct spglist free;
	uint32_t l2prot;

	KASSERT(!VA_IS_CLEANMAP(va) ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("%s: managed mapping within the clean submap", __func__));
	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * In the case that a L2 page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		u_int pte1_idx;
		pt1_entry_t pte1, *pte1p;
		vm_paddr_t pt2_pa;

		/*
		 * Get L1 page table things.
		 */
		pte1_idx = pte1_index(va);
		pte1p = pmap_pte1(pmap, va);
		pte1 = pte1_load(pte1p);

		if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) {
			/*
			 * Each of NPT2_IN_PG L2 page tables on the page can
			 * come here. Make sure that associated L1 page table
			 * link is established.
			 *
			 * QQQ: It comes that we don't establish all links to
			 *      L2 page tables for newly allocated L2 page
			 *      tables page.
			 */
			KASSERT(!pte1_is_section(pte1),
			    ("%s: pte1 %#x is section", __func__, pte1));
			if (!pte1_is_link(pte1)) {
				pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg),
				    pte1_idx);
				pte1_store(pte1p, PTE1_LINK(pt2_pa));
			}
			pt2_wirecount_inc(mpt2pg, pte1_idx);
		} else {
			/*
			 * If the L2 page table page is mapped, we just
			 * increment the hold count, and activate it.
			 */
			if (pte1_is_section(pte1)) {
				return (NULL);
			} else if (pte1_is_link(pte1)) {
				mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
				pt2_wirecount_inc(mpt2pg, pte1_idx);
			} else {
				mpt2pg = _pmap_allocpte2(pmap, va,
				    PMAP_ENTER_NOSLEEP);
				if (mpt2pg == NULL)
					return (NULL);
			}
		}
	} else {
		mpt2pg = NULL;
	}

	/*
	 * This call to pt2map_entry() makes the assumption that we are
	 * entering the page into the current pmap.  In order to support
	 * quick entry into any pmap, one would likely use pmap_pte2_quick().
	 * But that isn't as quick as pt2map_entry().
	 */
	pte2p = pt2map_entry(va);
	pte2 = pte2_load(pte2p);
	if (pte2_is_valid(pte2)) {
		if (mpt2pg != NULL) {
			/*
			 * Remove extra pte2 reference
			 */
			pt2_wirecount_dec(mpt2pg, pte1_index(va));
			mpt2pg = NULL;
		}
		return (NULL);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0 &&
	    !pmap_try_insert_pv_entry(pmap, va, m)) {
		if (mpt2pg != NULL) {
			/* Give back the reference taken above. */
			SLIST_INIT(&free);
			if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) {
				pmap_tlb_flush(pmap, va);
				vm_page_free_pages_toq(&free, false);
			}

			mpt2pg = NULL;
		}
		return (NULL);
	}

	/*
	 * Increment counters
	 */
	pmap->pm_stats.resident_count++;

	/*
	 * Now validate mapping with RO protection
	 */
	pa = VM_PAGE_TO_PHYS(m);
	l2prot = PTE2_RO | PTE2_NM;
	if (va < VM_MAXUSER_ADDRESS)
		l2prot |= PTE2_U | PTE2_NG;
	if ((prot & VM_PROT_EXECUTE) == 0)
		l2prot |= PTE2_NX;
	else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) {
		/*
		 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA
		 * is set.  QQQ: For more info, see comments in pmap_enter().
		 */
		cache_icache_sync_fresh(va, pa, PAGE_SIZE);
	}
	pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m)));

	return (mpt2pg);
}

/*
 * Locking wrapper around pmap_enter_quick_locked() for a single page.
 * The result is discarded: failure to create the mapping is not reported.
 */
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 * Tries to create a read- and/or execute-only 1 MB page mapping.  Returns
 * true if successful.  Returns false if (1) a mapping already exists at the
 * specified virtual address or (2) a PV entry cannot be allocated without
 * reclaiming another PV entry.
4676 */ 4677 static bool 4678 pmap_enter_1mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4679 { 4680 pt1_entry_t pte1; 4681 vm_paddr_t pa; 4682 4683 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4684 pa = VM_PAGE_TO_PHYS(m); 4685 pte1 = PTE1(pa, PTE1_NM | PTE1_RO, ATTR_TO_L1(vm_page_pte2_attr(m))); 4686 if ((prot & VM_PROT_EXECUTE) == 0) 4687 pte1 |= PTE1_NX; 4688 if (va < VM_MAXUSER_ADDRESS) 4689 pte1 |= PTE1_U; 4690 if (pmap != kernel_pmap) 4691 pte1 |= PTE1_NG; 4692 return (pmap_enter_pte1(pmap, va, pte1, PMAP_ENTER_NOSLEEP | 4693 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m) == KERN_SUCCESS); 4694 } 4695 4696 /* 4697 * Tries to create the specified 1 MB page mapping. Returns KERN_SUCCESS if 4698 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 4699 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 4700 * a mapping already exists at the specified virtual address. Returns 4701 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NORECLAIM was specified and PV entry 4702 * allocation failed. 4703 */ 4704 static int 4705 pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags, 4706 vm_page_t m) 4707 { 4708 struct spglist free; 4709 pt1_entry_t opte1, *pte1p; 4710 pt2_entry_t pte2, *pte2p; 4711 vm_offset_t cur, end; 4712 vm_page_t mt; 4713 4714 rw_assert(&pvh_global_lock, RA_WLOCKED); 4715 KASSERT((pte1 & (PTE1_NM | PTE1_RO)) == 0 || 4716 (pte1 & (PTE1_NM | PTE1_RO)) == (PTE1_NM | PTE1_RO), 4717 ("%s: pte1 has inconsistent NM and RO attributes", __func__)); 4718 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4719 pte1p = pmap_pte1(pmap, va); 4720 opte1 = pte1_load(pte1p); 4721 if (pte1_is_valid(opte1)) { 4722 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 4723 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4724 __func__, va, pmap); 4725 return (KERN_FAILURE); 4726 } 4727 /* Break the existing mapping(s). 
*/ 4728 SLIST_INIT(&free); 4729 if (pte1_is_section(opte1)) { 4730 /* 4731 * If the section resulted from a promotion, then a 4732 * reserved PT page could be freed. 4733 */ 4734 pmap_remove_pte1(pmap, pte1p, va, &free); 4735 } else { 4736 sched_pin(); 4737 end = va + PTE1_SIZE; 4738 for (cur = va, pte2p = pmap_pte2_quick(pmap, va); 4739 cur != end; cur += PAGE_SIZE, pte2p++) { 4740 pte2 = pte2_load(pte2p); 4741 if (!pte2_is_valid(pte2)) 4742 continue; 4743 if (pmap_remove_pte2(pmap, pte2p, cur, &free)) 4744 break; 4745 } 4746 sched_unpin(); 4747 } 4748 vm_page_free_pages_toq(&free, false); 4749 } 4750 if ((m->oflags & VPO_UNMANAGED) == 0) { 4751 /* 4752 * Abort this mapping if its PV entry could not be created. 4753 */ 4754 if (!pmap_pv_insert_pte1(pmap, va, pte1, flags)) { 4755 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4756 __func__, va, pmap); 4757 return (KERN_RESOURCE_SHORTAGE); 4758 } 4759 if ((pte1 & PTE1_RO) == 0) { 4760 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4761 vm_page_aflag_set(mt, PGA_WRITEABLE); 4762 } 4763 } 4764 4765 /* 4766 * Increment counters. 4767 */ 4768 if (pte1_is_wired(pte1)) 4769 pmap->pm_stats.wired_count += PTE1_SIZE / PAGE_SIZE; 4770 pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; 4771 4772 /* 4773 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4774 * is set. QQQ: For more info, see comments in pmap_enter(). 4775 */ 4776 if ((pte1 & PTE1_NX) == 0 && m->md.pat_mode == VM_MEMATTR_WB_WA && 4777 pmap != kernel_pmap && (!pte1_is_section(opte1) || 4778 pte1_pa(opte1) != VM_PAGE_TO_PHYS(m) || (opte1 & PTE2_NX) != 0)) 4779 cache_icache_sync_fresh(va, VM_PAGE_TO_PHYS(m), PTE1_SIZE); 4780 4781 /* 4782 * Map the section. 4783 */ 4784 pte1_store(pte1p, pte1); 4785 4786 pmap_pte1_mappings++; 4787 CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, 4788 pmap); 4789 return (KERN_SUCCESS); 4790 } 4791 4792 /* 4793 * Maps a sequence of resident pages belonging to the same object. 
4794 * The sequence begins with the given page m_start. This page is 4795 * mapped at the given virtual address start. Each subsequent page is 4796 * mapped at a virtual address that is offset from start by the same 4797 * amount as the page is offset from m_start within the object. The 4798 * last page in the sequence is the page with the largest offset from 4799 * m_start that can be mapped at a virtual address less than the given 4800 * virtual address end. Not every virtual page between start and end 4801 * is mapped; only those for which a resident page exists with the 4802 * corresponding offset from m_start are mapped. 4803 */ 4804 void 4805 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4806 vm_page_t m_start, vm_prot_t prot) 4807 { 4808 vm_offset_t va; 4809 vm_page_t m, mpt2pg; 4810 vm_pindex_t diff, psize; 4811 4812 PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", 4813 __func__, pmap, start, end, m_start, prot)); 4814 4815 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4816 psize = atop(end - start); 4817 mpt2pg = NULL; 4818 m = m_start; 4819 rw_wlock(&pvh_global_lock); 4820 PMAP_LOCK(pmap); 4821 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4822 va = start + ptoa(diff); 4823 if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && 4824 m->psind == 1 && sp_enabled && 4825 pmap_enter_1mpage(pmap, va, m, prot)) 4826 m = &m[PTE1_SIZE / PAGE_SIZE - 1]; 4827 else 4828 mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, 4829 mpt2pg); 4830 m = TAILQ_NEXT(m, listq); 4831 } 4832 rw_wunlock(&pvh_global_lock); 4833 PMAP_UNLOCK(pmap); 4834 } 4835 4836 /* 4837 * This code maps large physical mmap regions into the 4838 * processor address space. Note that some shortcuts 4839 * are taken, but the code works. 
4840 */ 4841 void 4842 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4843 vm_pindex_t pindex, vm_size_t size) 4844 { 4845 pt1_entry_t *pte1p; 4846 vm_paddr_t pa, pte2_pa; 4847 vm_page_t p; 4848 vm_memattr_t pat_mode; 4849 u_int l1attr, l1prot; 4850 4851 VM_OBJECT_ASSERT_WLOCKED(object); 4852 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4853 ("%s: non-device object", __func__)); 4854 if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { 4855 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4856 return; 4857 p = vm_page_lookup(object, pindex); 4858 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4859 ("%s: invalid page %p", __func__, p)); 4860 pat_mode = p->md.pat_mode; 4861 4862 /* 4863 * Abort the mapping if the first page is not physically 4864 * aligned to a 1MB page boundary. 4865 */ 4866 pte2_pa = VM_PAGE_TO_PHYS(p); 4867 if (pte2_pa & PTE1_OFFSET) 4868 return; 4869 4870 /* 4871 * Skip the first page. Abort the mapping if the rest of 4872 * the pages are not physically contiguous or have differing 4873 * memory attributes. 4874 */ 4875 p = TAILQ_NEXT(p, listq); 4876 for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; 4877 pa += PAGE_SIZE) { 4878 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4879 ("%s: invalid page %p", __func__, p)); 4880 if (pa != VM_PAGE_TO_PHYS(p) || 4881 pat_mode != p->md.pat_mode) 4882 return; 4883 p = TAILQ_NEXT(p, listq); 4884 } 4885 4886 /* 4887 * Map using 1MB pages. 4888 * 4889 * QQQ: Well, we are mapping a section, so same condition must 4890 * be hold like during promotion. It looks that only RW mapping 4891 * is done here, so readonly mapping must be done elsewhere. 
4892 */ 4893 l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; 4894 l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); 4895 PMAP_LOCK(pmap); 4896 for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { 4897 pte1p = pmap_pte1(pmap, addr); 4898 if (!pte1_is_valid(pte1_load(pte1p))) { 4899 pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); 4900 pmap->pm_stats.resident_count += PTE1_SIZE / 4901 PAGE_SIZE; 4902 pmap_pte1_mappings++; 4903 } 4904 /* Else continue on if the PTE1 is already valid. */ 4905 addr += PTE1_SIZE; 4906 } 4907 PMAP_UNLOCK(pmap); 4908 } 4909 } 4910 4911 /* 4912 * Do the things to protect a 1mpage in a process. 4913 */ 4914 static void 4915 pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 4916 vm_prot_t prot) 4917 { 4918 pt1_entry_t npte1, opte1; 4919 vm_offset_t eva, va; 4920 vm_page_t m; 4921 4922 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4923 KASSERT((sva & PTE1_OFFSET) == 0, 4924 ("%s: sva is not 1mpage aligned", __func__)); 4925 4926 opte1 = npte1 = pte1_load(pte1p); 4927 if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) { 4928 eva = sva + PTE1_SIZE; 4929 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 4930 va < eva; va += PAGE_SIZE, m++) 4931 vm_page_dirty(m); 4932 } 4933 if ((prot & VM_PROT_WRITE) == 0) 4934 npte1 |= PTE1_RO | PTE1_NM; 4935 if ((prot & VM_PROT_EXECUTE) == 0) 4936 npte1 |= PTE1_NX; 4937 4938 /* 4939 * QQQ: Herein, execute permission is never set. 4940 * It only can be cleared. So, no icache 4941 * syncing is needed. 4942 */ 4943 4944 if (npte1 != opte1) { 4945 pte1_store(pte1p, npte1); 4946 pmap_tlb_flush(pmap, sva); 4947 } 4948 } 4949 4950 /* 4951 * Set the physical protection on the 4952 * specified range of this map as requested. 
*/
/*
 * VM_PROT_NONE is handed to pmap_remove().  A request that keeps both
 * write and execute permission returns at once, because this function
 * only ever takes permissions away.
 *
 * For the current pmap the pvh global lock is not taken up front; it is
 * acquired lazily, the first time a section has to be demoted.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	boolean_t pv_lists_locked;
	vm_offset_t nextva;
	pt1_entry_t *pte1p, pte1;
	pt2_entry_t *pte2p, opte2, npte2;

	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
	if (prot == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
		return;

	if (pmap_is_current(pmap))
		pv_lists_locked = FALSE;
	else {
		pv_lists_locked = TRUE;
resume:
		/*
		 * Also reached by "goto resume" from the demotion path below,
		 * after the pmap lock has been dropped, so that the global
		 * lock is taken before the pmap lock.
		 */
		rw_wlock(&pvh_global_lock);
		sched_pin();
	}

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = nextva) {
		/*
		 * Calculate address for next L2 page table.
		 */
		nextva = pte1_trunc(sva + PTE1_SIZE);
		/* The addition wrapped around the top of the address space. */
		if (nextva < sva)
			nextva = eva;

		pte1p = pmap_pte1(pmap, sva);
		pte1 = pte1_load(pte1p);

		/*
		 * Weed out invalid mappings. Note: we assume that L1 page
		 * page table is always allocated, and in kernel virtual.
		 */
		if (pte1 == 0)
			continue;

		if (pte1_is_section(pte1)) {
			/*
			 * Are we protecting the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + PTE1_SIZE == nextva && eva >= nextva) {
				pmap_protect_pte1(pmap, pte1p, sva, prot);
				continue;
			} else {
				if (!pv_lists_locked) {
					pv_lists_locked = TRUE;
					/*
					 * Try without blocking first; on
					 * failure drop the pmap lock and
					 * restart from the current sva.
					 */
					if (!rw_try_wlock(&pvh_global_lock)) {
						PMAP_UNLOCK(pmap);
						goto resume;
					}
					sched_pin();
				}
				if (!pmap_demote_pte1(pmap, pte1p, sva)) {
					/*
					 * The large page mapping
					 * was destroyed.
					 */
					continue;
				}
#ifdef INVARIANTS
				else {
					/*
					 * Update pte1 after demotion.  The
					 * value is only consumed by the
					 * KASSERT below.
					 */
					pte1 = pte1_load(pte1p);
				}
#endif
			}
		}

		KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p"
		    " is not link", __func__, pmap, sva, pte1, pte1p));

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current L2 page table page, or to the end of the
		 * range being protected.
		 */
		if (nextva > eva)
			nextva = eva;

		for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++,
		    sva += PAGE_SIZE) {
			vm_page_t m;

			opte2 = npte2 = pte2_load(pte2p);
			if (!pte2_is_valid(opte2))
				continue;

			if ((prot & VM_PROT_WRITE) == 0) {
				/*
				 * Record the dirty state on the vm_page
				 * before the entry is made read-only and
				 * not-modified.
				 */
				if (pte2_is_managed(opte2) &&
				    pte2_is_dirty(opte2)) {
					m = PHYS_TO_VM_PAGE(pte2_pa(opte2));
					vm_page_dirty(m);
				}
				npte2 |= PTE2_RO | PTE2_NM;
			}

			if ((prot & VM_PROT_EXECUTE) == 0)
				npte2 |= PTE2_NX;

			/*
			 * QQQ: Herein, execute permission is never set.
			 *      It only can be cleared. So, no icache
			 *      syncing is needed.
			 */

			if (npte2 != opte2) {
				pte2_store(pte2p, npte2);
				pmap_tlb_flush(pmap, sva);
			}
		}
	}
	if (pv_lists_locked) {
		sched_unpin();
		rw_wunlock(&pvh_global_lock);
	}
	PMAP_UNLOCK(pmap);
}

/*
 *	pmap_pvh_wired_mappings:
 *
 *	Return the updated number "count" of managed mappings that are wired.
5087 */ 5088 static int 5089 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 5090 { 5091 pmap_t pmap; 5092 pt1_entry_t pte1; 5093 pt2_entry_t pte2; 5094 pv_entry_t pv; 5095 5096 rw_assert(&pvh_global_lock, RA_WLOCKED); 5097 sched_pin(); 5098 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5099 pmap = PV_PMAP(pv); 5100 PMAP_LOCK(pmap); 5101 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5102 if (pte1_is_section(pte1)) { 5103 if (pte1_is_wired(pte1)) 5104 count++; 5105 } else { 5106 KASSERT(pte1_is_link(pte1), 5107 ("%s: pte1 %#x is not link", __func__, pte1)); 5108 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5109 if (pte2_is_wired(pte2)) 5110 count++; 5111 } 5112 PMAP_UNLOCK(pmap); 5113 } 5114 sched_unpin(); 5115 return (count); 5116 } 5117 5118 /* 5119 * pmap_page_wired_mappings: 5120 * 5121 * Return the number of managed mappings to the given physical page 5122 * that are wired. 5123 */ 5124 int 5125 pmap_page_wired_mappings(vm_page_t m) 5126 { 5127 int count; 5128 5129 count = 0; 5130 if ((m->oflags & VPO_UNMANAGED) != 0) 5131 return (count); 5132 rw_wlock(&pvh_global_lock); 5133 count = pmap_pvh_wired_mappings(&m->md, count); 5134 if ((m->flags & PG_FICTITIOUS) == 0) { 5135 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 5136 count); 5137 } 5138 rw_wunlock(&pvh_global_lock); 5139 return (count); 5140 } 5141 5142 /* 5143 * Returns TRUE if any of the given mappings were used to modify 5144 * physical memory. Otherwise, returns FALSE. Both page and 1mpage 5145 * mappings are supported. 
5146 */ 5147 static boolean_t 5148 pmap_is_modified_pvh(struct md_page *pvh) 5149 { 5150 pv_entry_t pv; 5151 pt1_entry_t pte1; 5152 pt2_entry_t pte2; 5153 pmap_t pmap; 5154 boolean_t rv; 5155 5156 rw_assert(&pvh_global_lock, RA_WLOCKED); 5157 rv = FALSE; 5158 sched_pin(); 5159 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5160 pmap = PV_PMAP(pv); 5161 PMAP_LOCK(pmap); 5162 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5163 if (pte1_is_section(pte1)) { 5164 rv = pte1_is_dirty(pte1); 5165 } else { 5166 KASSERT(pte1_is_link(pte1), 5167 ("%s: pte1 %#x is not link", __func__, pte1)); 5168 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5169 rv = pte2_is_dirty(pte2); 5170 } 5171 PMAP_UNLOCK(pmap); 5172 if (rv) 5173 break; 5174 } 5175 sched_unpin(); 5176 return (rv); 5177 } 5178 5179 /* 5180 * pmap_is_modified: 5181 * 5182 * Return whether or not the specified physical page was modified 5183 * in any physical maps. 5184 */ 5185 boolean_t 5186 pmap_is_modified(vm_page_t m) 5187 { 5188 boolean_t rv; 5189 5190 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5191 ("%s: page %p is not managed", __func__, m)); 5192 5193 /* 5194 * If the page is not busied then this check is racy. 5195 */ 5196 if (!pmap_page_is_write_mapped(m)) 5197 return (FALSE); 5198 rw_wlock(&pvh_global_lock); 5199 rv = pmap_is_modified_pvh(&m->md) || 5200 ((m->flags & PG_FICTITIOUS) == 0 && 5201 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5202 rw_wunlock(&pvh_global_lock); 5203 return (rv); 5204 } 5205 5206 /* 5207 * pmap_is_prefaultable: 5208 * 5209 * Return whether or not the specified virtual address is eligible 5210 * for prefault. 
5211 */ 5212 boolean_t 5213 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5214 { 5215 pt1_entry_t pte1; 5216 pt2_entry_t pte2; 5217 boolean_t rv; 5218 5219 rv = FALSE; 5220 PMAP_LOCK(pmap); 5221 pte1 = pte1_load(pmap_pte1(pmap, addr)); 5222 if (pte1_is_link(pte1)) { 5223 pte2 = pte2_load(pt2map_entry(addr)); 5224 rv = !pte2_is_valid(pte2) ; 5225 } 5226 PMAP_UNLOCK(pmap); 5227 return (rv); 5228 } 5229 5230 /* 5231 * Returns TRUE if any of the given mappings were referenced and FALSE 5232 * otherwise. Both page and 1mpage mappings are supported. 5233 */ 5234 static boolean_t 5235 pmap_is_referenced_pvh(struct md_page *pvh) 5236 { 5237 5238 pv_entry_t pv; 5239 pt1_entry_t pte1; 5240 pt2_entry_t pte2; 5241 pmap_t pmap; 5242 boolean_t rv; 5243 5244 rw_assert(&pvh_global_lock, RA_WLOCKED); 5245 rv = FALSE; 5246 sched_pin(); 5247 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5248 pmap = PV_PMAP(pv); 5249 PMAP_LOCK(pmap); 5250 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5251 if (pte1_is_section(pte1)) { 5252 rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); 5253 } else { 5254 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5255 rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); 5256 } 5257 PMAP_UNLOCK(pmap); 5258 if (rv) 5259 break; 5260 } 5261 sched_unpin(); 5262 return (rv); 5263 } 5264 5265 /* 5266 * pmap_is_referenced: 5267 * 5268 * Return whether or not the specified physical page was referenced 5269 * in any physical maps. 
5270 */ 5271 boolean_t 5272 pmap_is_referenced(vm_page_t m) 5273 { 5274 boolean_t rv; 5275 5276 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5277 ("%s: page %p is not managed", __func__, m)); 5278 rw_wlock(&pvh_global_lock); 5279 rv = pmap_is_referenced_pvh(&m->md) || 5280 ((m->flags & PG_FICTITIOUS) == 0 && 5281 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5282 rw_wunlock(&pvh_global_lock); 5283 return (rv); 5284 } 5285 5286 /* 5287 * pmap_ts_referenced: 5288 * 5289 * Return a count of reference bits for a page, clearing those bits. 5290 * It is not necessary for every reference bit to be cleared, but it 5291 * is necessary that 0 only be returned when there are truly no 5292 * reference bits set. 5293 * 5294 * As an optimization, update the page's dirty field if a modified bit is 5295 * found while counting reference bits. This opportunistic update can be 5296 * performed at low cost and can eliminate the need for some future calls 5297 * to pmap_is_modified(). However, since this function stops after 5298 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5299 * dirty pages. Those dirty pages will only be detected by a future call 5300 * to pmap_is_modified(). 
5301 */ 5302 int 5303 pmap_ts_referenced(vm_page_t m) 5304 { 5305 struct md_page *pvh; 5306 pv_entry_t pv, pvf; 5307 pmap_t pmap; 5308 pt1_entry_t *pte1p, opte1; 5309 pt2_entry_t *pte2p, opte2; 5310 vm_paddr_t pa; 5311 int rtval = 0; 5312 5313 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5314 ("%s: page %p is not managed", __func__, m)); 5315 pa = VM_PAGE_TO_PHYS(m); 5316 pvh = pa_to_pvh(pa); 5317 rw_wlock(&pvh_global_lock); 5318 sched_pin(); 5319 if ((m->flags & PG_FICTITIOUS) != 0 || 5320 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5321 goto small_mappings; 5322 pv = pvf; 5323 do { 5324 pmap = PV_PMAP(pv); 5325 PMAP_LOCK(pmap); 5326 pte1p = pmap_pte1(pmap, pv->pv_va); 5327 opte1 = pte1_load(pte1p); 5328 if (pte1_is_dirty(opte1)) { 5329 /* 5330 * Although "opte1" is mapping a 1MB page, because 5331 * this function is called at a 4KB page granularity, 5332 * we only update the 4KB page under test. 5333 */ 5334 vm_page_dirty(m); 5335 } 5336 if ((opte1 & PTE1_A) != 0) { 5337 /* 5338 * Since this reference bit is shared by 256 4KB pages, 5339 * it should not be cleared every time it is tested. 5340 * Apply a simple "hash" function on the physical page 5341 * number, the virtual section number, and the pmap 5342 * address to select one 4KB page out of the 256 5343 * on which testing the reference bit will result 5344 * in clearing that bit. This function is designed 5345 * to avoid the selection of the same 4KB page 5346 * for every 1MB page mapping. 5347 * 5348 * On demotion, a mapping that hasn't been referenced 5349 * is simply destroyed. To avoid the possibility of a 5350 * subsequent page fault on a demoted wired mapping, 5351 * always leave its reference bit set. Moreover, 5352 * since the section is wired, the current state of 5353 * its reference bit won't affect page replacement. 
5354 */ 5355 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ 5356 (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && 5357 !pte1_is_wired(opte1)) { 5358 pte1_clear_bit(pte1p, PTE1_A); 5359 pmap_tlb_flush(pmap, pv->pv_va); 5360 } 5361 rtval++; 5362 } 5363 PMAP_UNLOCK(pmap); 5364 /* Rotate the PV list if it has more than one entry. */ 5365 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5366 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5367 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5368 } 5369 if (rtval >= PMAP_TS_REFERENCED_MAX) 5370 goto out; 5371 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5372 small_mappings: 5373 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5374 goto out; 5375 pv = pvf; 5376 do { 5377 pmap = PV_PMAP(pv); 5378 PMAP_LOCK(pmap); 5379 pte1p = pmap_pte1(pmap, pv->pv_va); 5380 KASSERT(pte1_is_link(pte1_load(pte1p)), 5381 ("%s: not found a link in page %p's pv list", __func__, m)); 5382 5383 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5384 opte2 = pte2_load(pte2p); 5385 if (pte2_is_dirty(opte2)) 5386 vm_page_dirty(m); 5387 if ((opte2 & PTE2_A) != 0) { 5388 pte2_clear_bit(pte2p, PTE2_A); 5389 pmap_tlb_flush(pmap, pv->pv_va); 5390 rtval++; 5391 } 5392 PMAP_UNLOCK(pmap); 5393 /* Rotate the PV list if it has more than one entry. */ 5394 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5395 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5396 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5397 } 5398 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5399 PMAP_TS_REFERENCED_MAX); 5400 out: 5401 sched_unpin(); 5402 rw_wunlock(&pvh_global_lock); 5403 return (rtval); 5404 } 5405 5406 /* 5407 * Clear the wired attribute from the mappings for the specified range of 5408 * addresses in the given pmap. Every valid mapping within that range 5409 * must have the wired attribute set. In contrast, invalid mappings 5410 * cannot have the wired attribute set, so they are ignored. 
5411 * 5412 * The wired attribute of the page table entry is not a hardware feature, 5413 * so there is no need to invalidate any TLB entries. 5414 */ 5415 void 5416 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5417 { 5418 vm_offset_t nextva; 5419 pt1_entry_t *pte1p, pte1; 5420 pt2_entry_t *pte2p, pte2; 5421 boolean_t pv_lists_locked; 5422 5423 if (pmap_is_current(pmap)) 5424 pv_lists_locked = FALSE; 5425 else { 5426 pv_lists_locked = TRUE; 5427 resume: 5428 rw_wlock(&pvh_global_lock); 5429 sched_pin(); 5430 } 5431 PMAP_LOCK(pmap); 5432 for (; sva < eva; sva = nextva) { 5433 nextva = pte1_trunc(sva + PTE1_SIZE); 5434 if (nextva < sva) 5435 nextva = eva; 5436 5437 pte1p = pmap_pte1(pmap, sva); 5438 pte1 = pte1_load(pte1p); 5439 5440 /* 5441 * Weed out invalid mappings. Note: we assume that L1 page 5442 * page table is always allocated, and in kernel virtual. 5443 */ 5444 if (pte1 == 0) 5445 continue; 5446 5447 if (pte1_is_section(pte1)) { 5448 if (!pte1_is_wired(pte1)) 5449 panic("%s: pte1 %#x not wired", __func__, pte1); 5450 5451 /* 5452 * Are we unwiring the entire large page? If not, 5453 * demote the mapping and fall through. 5454 */ 5455 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 5456 pte1_clear_bit(pte1p, PTE1_W); 5457 pmap->pm_stats.wired_count -= PTE1_SIZE / 5458 PAGE_SIZE; 5459 continue; 5460 } else { 5461 if (!pv_lists_locked) { 5462 pv_lists_locked = TRUE; 5463 if (!rw_try_wlock(&pvh_global_lock)) { 5464 PMAP_UNLOCK(pmap); 5465 /* Repeat sva. 
*/ 5466 goto resume; 5467 } 5468 sched_pin(); 5469 } 5470 if (!pmap_demote_pte1(pmap, pte1p, sva)) 5471 panic("%s: demotion failed", __func__); 5472 #ifdef INVARIANTS 5473 else { 5474 /* Update pte1 after demotion */ 5475 pte1 = pte1_load(pte1p); 5476 } 5477 #endif 5478 } 5479 } 5480 5481 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 5482 " is not link", __func__, pmap, sva, pte1, pte1p)); 5483 5484 /* 5485 * Limit our scan to either the end of the va represented 5486 * by the current L2 page table page, or to the end of the 5487 * range being protected. 5488 */ 5489 if (nextva > eva) 5490 nextva = eva; 5491 5492 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5493 sva += PAGE_SIZE) { 5494 pte2 = pte2_load(pte2p); 5495 if (!pte2_is_valid(pte2)) 5496 continue; 5497 if (!pte2_is_wired(pte2)) 5498 panic("%s: pte2 %#x is missing PTE2_W", 5499 __func__, pte2); 5500 5501 /* 5502 * PTE2_W must be cleared atomically. Although the pmap 5503 * lock synchronizes access to PTE2_W, another processor 5504 * could be changing PTE2_NM and/or PTE2_A concurrently. 5505 */ 5506 pte2_clear_bit(pte2p, PTE2_W); 5507 pmap->pm_stats.wired_count--; 5508 } 5509 } 5510 if (pv_lists_locked) { 5511 sched_unpin(); 5512 rw_wunlock(&pvh_global_lock); 5513 } 5514 PMAP_UNLOCK(pmap); 5515 } 5516 5517 /* 5518 * Clear the write and modified bits in each of the given page's mappings. 
5519 */ 5520 void 5521 pmap_remove_write(vm_page_t m) 5522 { 5523 struct md_page *pvh; 5524 pv_entry_t next_pv, pv; 5525 pmap_t pmap; 5526 pt1_entry_t *pte1p; 5527 pt2_entry_t *pte2p, opte2; 5528 vm_offset_t va; 5529 5530 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5531 ("%s: page %p is not managed", __func__, m)); 5532 vm_page_assert_busied(m); 5533 5534 if (!pmap_page_is_write_mapped(m)) 5535 return; 5536 rw_wlock(&pvh_global_lock); 5537 sched_pin(); 5538 if ((m->flags & PG_FICTITIOUS) != 0) 5539 goto small_mappings; 5540 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5541 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5542 va = pv->pv_va; 5543 pmap = PV_PMAP(pv); 5544 PMAP_LOCK(pmap); 5545 pte1p = pmap_pte1(pmap, va); 5546 if (!(pte1_load(pte1p) & PTE1_RO)) 5547 (void)pmap_demote_pte1(pmap, pte1p, va); 5548 PMAP_UNLOCK(pmap); 5549 } 5550 small_mappings: 5551 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5552 pmap = PV_PMAP(pv); 5553 PMAP_LOCK(pmap); 5554 pte1p = pmap_pte1(pmap, pv->pv_va); 5555 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5556 " a section in page %p's pv list", __func__, m)); 5557 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5558 opte2 = pte2_load(pte2p); 5559 if (!(opte2 & PTE2_RO)) { 5560 pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); 5561 if (pte2_is_dirty(opte2)) 5562 vm_page_dirty(m); 5563 pmap_tlb_flush(pmap, pv->pv_va); 5564 } 5565 PMAP_UNLOCK(pmap); 5566 } 5567 vm_page_aflag_clear(m, PGA_WRITEABLE); 5568 sched_unpin(); 5569 rw_wunlock(&pvh_global_lock); 5570 } 5571 5572 /* 5573 * Apply the given advice to the specified range of addresses within the 5574 * given pmap. Depending on the advice, clear the referenced and/or 5575 * modified flags in each mapping and set the mapped page's dirty field. 
5576 */ 5577 void 5578 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5579 { 5580 pt1_entry_t *pte1p, opte1; 5581 pt2_entry_t *pte2p, pte2; 5582 vm_offset_t pdnxt; 5583 vm_page_t m; 5584 boolean_t pv_lists_locked; 5585 5586 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5587 return; 5588 if (pmap_is_current(pmap)) 5589 pv_lists_locked = FALSE; 5590 else { 5591 pv_lists_locked = TRUE; 5592 resume: 5593 rw_wlock(&pvh_global_lock); 5594 sched_pin(); 5595 } 5596 PMAP_LOCK(pmap); 5597 for (; sva < eva; sva = pdnxt) { 5598 pdnxt = pte1_trunc(sva + PTE1_SIZE); 5599 if (pdnxt < sva) 5600 pdnxt = eva; 5601 pte1p = pmap_pte1(pmap, sva); 5602 opte1 = pte1_load(pte1p); 5603 if (!pte1_is_valid(opte1)) /* XXX */ 5604 continue; 5605 else if (pte1_is_section(opte1)) { 5606 if (!pte1_is_managed(opte1)) 5607 continue; 5608 if (!pv_lists_locked) { 5609 pv_lists_locked = TRUE; 5610 if (!rw_try_wlock(&pvh_global_lock)) { 5611 PMAP_UNLOCK(pmap); 5612 goto resume; 5613 } 5614 sched_pin(); 5615 } 5616 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5617 /* 5618 * The large page mapping was destroyed. 5619 */ 5620 continue; 5621 } 5622 5623 /* 5624 * Unless the page mappings are wired, remove the 5625 * mapping to a single page so that a subsequent 5626 * access may repromote. Since the underlying L2 page 5627 * table is fully populated, this removal never 5628 * frees a L2 page table page. 
5629 */ 5630 if (!pte1_is_wired(opte1)) { 5631 pte2p = pmap_pte2_quick(pmap, sva); 5632 KASSERT(pte2_is_valid(pte2_load(pte2p)), 5633 ("%s: invalid PTE2", __func__)); 5634 pmap_remove_pte2(pmap, pte2p, sva, NULL); 5635 } 5636 } 5637 if (pdnxt > eva) 5638 pdnxt = eva; 5639 for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, 5640 sva += PAGE_SIZE) { 5641 pte2 = pte2_load(pte2p); 5642 if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) 5643 continue; 5644 else if (pte2_is_dirty(pte2)) { 5645 if (advice == MADV_DONTNEED) { 5646 /* 5647 * Future calls to pmap_is_modified() 5648 * can be avoided by making the page 5649 * dirty now. 5650 */ 5651 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 5652 vm_page_dirty(m); 5653 } 5654 pte2_set_bit(pte2p, PTE2_NM); 5655 pte2_clear_bit(pte2p, PTE2_A); 5656 } else if ((pte2 & PTE2_A) != 0) 5657 pte2_clear_bit(pte2p, PTE2_A); 5658 else 5659 continue; 5660 pmap_tlb_flush(pmap, sva); 5661 } 5662 } 5663 if (pv_lists_locked) { 5664 sched_unpin(); 5665 rw_wunlock(&pvh_global_lock); 5666 } 5667 PMAP_UNLOCK(pmap); 5668 } 5669 5670 /* 5671 * Clear the modify bits on the specified physical page. 
5672 */ 5673 void 5674 pmap_clear_modify(vm_page_t m) 5675 { 5676 struct md_page *pvh; 5677 pv_entry_t next_pv, pv; 5678 pmap_t pmap; 5679 pt1_entry_t *pte1p, opte1; 5680 pt2_entry_t *pte2p, opte2; 5681 vm_offset_t va; 5682 5683 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5684 ("%s: page %p is not managed", __func__, m)); 5685 vm_page_assert_busied(m); 5686 5687 if (!pmap_page_is_write_mapped(m)) 5688 return; 5689 rw_wlock(&pvh_global_lock); 5690 sched_pin(); 5691 if ((m->flags & PG_FICTITIOUS) != 0) 5692 goto small_mappings; 5693 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5694 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5695 va = pv->pv_va; 5696 pmap = PV_PMAP(pv); 5697 PMAP_LOCK(pmap); 5698 pte1p = pmap_pte1(pmap, va); 5699 opte1 = pte1_load(pte1p); 5700 if (!(opte1 & PTE1_RO)) { 5701 if (pmap_demote_pte1(pmap, pte1p, va) && 5702 !pte1_is_wired(opte1)) { 5703 /* 5704 * Write protect the mapping to a 5705 * single page so that a subsequent 5706 * write access may repromote. 5707 */ 5708 va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); 5709 pte2p = pmap_pte2_quick(pmap, va); 5710 opte2 = pte2_load(pte2p); 5711 if ((opte2 & PTE2_V)) { 5712 pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); 5713 vm_page_dirty(m); 5714 pmap_tlb_flush(pmap, va); 5715 } 5716 } 5717 } 5718 PMAP_UNLOCK(pmap); 5719 } 5720 small_mappings: 5721 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5722 pmap = PV_PMAP(pv); 5723 PMAP_LOCK(pmap); 5724 pte1p = pmap_pte1(pmap, pv->pv_va); 5725 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5726 " a section in page %p's pv list", __func__, m)); 5727 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5728 if (pte2_is_dirty(pte2_load(pte2p))) { 5729 pte2_set_bit(pte2p, PTE2_NM); 5730 pmap_tlb_flush(pmap, pv->pv_va); 5731 } 5732 PMAP_UNLOCK(pmap); 5733 } 5734 sched_unpin(); 5735 rw_wunlock(&pvh_global_lock); 5736 } 5737 5738 /* 5739 * Sets the memory attribute for the specified page. 
5740 */ 5741 void 5742 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5743 { 5744 pt2_entry_t *cmap2_pte2p; 5745 vm_memattr_t oma; 5746 vm_paddr_t pa; 5747 struct pcpu *pc; 5748 5749 oma = m->md.pat_mode; 5750 m->md.pat_mode = ma; 5751 5752 CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, 5753 VM_PAGE_TO_PHYS(m), oma, ma); 5754 if ((m->flags & PG_FICTITIOUS) != 0) 5755 return; 5756 #if 0 5757 /* 5758 * If "m" is a normal page, flush it from the cache. 5759 * 5760 * First, try to find an existing mapping of the page by sf 5761 * buffer. sf_buf_invalidate_cache() modifies mapping and 5762 * flushes the cache. 5763 */ 5764 if (sf_buf_invalidate_cache(m, oma)) 5765 return; 5766 #endif 5767 /* 5768 * If page is not mapped by sf buffer, map the page 5769 * transient and do invalidation. 5770 */ 5771 if (ma != oma) { 5772 pa = VM_PAGE_TO_PHYS(m); 5773 sched_pin(); 5774 pc = get_pcpu(); 5775 cmap2_pte2p = pc->pc_cmap2_pte2p; 5776 mtx_lock(&pc->pc_cmap_lock); 5777 if (pte2_load(cmap2_pte2p) != 0) 5778 panic("%s: CMAP2 busy", __func__); 5779 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 5780 vm_memattr_to_pte2(ma))); 5781 dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); 5782 pte2_clear(cmap2_pte2p); 5783 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5784 sched_unpin(); 5785 mtx_unlock(&pc->pc_cmap_lock); 5786 } 5787 } 5788 5789 /* 5790 * Miscellaneous support routines follow 5791 */ 5792 5793 /* 5794 * Returns TRUE if the given page is mapped individually or as part of 5795 * a 1mpage. Otherwise, returns FALSE. 
5796 */ 5797 boolean_t 5798 pmap_page_is_mapped(vm_page_t m) 5799 { 5800 boolean_t rv; 5801 5802 if ((m->oflags & VPO_UNMANAGED) != 0) 5803 return (FALSE); 5804 rw_wlock(&pvh_global_lock); 5805 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5806 ((m->flags & PG_FICTITIOUS) == 0 && 5807 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5808 rw_wunlock(&pvh_global_lock); 5809 return (rv); 5810 } 5811 5812 /* 5813 * Returns true if the pmap's pv is one of the first 5814 * 16 pvs linked to from this page. This count may 5815 * be changed upwards or downwards in the future; it 5816 * is only necessary that true be returned for a small 5817 * subset of pmaps for proper page aging. 5818 */ 5819 boolean_t 5820 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5821 { 5822 struct md_page *pvh; 5823 pv_entry_t pv; 5824 int loops = 0; 5825 boolean_t rv; 5826 5827 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5828 ("%s: page %p is not managed", __func__, m)); 5829 rv = FALSE; 5830 rw_wlock(&pvh_global_lock); 5831 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5832 if (PV_PMAP(pv) == pmap) { 5833 rv = TRUE; 5834 break; 5835 } 5836 loops++; 5837 if (loops >= 16) 5838 break; 5839 } 5840 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5841 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5842 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5843 if (PV_PMAP(pv) == pmap) { 5844 rv = TRUE; 5845 break; 5846 } 5847 loops++; 5848 if (loops >= 16) 5849 break; 5850 } 5851 } 5852 rw_wunlock(&pvh_global_lock); 5853 return (rv); 5854 } 5855 5856 /* 5857 * pmap_zero_page zeros the specified hardware page by mapping 5858 * the page into KVM and using bzero to clear its contents. 
5859 */ 5860 void 5861 pmap_zero_page(vm_page_t m) 5862 { 5863 pt2_entry_t *cmap2_pte2p; 5864 struct pcpu *pc; 5865 5866 sched_pin(); 5867 pc = get_pcpu(); 5868 cmap2_pte2p = pc->pc_cmap2_pte2p; 5869 mtx_lock(&pc->pc_cmap_lock); 5870 if (pte2_load(cmap2_pte2p) != 0) 5871 panic("%s: CMAP2 busy", __func__); 5872 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5873 vm_page_pte2_attr(m))); 5874 pagezero(pc->pc_cmap2_addr); 5875 pte2_clear(cmap2_pte2p); 5876 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5877 sched_unpin(); 5878 mtx_unlock(&pc->pc_cmap_lock); 5879 } 5880 5881 /* 5882 * pmap_zero_page_area zeros the specified hardware page by mapping 5883 * the page into KVM and using bzero to clear its contents. 5884 * 5885 * off and size may not cover an area beyond a single hardware page. 5886 */ 5887 void 5888 pmap_zero_page_area(vm_page_t m, int off, int size) 5889 { 5890 pt2_entry_t *cmap2_pte2p; 5891 struct pcpu *pc; 5892 5893 sched_pin(); 5894 pc = get_pcpu(); 5895 cmap2_pte2p = pc->pc_cmap2_pte2p; 5896 mtx_lock(&pc->pc_cmap_lock); 5897 if (pte2_load(cmap2_pte2p) != 0) 5898 panic("%s: CMAP2 busy", __func__); 5899 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5900 vm_page_pte2_attr(m))); 5901 if (off == 0 && size == PAGE_SIZE) 5902 pagezero(pc->pc_cmap2_addr); 5903 else 5904 bzero(pc->pc_cmap2_addr + off, size); 5905 pte2_clear(cmap2_pte2p); 5906 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5907 sched_unpin(); 5908 mtx_unlock(&pc->pc_cmap_lock); 5909 } 5910 5911 /* 5912 * pmap_copy_page copies the specified (machine independent) 5913 * page by mapping the page into virtual memory and using 5914 * bcopy to copy the page, one machine dependent page at a 5915 * time. 
5916 */ 5917 void 5918 pmap_copy_page(vm_page_t src, vm_page_t dst) 5919 { 5920 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5921 struct pcpu *pc; 5922 5923 sched_pin(); 5924 pc = get_pcpu(); 5925 cmap1_pte2p = pc->pc_cmap1_pte2p; 5926 cmap2_pte2p = pc->pc_cmap2_pte2p; 5927 mtx_lock(&pc->pc_cmap_lock); 5928 if (pte2_load(cmap1_pte2p) != 0) 5929 panic("%s: CMAP1 busy", __func__); 5930 if (pte2_load(cmap2_pte2p) != 0) 5931 panic("%s: CMAP2 busy", __func__); 5932 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), 5933 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); 5934 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), 5935 PTE2_AP_KRW, vm_page_pte2_attr(dst))); 5936 bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); 5937 pte2_clear(cmap1_pte2p); 5938 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5939 pte2_clear(cmap2_pte2p); 5940 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5941 sched_unpin(); 5942 mtx_unlock(&pc->pc_cmap_lock); 5943 } 5944 5945 int unmapped_buf_allowed = 1; 5946 5947 void 5948 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5949 vm_offset_t b_offset, int xfersize) 5950 { 5951 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5952 vm_page_t a_pg, b_pg; 5953 char *a_cp, *b_cp; 5954 vm_offset_t a_pg_offset, b_pg_offset; 5955 struct pcpu *pc; 5956 int cnt; 5957 5958 sched_pin(); 5959 pc = get_pcpu(); 5960 cmap1_pte2p = pc->pc_cmap1_pte2p; 5961 cmap2_pte2p = pc->pc_cmap2_pte2p; 5962 mtx_lock(&pc->pc_cmap_lock); 5963 if (pte2_load(cmap1_pte2p) != 0) 5964 panic("pmap_copy_pages: CMAP1 busy"); 5965 if (pte2_load(cmap2_pte2p) != 0) 5966 panic("pmap_copy_pages: CMAP2 busy"); 5967 while (xfersize > 0) { 5968 a_pg = ma[a_offset >> PAGE_SHIFT]; 5969 a_pg_offset = a_offset & PAGE_MASK; 5970 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5971 b_pg = mb[b_offset >> PAGE_SHIFT]; 5972 b_pg_offset = b_offset & PAGE_MASK; 5973 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5974 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), 5975 PTE2_AP_KR 
| PTE2_NM, vm_page_pte2_attr(a_pg))); 5976 tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); 5977 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), 5978 PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); 5979 tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); 5980 a_cp = pc->pc_cmap1_addr + a_pg_offset; 5981 b_cp = pc->pc_cmap2_addr + b_pg_offset; 5982 bcopy(a_cp, b_cp, cnt); 5983 a_offset += cnt; 5984 b_offset += cnt; 5985 xfersize -= cnt; 5986 } 5987 pte2_clear(cmap1_pte2p); 5988 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5989 pte2_clear(cmap2_pte2p); 5990 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5991 sched_unpin(); 5992 mtx_unlock(&pc->pc_cmap_lock); 5993 } 5994 5995 vm_offset_t 5996 pmap_quick_enter_page(vm_page_t m) 5997 { 5998 struct pcpu *pc; 5999 pt2_entry_t *pte2p; 6000 6001 critical_enter(); 6002 pc = get_pcpu(); 6003 pte2p = pc->pc_qmap_pte2p; 6004 6005 KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); 6006 6007 pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6008 vm_page_pte2_attr(m))); 6009 return (pc->pc_qmap_addr); 6010 } 6011 6012 void 6013 pmap_quick_remove_page(vm_offset_t addr) 6014 { 6015 struct pcpu *pc; 6016 pt2_entry_t *pte2p; 6017 6018 pc = get_pcpu(); 6019 pte2p = pc->pc_qmap_pte2p; 6020 6021 KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); 6022 KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); 6023 6024 pte2_clear(pte2p); 6025 tlb_flush(pc->pc_qmap_addr); 6026 critical_exit(); 6027 } 6028 6029 /* 6030 * Copy the range specified by src_addr/len 6031 * from the source map to the range dst_addr/len 6032 * in the destination map. 6033 * 6034 * This routine is only advisory and need not do anything. 
6035 */ 6036 void 6037 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 6038 vm_offset_t src_addr) 6039 { 6040 struct spglist free; 6041 vm_offset_t addr; 6042 vm_offset_t end_addr = src_addr + len; 6043 vm_offset_t nextva; 6044 6045 if (dst_addr != src_addr) 6046 return; 6047 6048 if (!pmap_is_current(src_pmap)) 6049 return; 6050 6051 rw_wlock(&pvh_global_lock); 6052 if (dst_pmap < src_pmap) { 6053 PMAP_LOCK(dst_pmap); 6054 PMAP_LOCK(src_pmap); 6055 } else { 6056 PMAP_LOCK(src_pmap); 6057 PMAP_LOCK(dst_pmap); 6058 } 6059 sched_pin(); 6060 for (addr = src_addr; addr < end_addr; addr = nextva) { 6061 pt2_entry_t *src_pte2p, *dst_pte2p; 6062 vm_page_t dst_mpt2pg, src_mpt2pg; 6063 pt1_entry_t src_pte1; 6064 u_int pte1_idx; 6065 6066 KASSERT(addr < VM_MAXUSER_ADDRESS, 6067 ("%s: invalid to pmap_copy page tables", __func__)); 6068 6069 nextva = pte1_trunc(addr + PTE1_SIZE); 6070 if (nextva < addr) 6071 nextva = end_addr; 6072 6073 pte1_idx = pte1_index(addr); 6074 src_pte1 = src_pmap->pm_pt1[pte1_idx]; 6075 if (pte1_is_section(src_pte1)) { 6076 if ((addr & PTE1_OFFSET) != 0 || 6077 (addr + PTE1_SIZE) > end_addr) 6078 continue; 6079 if (dst_pmap->pm_pt1[pte1_idx] == 0 && 6080 (!pte1_is_managed(src_pte1) || 6081 pmap_pv_insert_pte1(dst_pmap, addr, src_pte1, 6082 PMAP_ENTER_NORECLAIM))) { 6083 dst_pmap->pm_pt1[pte1_idx] = src_pte1 & 6084 ~PTE1_W; 6085 dst_pmap->pm_stats.resident_count += 6086 PTE1_SIZE / PAGE_SIZE; 6087 pmap_pte1_mappings++; 6088 } 6089 continue; 6090 } else if (!pte1_is_link(src_pte1)) 6091 continue; 6092 6093 src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); 6094 6095 /* 6096 * We leave PT2s to be linked from PT1 even if they are not 6097 * referenced until all PT2s in a page are without reference. 6098 * 6099 * QQQ: It could be changed ... 
6100 */ 6101 #if 0 /* single_pt2_link_is_cleared */ 6102 KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, 6103 ("%s: source page table page is unused", __func__)); 6104 #else 6105 if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) 6106 continue; 6107 #endif 6108 if (nextva > end_addr) 6109 nextva = end_addr; 6110 6111 src_pte2p = pt2map_entry(addr); 6112 while (addr < nextva) { 6113 pt2_entry_t temp_pte2; 6114 temp_pte2 = pte2_load(src_pte2p); 6115 /* 6116 * we only virtual copy managed pages 6117 */ 6118 if (pte2_is_managed(temp_pte2)) { 6119 dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, 6120 PMAP_ENTER_NOSLEEP); 6121 if (dst_mpt2pg == NULL) 6122 goto out; 6123 dst_pte2p = pmap_pte2_quick(dst_pmap, addr); 6124 if (!pte2_is_valid(pte2_load(dst_pte2p)) && 6125 pmap_try_insert_pv_entry(dst_pmap, addr, 6126 PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { 6127 /* 6128 * Clear the wired, modified, and 6129 * accessed (referenced) bits 6130 * during the copy. 6131 */ 6132 temp_pte2 &= ~(PTE2_W | PTE2_A); 6133 temp_pte2 |= PTE2_NM; 6134 pte2_store(dst_pte2p, temp_pte2); 6135 dst_pmap->pm_stats.resident_count++; 6136 } else { 6137 SLIST_INIT(&free); 6138 if (pmap_unwire_pt2(dst_pmap, addr, 6139 dst_mpt2pg, &free)) { 6140 pmap_tlb_flush(dst_pmap, addr); 6141 vm_page_free_pages_toq(&free, 6142 false); 6143 } 6144 goto out; 6145 } 6146 if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= 6147 pt2_wirecount_get(src_mpt2pg, pte1_idx)) 6148 break; 6149 } 6150 addr += PAGE_SIZE; 6151 src_pte2p++; 6152 } 6153 } 6154 out: 6155 sched_unpin(); 6156 rw_wunlock(&pvh_global_lock); 6157 PMAP_UNLOCK(src_pmap); 6158 PMAP_UNLOCK(dst_pmap); 6159 } 6160 6161 /* 6162 * Increase the starting virtual address of the given mapping if a 6163 * different alignment might result in more section mappings. 
6164 */ 6165 void 6166 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6167 vm_offset_t *addr, vm_size_t size) 6168 { 6169 vm_offset_t pte1_offset; 6170 6171 if (size < PTE1_SIZE) 6172 return; 6173 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6174 offset += ptoa(object->pg_color); 6175 pte1_offset = offset & PTE1_OFFSET; 6176 if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || 6177 (*addr & PTE1_OFFSET) == pte1_offset) 6178 return; 6179 if ((*addr & PTE1_OFFSET) < pte1_offset) 6180 *addr = pte1_trunc(*addr) + pte1_offset; 6181 else 6182 *addr = pte1_roundup(*addr) + pte1_offset; 6183 } 6184 6185 void 6186 pmap_activate(struct thread *td) 6187 { 6188 pmap_t pmap, oldpmap; 6189 u_int cpuid, ttb; 6190 6191 PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); 6192 6193 critical_enter(); 6194 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6195 oldpmap = PCPU_GET(curpmap); 6196 cpuid = PCPU_GET(cpuid); 6197 6198 #if defined(SMP) 6199 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6200 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6201 #else 6202 CPU_CLR(cpuid, &oldpmap->pm_active); 6203 CPU_SET(cpuid, &pmap->pm_active); 6204 #endif 6205 6206 ttb = pmap_ttb_get(pmap); 6207 6208 /* 6209 * pmap_activate is for the current thread on the current cpu 6210 */ 6211 td->td_pcb->pcb_pagedir = ttb; 6212 cp15_ttbr_set(ttb); 6213 PCPU_SET(curpmap, pmap); 6214 critical_exit(); 6215 } 6216 6217 /* 6218 * Perform the pmap work for mincore(2). If the page is not both referenced and 6219 * modified by this pmap, returns its physical address so that the caller can 6220 * find other mappings. 
*/
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
{
	pt1_entry_t *pte1p, pte1;
	pt2_entry_t *pte2p, pte2;
	vm_paddr_t pa;
	bool managed;
	int val;

	/* The page table entries are examined under the pmap lock. */
	PMAP_LOCK(pmap);
	pte1p = pmap_pte1(pmap, addr);
	pte1 = pte1_load(pte1p);
	if (pte1_is_section(pte1)) {
		/*
		 * Section mapping: report it with page size index 1 and
		 * compute the physical address of the page within it.
		 */
		pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET));
		managed = pte1_is_managed(pte1);
		val = MINCORE_PSIND(1) | MINCORE_INCORE;
		if (pte1_is_dirty(pte1))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if (pte1 & PTE1_A)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	} else if (pte1_is_link(pte1)) {
		/*
		 * L1 entry links to an L2 page table: take a snapshot of
		 * the L2 entry and release the temporary reference to it.
		 *
		 * NOTE(review): MINCORE_INCORE is reported here without
		 * testing the L2 entry for validity - confirm that this is
		 * intended.
		 */
		pte2p = pmap_pte2(pmap, addr);
		pte2 = pte2_load(pte2p);
		pmap_pte2_release(pte2p);
		pa = pte2_pa(pte2);
		managed = pte2_is_managed(pte2);
		val = MINCORE_INCORE;
		if (pte2_is_dirty(pte2))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if (pte2 & PTE2_A)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	} else {
		/* Neither a section nor a link: nothing is mapped here. */
		managed = false;
		val = 0;
	}
	/*
	 * The physical address is handed back only for a managed mapping
	 * which is not already both modified and referenced.  Note that "pa"
	 * is always assigned whenever "managed" is true.
	 */
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
		*pap = pa;
	}
	PMAP_UNLOCK(pmap);
	return (val);
}

/*
 * Map the physical range [pa, pa + size) at kernel virtual address va,
 * one page at a time, with kernel read/write permission (PTE2_AP_KRW)
 * and the VM_MEMATTR_DEVICE memory attribute.  The size must be a multiple
 * of the page size.  The TLB is flushed for the whole virtual range once
 * all entries are written.
 */
void
pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa)
{
	vm_offset_t sva;
	uint32_t l2attr;

	KASSERT((size & PAGE_MASK) == 0,
	    ("%s: device mapping not page-sized", __func__));

	/* Remember the start of the range for the final TLB flush. */
	sva = va;
	l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE);
	while (size != 0) {
		pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	tlb_flush_range(sva, va - sva);
}

/*
 * Remove the kernel mappings of the virtual range [va, va + size), one
 * page at a time, and flush the TLB for the whole range afterwards.
 * The size must be a multiple of the page size.
 */
void
pmap_kremove_device(vm_offset_t va, vm_size_t size)
{
	vm_offset_t sva;

	KASSERT((size & PAGE_MASK) == 0,
	    ("%s: device mapping not page-sized", __func__));

	/* Remember the start of the range for the final TLB flush. */
	sva = va;
	while (size != 0) {
		pmap_kremove(va);
		va += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	tlb_flush_range(sva, va - sva);
}

/*
 * Record the value returned by pmap_ttb_get() for the given pmap in the
 * pcb_pagedir field of the given pcb.
 */
void
pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb)
{

	pcb->pcb_pagedir = pmap_ttb_get(pmap);
}

/*
 *  Clean L1 data cache range by physical address.
 *  The range must be within a single page.
 *
 *  The physical page is temporarily mapped through the per-CPU CMAP2 slot
 *  with the given memory attribute, so the thread is pinned and the per-CPU
 *  cmap lock is held while the slot is in use.
 */
static void
pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr)
{
	pt2_entry_t *cmap2_pte2p;
	struct pcpu *pc;

	KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE,
	    ("%s: not on single page", __func__));

	sched_pin();
	pc = get_pcpu();
	cmap2_pte2p = pc->pc_cmap2_pte2p;
	mtx_lock(&pc->pc_cmap_lock);
	/* The slot must be empty; a leftover mapping is a fatal error. */
	if (pte2_load(cmap2_pte2p) != 0)
		panic("%s: CMAP2 busy", __func__);
	pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr));
	/* Write back through the temporary mapping, keeping the page offset. */
	dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size);
	/* Tear the temporary mapping down again. */
	pte2_clear(cmap2_pte2p);
	tlb_flush((vm_offset_t)pc->pc_cmap2_addr);
	sched_unpin();
	mtx_unlock(&pc->pc_cmap_lock);
}

/*
 *  Sync instruction cache range which is not mapped yet.
 *
 *  The range is given by physical address "pa" and "size"; the "va"
 *  argument is not used by this implementation.
 */
void
cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size)
{
	uint32_t len, offset;
	vm_page_t m;

	/*
	 * Write back d-cache on given address range.  The range is processed
	 * in pieces which never cross a page boundary; only the first piece
	 * may start at a non-zero offset within its page.
	 */
	offset = pa & PAGE_MASK;
	for ( ; size != 0; size -= len, pa += len, offset = 0) {
		len = min(PAGE_SIZE - offset, size);
		m = PHYS_TO_VM_PAGE(pa);
		KASSERT(m != NULL, ("%s: vm_page_t is null for %#x",
		    __func__, pa));
		pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m));
	}
	/*
	 * I-cache is VIPT. The only way to flush all virtual mappings
	 * of a given physical address is to invalidate all i-cache.
	 */
	icache_inv_all();
}

/*
 * Sync instruction cache for the virtual range [va, va + size) of the
 * given pmap.  Kernel addresses are written back directly by virtual
 * address; user addresses are translated page by page and written back
 * by physical address.
 */
void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size)
{

	/* Write back d-cache on given address range. */
	if (va >= VM_MIN_KERNEL_ADDRESS) {
		dcache_wb_pou(va, size);
	} else {
		uint32_t len, offset;
		vm_paddr_t pa;
		vm_page_t m;

		/*
		 * Walk the range in pieces which never cross a page
		 * boundary; only the first piece may start at a non-zero
		 * offset within its page.
		 */
		offset = va & PAGE_MASK;
		for ( ; size != 0; size -= len, va += len, offset = 0) {
			pa = pmap_extract(pmap, va); /* offset is preserved */
			len = min(PAGE_SIZE - offset, size);
			m = PHYS_TO_VM_PAGE(pa);
			KASSERT(m != NULL, ("%s: vm_page_t is null for %#x",
			    __func__, pa));
			pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m));
		}
	}
	/*
	 * I-cache is VIPT. The only way to flush all virtual mappings
	 * of a given physical address is to invalidate all i-cache.
	 */
	icache_inv_all();
}

/*
 *  The implementation of pmap_fault() uses IN_RANGE2() macro which
 *  depends on the fact that given range size is a power of 2.
 */
CTASSERT(powerof2(NB_IN_PT1));
CTASSERT(powerof2(PT2MAP_SIZE));

/*
 * True if "addr" lies in the range which starts at "start" and is "size"
 * bytes long.  The test masks off the low bits of "addr", so "size" must be
 * a power of 2; it is only meaningful for a "start" aligned to "size".
 */
#define IN_RANGE2(addr, start, size)	\
    ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1)))

/*
 *  Handle access and R/W emulation faults.
 *
 *  A NULL pmap stands for the kernel pmap.  The function returns
 *  KERN_SUCCESS when the abort was resolved here (or was found to be
 *  the result of a harmless race), KERN_INVALID_ADDRESS for tolerated
 *  user mode aborts on the page table address ranges, and KERN_FAILURE
 *  when the abort must be handled elsewhere.
 */
int
pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode)
{
	pt1_entry_t *pte1p, pte1;
	pt2_entry_t *pte2p, pte2;

	if (pmap == NULL)
		pmap = kernel_pmap;

	/*
	 * In kernel, we should never get abort with FAR which is in range of
	 * pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here
	 * and print out a useful abort message and even get to the debugger
	 * otherwise it likely ends with never ending loop of aborts.
	 */
	if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) {
		/*
		 * All L1 tables should always be mapped and present.
		 * However, we check only current one herein. For user mode,
		 * only permission abort from malicious user is not fatal.
		 * And alignment abort as it may have higher priority.
		 */
		if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) {
			CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x",
			    __func__, pmap, pmap->pm_pt1, far);
			panic("%s: pm_pt1 abort", __func__);
		}
		return (KERN_INVALID_ADDRESS);
	}
	if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) {
		/*
		 * PT2MAP should be always mapped and present in current
		 * L1 table. However, only existing L2 tables are mapped
		 * in PT2MAP. For user mode, only L2 translation abort and
		 * permission abort from malicious user is not fatal.
		 * And alignment abort as it may have higher priority.
		 */
		if (!usermode || (idx != FAULT_ALIGN &&
		    idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) {
			CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x",
			    __func__, pmap, PT2MAP, far);
			panic("%s: PT2MAP abort", __func__);
		}
		return (KERN_INVALID_ADDRESS);
	}

	/*
	 * A pmap lock is used below for handling of access and R/W emulation
	 * aborts. They were handled by atomic operations before so some
	 * analysis of new situation is needed to answer the following question:
	 * Is it safe to use the lock even for these aborts?
	 *
	 * There may happen two cases in general:
	 *
	 * (1) Aborts while the pmap lock is locked already - this should not
	 * happen as pmap lock is not recursive. However, under pmap lock only
	 * internal kernel data should be accessed and such data should be
	 * mapped with A bit set and NM bit cleared. If double abort happens,
	 * then a mapping of data which has caused it must be fixed. Further,
	 * all new mappings are always made with A bit set and the bit can be
	 * cleared only on managed mappings.
	 *
	 * (2) Aborts while another lock(s) is/are locked - this already can
	 * happen. However, there is no difference here if it's either access or
	 * R/W emulation abort, or if it's some other abort.
	 */

	/* From here on, every return path must drop the pmap lock first. */
	PMAP_LOCK(pmap);
#ifdef INVARIANTS
	pte1 = pte1_load(pmap_pte1(pmap, far));
	if (pte1_is_link(pte1)) {
		/*
		 * Check in advance that associated L2 page table is mapped into
		 * PT2MAP space. Note that faulty access to not mapped L2 page
		 * table is caught in more general check above where "far" is
		 * checked that it does not lie in PT2MAP space. Note also that
		 * L1 page table and PT2TAB always exist and are mapped.
		 */
		pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far));
		if (!pte2_is_valid(pte2))
			panic("%s: missing L2 page table (%p, %#x)",
			    __func__, pmap, far);
	}
#endif
#ifdef SMP
	/*
	 * Special treatment is due to break-before-make approach done when
	 * pte1 is updated for userland mapping during section promotion or
	 * demotion. If not caught here, pmap_enter() can find a section
	 * mapping on faulting address. That is not allowed.
	 */
	if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) {
		PMAP_UNLOCK(pmap);
		return (KERN_SUCCESS);
	}
#endif
	/*
	 * Access bits for page and section. Note that the entry
	 * is not in TLB yet, so TLB flush is not necessary.
	 *
	 * QQQ: This is hardware emulation, we do not call userret()
	 *      for aborts from user mode.
	 */
	if (idx == FAULT_ACCESS_L2) {
		pte1 = pte1_load(pmap_pte1(pmap, far));
		if (pte1_is_link(pte1)) {
			/* L2 page table should exist and be mapped. */
			pte2p = pt2map_entry(far);
			pte2 = pte2_load(pte2p);
			if (pte2_is_valid(pte2)) {
				pte2_store(pte2p, pte2 | PTE2_A);
				PMAP_UNLOCK(pmap);
				return (KERN_SUCCESS);
			}
			/*
			 * An invalid L2 entry is not handled here; the abort
			 * falls through to the KERN_FAILURE return below.
			 */
		} else {
			/*
			 * We got L2 access fault but PTE1 is not a link.
			 * Probably some race happened, do nothing.
			 */
			CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L2 - pmap %#x far %#x",
			    __func__, pmap, far);
			PMAP_UNLOCK(pmap);
			return (KERN_SUCCESS);
		}
	}
	if (idx == FAULT_ACCESS_L1) {
		pte1p = pmap_pte1(pmap, far);
		pte1 = pte1_load(pte1p);
		if (pte1_is_section(pte1)) {
			pte1_store(pte1p, pte1 | PTE1_A);
			PMAP_UNLOCK(pmap);
			return (KERN_SUCCESS);
		} else {
			/*
			 * We got L1 access fault but PTE1 is not section
			 * mapping. Probably some race happened, do nothing.
			 */
			CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L1 - pmap %#x far %#x",
			    __func__, pmap, far);
			PMAP_UNLOCK(pmap);
			return (KERN_SUCCESS);
		}
	}

	/*
	 * Handle modify bits for page and section. Note that the modify
	 * bit is emulated by software. So PTEx_RO is software read only
	 * bit and PTEx_NM flag is real hardware read only bit.
	 *
	 * QQQ: This is hardware emulation, we do not call userret()
	 *      for aborts from user mode.
	 */
	if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) {
		pte1 = pte1_load(pmap_pte1(pmap, far));
		if (pte1_is_link(pte1)) {
			/* L2 page table should exist and be mapped. */
			pte2p = pt2map_entry(far);
			pte2 = pte2_load(pte2p);
			/*
			 * Only a valid, software-writable entry which is
			 * still marked not-modified is fixed up here.
			 */
			if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) &&
			    (pte2 & PTE2_NM)) {
				pte2_store(pte2p, pte2 & ~PTE2_NM);
				tlb_flush(trunc_page(far));
				PMAP_UNLOCK(pmap);
				return (KERN_SUCCESS);
			}
		} else {
			/*
			 * We got L2 permission fault but PTE1 is not a link.
			 * Probably some race happened, do nothing.
			 */
			CTR3(KTR_PMAP, "%s: FAULT_PERM_L2 - pmap %#x far %#x",
			    __func__, pmap, far);
			PMAP_UNLOCK(pmap);
			return (KERN_SUCCESS);
		}
	}
	if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) {
		pte1p = pmap_pte1(pmap, far);
		pte1 = pte1_load(pte1p);
		if (pte1_is_section(pte1)) {
			/*
			 * Only a software-writable section which is still
			 * marked not-modified is fixed up here.
			 */
			if (!(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) {
				pte1_store(pte1p, pte1 & ~PTE1_NM);
				tlb_flush(pte1_trunc(far));
				PMAP_UNLOCK(pmap);
				return (KERN_SUCCESS);
			}
		} else {
			/*
			 * We got L1 permission fault but PTE1 is not section
			 * mapping. Probably some race happened, do nothing.
			 */
			CTR3(KTR_PMAP, "%s: FAULT_PERM_L1 - pmap %#x far %#x",
			    __func__, pmap, far);
			PMAP_UNLOCK(pmap);
			return (KERN_SUCCESS);
		}
	}

	/*
	 * QQQ: The previous code, mainly fast handling of access and
	 *      modify bits aborts, could be moved to ASM. Now we are
	 *      starting to deal with not fast aborts.
	 */
	PMAP_UNLOCK(pmap);
	return (KERN_FAILURE);
}

#if defined(PMAP_DEBUG)
/*
 *  Reusing of KVA used in pmap_zero_page function !!!
6613 */ 6614 static void 6615 pmap_zero_page_check(vm_page_t m) 6616 { 6617 pt2_entry_t *cmap2_pte2p; 6618 uint32_t *p, *end; 6619 struct pcpu *pc; 6620 6621 sched_pin(); 6622 pc = get_pcpu(); 6623 cmap2_pte2p = pc->pc_cmap2_pte2p; 6624 mtx_lock(&pc->pc_cmap_lock); 6625 if (pte2_load(cmap2_pte2p) != 0) 6626 panic("%s: CMAP2 busy", __func__); 6627 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6628 vm_page_pte2_attr(m))); 6629 end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); 6630 for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) 6631 if (*p != 0) 6632 panic("%s: page %p not zero, va: %p", __func__, m, 6633 pc->pc_cmap2_addr); 6634 pte2_clear(cmap2_pte2p); 6635 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6636 sched_unpin(); 6637 mtx_unlock(&pc->pc_cmap_lock); 6638 } 6639 6640 int 6641 pmap_pid_dump(int pid) 6642 { 6643 pmap_t pmap; 6644 struct proc *p; 6645 int npte2 = 0; 6646 int i, j, index; 6647 6648 sx_slock(&allproc_lock); 6649 FOREACH_PROC_IN_SYSTEM(p) { 6650 if (p->p_pid != pid || p->p_vmspace == NULL) 6651 continue; 6652 index = 0; 6653 pmap = vmspace_pmap(p->p_vmspace); 6654 for (i = 0; i < NPTE1_IN_PT1; i++) { 6655 pt1_entry_t pte1; 6656 pt2_entry_t *pte2p, pte2; 6657 vm_offset_t base, va; 6658 vm_paddr_t pa; 6659 vm_page_t m; 6660 6661 base = i << PTE1_SHIFT; 6662 pte1 = pte1_load(&pmap->pm_pt1[i]); 6663 6664 if (pte1_is_section(pte1)) { 6665 /* 6666 * QQQ: Do something here! 
6667 */ 6668 } else if (pte1_is_link(pte1)) { 6669 for (j = 0; j < NPTE2_IN_PT2; j++) { 6670 va = base + (j << PAGE_SHIFT); 6671 if (va >= VM_MIN_KERNEL_ADDRESS) { 6672 if (index) { 6673 index = 0; 6674 printf("\n"); 6675 } 6676 sx_sunlock(&allproc_lock); 6677 return (npte2); 6678 } 6679 pte2p = pmap_pte2(pmap, va); 6680 pte2 = pte2_load(pte2p); 6681 pmap_pte2_release(pte2p); 6682 if (!pte2_is_valid(pte2)) 6683 continue; 6684 6685 pa = pte2_pa(pte2); 6686 m = PHYS_TO_VM_PAGE(pa); 6687 printf("va: 0x%x, pa: 0x%x, w: %d, " 6688 "f: 0x%x", va, pa, 6689 m->ref_count, m->flags); 6690 npte2++; 6691 index++; 6692 if (index >= 2) { 6693 index = 0; 6694 printf("\n"); 6695 } else { 6696 printf(" "); 6697 } 6698 } 6699 } 6700 } 6701 } 6702 sx_sunlock(&allproc_lock); 6703 return (npte2); 6704 } 6705 6706 #endif 6707 6708 #ifdef DDB 6709 static pt2_entry_t * 6710 pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) 6711 { 6712 pt1_entry_t pte1; 6713 vm_paddr_t pt2pg_pa; 6714 6715 pte1 = pte1_load(pmap_pte1(pmap, va)); 6716 if (!pte1_is_link(pte1)) 6717 return (NULL); 6718 6719 if (pmap_is_current(pmap)) 6720 return (pt2map_entry(va)); 6721 6722 /* Note that L2 page table size is not equal to PAGE_SIZE. 
*/ 6723 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 6724 if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { 6725 pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); 6726 #ifdef SMP 6727 PMAP3cpu = PCPU_GET(cpuid); 6728 #endif 6729 tlb_flush_local((vm_offset_t)PADDR3); 6730 } 6731 #ifdef SMP 6732 else if (PMAP3cpu != PCPU_GET(cpuid)) { 6733 PMAP3cpu = PCPU_GET(cpuid); 6734 tlb_flush_local((vm_offset_t)PADDR3); 6735 } 6736 #endif 6737 return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 6738 } 6739 6740 static void 6741 dump_pmap(pmap_t pmap) 6742 { 6743 6744 printf("pmap %p\n", pmap); 6745 printf(" pm_pt1: %p\n", pmap->pm_pt1); 6746 printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); 6747 printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); 6748 } 6749 6750 DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) 6751 { 6752 6753 pmap_t pmap; 6754 LIST_FOREACH(pmap, &allpmaps, pm_list) { 6755 dump_pmap(pmap); 6756 } 6757 } 6758 6759 static int 6760 pte2_class(pt2_entry_t pte2) 6761 { 6762 int cls; 6763 6764 cls = (pte2 >> 2) & 0x03; 6765 cls |= (pte2 >> 4) & 0x04; 6766 return (cls); 6767 } 6768 6769 static void 6770 dump_section(pmap_t pmap, uint32_t pte1_idx) 6771 { 6772 } 6773 6774 static void 6775 dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) 6776 { 6777 uint32_t i; 6778 vm_offset_t va; 6779 pt2_entry_t *pte2p, pte2; 6780 vm_page_t m; 6781 6782 va = pte1_idx << PTE1_SHIFT; 6783 pte2p = pmap_pte2_ddb(pmap, va); 6784 for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { 6785 pte2 = pte2_load(pte2p); 6786 if (pte2 == 0) 6787 continue; 6788 if (!pte2_is_valid(pte2)) { 6789 printf(" 0x%08X: 0x%08X", va, pte2); 6790 if (!invalid_ok) 6791 printf(" - not valid !!!"); 6792 printf("\n"); 6793 continue; 6794 } 6795 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 6796 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, 6797 pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); 6798 if (m != NULL) { 6799 printf(" v:%d w:%d f:0x%04X\n", m->valid, 6800 m->ref_count, m->flags); 
6801 } else { 6802 printf("\n"); 6803 } 6804 } 6805 } 6806 6807 static __inline boolean_t 6808 is_pv_chunk_space(vm_offset_t va) 6809 { 6810 6811 if ((((vm_offset_t)pv_chunkbase) <= va) && 6812 (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) 6813 return (TRUE); 6814 return (FALSE); 6815 } 6816 6817 DB_SHOW_COMMAND(pmap, pmap_pmap_print) 6818 { 6819 /* XXX convert args. */ 6820 pmap_t pmap = (pmap_t)addr; 6821 pt1_entry_t pte1; 6822 pt2_entry_t pte2; 6823 vm_offset_t va, eva; 6824 vm_page_t m; 6825 uint32_t i; 6826 boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; 6827 6828 if (have_addr) { 6829 pmap_t pm; 6830 6831 LIST_FOREACH(pm, &allpmaps, pm_list) 6832 if (pm == pmap) break; 6833 if (pm == NULL) { 6834 printf("given pmap %p is not in allpmaps list\n", pmap); 6835 return; 6836 } 6837 } else 6838 pmap = PCPU_GET(curpmap); 6839 6840 eva = (modif[0] == 'u') ? VM_MAXUSER_ADDRESS : 0xFFFFFFFF; 6841 dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ 6842 6843 printf("pmap: 0x%08X\n", (uint32_t)pmap); 6844 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6845 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6846 6847 for(i = 0; i < NPTE1_IN_PT1; i++) { 6848 pte1 = pte1_load(&pmap->pm_pt1[i]); 6849 if (pte1 == 0) 6850 continue; 6851 va = i << PTE1_SHIFT; 6852 if (va >= eva) 6853 break; 6854 6855 if (pte1_is_section(pte1)) { 6856 printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, 6857 !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); 6858 dump_section(pmap, i); 6859 } else if (pte1_is_link(pte1)) { 6860 dump_link_ok = TRUE; 6861 invalid_ok = FALSE; 6862 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6863 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 6864 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", 6865 va, pte1, pte2, m); 6866 if (is_pv_chunk_space(va)) { 6867 printf(" - pv_chunk space"); 6868 if (dump_pv_chunk) 6869 invalid_ok = TRUE; 6870 else 6871 dump_link_ok = FALSE; 6872 } 6873 else if (m != NULL) 6874 printf(" w:%d w2:%u", m->ref_count, 6875 
pt2_wirecount_get(m, pte1_index(va))); 6876 if (pte2 == 0) 6877 printf(" !!! pt2tab entry is ZERO"); 6878 else if (pte2_pa(pte1) != pte2_pa(pte2)) 6879 printf(" !!! pt2tab entry is DIFFERENT - m: %p", 6880 PHYS_TO_VM_PAGE(pte2_pa(pte2))); 6881 printf("\n"); 6882 if (dump_link_ok) 6883 dump_link(pmap, i, invalid_ok); 6884 } else 6885 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6886 } 6887 } 6888 6889 static void 6890 dump_pt2tab(pmap_t pmap) 6891 { 6892 uint32_t i; 6893 pt2_entry_t pte2; 6894 vm_offset_t va; 6895 vm_paddr_t pa; 6896 vm_page_t m; 6897 6898 printf("PT2TAB:\n"); 6899 for (i = 0; i < PT2TAB_ENTRIES; i++) { 6900 pte2 = pte2_load(&pmap->pm_pt2tab[i]); 6901 if (!pte2_is_valid(pte2)) 6902 continue; 6903 va = i << PT2TAB_SHIFT; 6904 pa = pte2_pa(pte2); 6905 m = PHYS_TO_VM_PAGE(pa); 6906 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, 6907 pte2_class(pte2), !!(pte2 & PTE2_S), m); 6908 if (m != NULL) 6909 printf(" , w: %d, f: 0x%04X pidx: %lld", 6910 m->ref_count, m->flags, m->pindex); 6911 printf("\n"); 6912 } 6913 } 6914 6915 DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) 6916 { 6917 /* XXX convert args. 
*/ 6918 pmap_t pmap = (pmap_t)addr; 6919 pt1_entry_t pte1; 6920 pt2_entry_t pte2; 6921 vm_offset_t va; 6922 uint32_t i, start; 6923 6924 if (have_addr) { 6925 printf("supported only on current pmap\n"); 6926 return; 6927 } 6928 6929 pmap = PCPU_GET(curpmap); 6930 printf("curpmap: 0x%08X\n", (uint32_t)pmap); 6931 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6932 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6933 6934 start = pte1_index((vm_offset_t)PT2MAP); 6935 for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { 6936 pte1 = pte1_load(&pmap->pm_pt1[i]); 6937 if (pte1 == 0) 6938 continue; 6939 va = i << PTE1_SHIFT; 6940 if (pte1_is_section(pte1)) { 6941 printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, 6942 !!(pte1 & PTE1_S)); 6943 dump_section(pmap, i); 6944 } else if (pte1_is_link(pte1)) { 6945 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6946 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, 6947 pte1, pte2); 6948 if (pte2 == 0) 6949 printf(" !!! pt2tab entry is ZERO\n"); 6950 } else 6951 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6952 } 6953 dump_pt2tab(pmap); 6954 } 6955 #endif 6956