1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * Copyright (c) 1994 John S. Dyson 6 * Copyright (c) 1994 David Greenman 7 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 8 * Copyright (c) 2014-2016 Svatopluk Kraus <skra@FreeBSD.org> 9 * Copyright (c) 2014-2016 Michal Meloun <mmel@FreeBSD.org> 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 41 */ 42 /*- 43 * Copyright (c) 2003 Networks Associates Technology, Inc. 44 * All rights reserved. 45 * 46 * This software was developed for the FreeBSD Project by Jake Burkholder, 47 * Safeport Network Services, and Network Associates Laboratories, the 48 * Security Research Division of Network Associates, Inc. under 49 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 50 * CHATS research program. 51 * 52 * Redistribution and use in source and binary forms, with or without 53 * modification, are permitted provided that the following conditions 54 * are met: 55 * 1. Redistributions of source code must retain the above copyright 56 * notice, this list of conditions and the following disclaimer. 57 * 2. Redistributions in binary form must reproduce the above copyright 58 * notice, this list of conditions and the following disclaimer in the 59 * documentation and/or other materials provided with the distribution. 60 * 61 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 64 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 71 * SUCH DAMAGE. 72 */ 73 74 #include <sys/cdefs.h> 75 __FBSDID("$FreeBSD$"); 76 77 /* 78 * Manages physical address maps. 79 * 80 * Since the information managed by this module is 81 * also stored by the logical address mapping module, 82 * this module may throw away valid virtual-to-physical 83 * mappings at almost any time. However, invalidations 84 * of virtual-to-physical mappings must be done as 85 * requested. 86 * 87 * In order to cope with hardware architectures which 88 * make virtual-to-physical map invalidates expensive, 89 * this module may delay invalidate or reduced protection 90 * operations until such time as they are actually 91 * necessary. This module is given full information as 92 * to which processors are currently using which maps, 93 * and to when physical maps must be made correct. 94 */ 95 96 #include "opt_vm.h" 97 #include "opt_pmap.h" 98 #include "opt_ddb.h" 99 100 #include <sys/param.h> 101 #include <sys/systm.h> 102 #include <sys/kernel.h> 103 #include <sys/ktr.h> 104 #include <sys/lock.h> 105 #include <sys/proc.h> 106 #include <sys/rwlock.h> 107 #include <sys/malloc.h> 108 #include <sys/vmmeter.h> 109 #include <sys/malloc.h> 110 #include <sys/mman.h> 111 #include <sys/sf_buf.h> 112 #include <sys/smp.h> 113 #include <sys/sched.h> 114 #include <sys/sysctl.h> 115 116 #ifdef DDB 117 #include <ddb/ddb.h> 118 #endif 119 120 #include <vm/vm.h> 121 #include <vm/uma.h> 122 #include <vm/pmap.h> 123 #include <vm/vm_param.h> 124 #include <vm/vm_kern.h> 125 #include <vm/vm_object.h> 126 #include <vm/vm_map.h> 127 #include <vm/vm_page.h> 128 #include <vm/vm_pageout.h> 129 #include <vm/vm_phys.h> 130 #include <vm/vm_extern.h> 131 #include <vm/vm_reserv.h> 132 #include <sys/lock.h> 133 #include <sys/mutex.h> 134 135 #include <machine/md_var.h> 136 #include <machine/pmap_var.h> 137 #include <machine/cpu.h> 138 #include <machine/pcb.h> 139 #include <machine/sf_buf.h> 140 #ifdef SMP 141 #include <machine/smp.h> 142 #endif 143 #ifndef PMAP_SHPGPERPROC 144 #define PMAP_SHPGPERPROC 200 145 #endif 146 147 #ifndef DIAGNOSTIC 148 #define PMAP_INLINE __inline 149 #else 150 #define PMAP_INLINE 151 #endif 152 153 #ifdef PMAP_DEBUG 154 static void pmap_zero_page_check(vm_page_t m); 155 void pmap_debug(int level); 156 int pmap_pid_dump(int pid); 157 158 #define PDEBUG(_lev_,_stat_) \ 159 if (pmap_debug_level >= (_lev_)) \ 160 ((_stat_)) 161 #define dprintf printf 162 int pmap_debug_level = 1; 163 #else /* PMAP_DEBUG */ 164 #define PDEBUG(_lev_,_stat_) /* Nothing */ 165 #define dprintf(x, arg...) 166 #endif /* PMAP_DEBUG */ 167 168 /* 169 * Level 2 page tables map definion ('max' is excluded). 
170 */ 171 172 #define PT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) 173 #define PT2V_MAX_ADDRESS ((vm_offset_t)PT2MAP + PT2MAP_SIZE) 174 175 #define UPT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) 176 #define UPT2V_MAX_ADDRESS \ 177 ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT))) 178 179 /* 180 * Promotion to a 1MB (PTE1) page mapping requires that the corresponding 181 * 4KB (PTE2) page mappings have identical settings for the following fields: 182 */ 183 #define PTE2_PROMOTE (PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG | \ 184 PTE2_NX | PTE2_RO | PTE2_U | PTE2_W | \ 185 PTE2_ATTR_MASK) 186 187 #define PTE1_PROMOTE (PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG | \ 188 PTE1_NX | PTE1_RO | PTE1_U | PTE1_W | \ 189 PTE1_ATTR_MASK) 190 191 #define ATTR_TO_L1(l2_attr) ((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \ 192 (((l2_attr) & L2_C) ? L1_S_C : 0) | \ 193 (((l2_attr) & L2_B) ? L1_S_B : 0) | \ 194 (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \ 195 (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \ 196 (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \ 197 (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \ 198 (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \ 199 (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \ 200 (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \ 201 (((l2_attr) & PTE2_W) ? PTE1_W : 0)) 202 203 #define ATTR_TO_L2(l1_attr) ((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \ 204 (((l1_attr) & L1_S_C) ? L2_C : 0) | \ 205 (((l1_attr) & L1_S_B) ? L2_B : 0) | \ 206 (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \ 207 (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \ 208 (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \ 209 (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \ 210 (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \ 211 (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \ 212 (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \ 213 (((l1_attr) & PTE1_W) ? PTE2_W : 0)) 214 215 /* 216 * PTE2 descriptors creation macros. 217 */ 218 #define PTE2_ATTR_DEFAULT vm_memattr_to_pte2(VM_MEMATTR_DEFAULT) 219 #define PTE2_ATTR_PT vm_memattr_to_pte2(pt_memattr) 220 221 #define PTE2_KPT(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT) 222 #define PTE2_KPT_NG(pa) PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT) 223 224 #define PTE2_KRW(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT) 225 #define PTE2_KRO(pa) PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT) 226 227 #define PV_STATS 228 #ifdef PV_STATS 229 #define PV_STAT(x) do { x ; } while (0) 230 #else 231 #define PV_STAT(x) do { } while (0) 232 #endif 233 234 /* 235 * The boot_pt1 is used temporary in very early boot stage as L1 page table. 236 * We can init many things with no memory allocation thanks to its static 237 * allocation and this brings two main advantages: 238 * (1) other cores can be started very simply, 239 * (2) various boot loaders can be supported as its arguments can be processed 240 * in virtual address space and can be moved to safe location before 241 * first allocation happened. 242 * Only disadvantage is that boot_pt1 is used only in very early boot stage. 243 * However, the table is uninitialized and so lays in bss. Therefore kernel 244 * image size is not influenced. 245 * 246 * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and 247 * CPU suspend/resume game. 
248 */ 249 extern pt1_entry_t boot_pt1[]; 250 251 vm_paddr_t base_pt1; 252 pt1_entry_t *kern_pt1; 253 pt2_entry_t *kern_pt2tab; 254 pt2_entry_t *PT2MAP; 255 256 static uint32_t ttb_flags; 257 static vm_memattr_t pt_memattr; 258 ttb_entry_t pmap_kern_ttb; 259 260 struct pmap kernel_pmap_store; 261 LIST_HEAD(pmaplist, pmap); 262 static struct pmaplist allpmaps; 263 static struct mtx allpmaps_lock; 264 265 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 266 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 267 268 static vm_offset_t kernel_vm_end_new; 269 vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; 270 vm_offset_t vm_max_kernel_address; 271 vm_paddr_t kernel_l1pa; 272 273 static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; 274 275 /* 276 * Data for the pv entry allocation mechanism 277 */ 278 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 279 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 280 static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ 281 static int shpgperproc = PMAP_SHPGPERPROC; 282 283 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 284 int pv_maxchunks; /* How many chunks we have KVA for */ 285 vm_offset_t pv_vafree; /* freelist stored in the PTE */ 286 287 vm_paddr_t first_managed_pa; 288 #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) 289 290 /* 291 * All those kernel PT submaps that BSD is so fond of 292 */ 293 caddr_t _tmppt = 0; 294 295 /* 296 * Crashdump maps. 297 */ 298 static caddr_t crashdumpmap; 299 300 static pt2_entry_t *PMAP1 = NULL, *PMAP2; 301 static pt2_entry_t *PADDR1 = NULL, *PADDR2; 302 #ifdef DDB 303 static pt2_entry_t *PMAP3; 304 static pt2_entry_t *PADDR3; 305 static int PMAP3cpu __unused; /* for SMP only */ 306 #endif 307 #ifdef SMP 308 static int PMAP1cpu; 309 static int PMAP1changedcpu; 310 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 311 &PMAP1changedcpu, 0, 312 "Number of times pmap_pte2_quick changed CPU with same PMAP1"); 313 #endif 314 static int PMAP1changed; 315 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 316 &PMAP1changed, 0, 317 "Number of times pmap_pte2_quick changed PMAP1"); 318 static int PMAP1unchanged; 319 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 320 &PMAP1unchanged, 0, 321 "Number of times pmap_pte2_quick didn't change PMAP1"); 322 static struct mtx PMAP2mutex; 323 324 /* 325 * Internal flags for pmap_enter()'s helper functions. 326 */ 327 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 328 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 329 330 static __inline void pt2_wirecount_init(vm_page_t m); 331 static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, 332 vm_offset_t va); 333 static int pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, 334 u_int flags, vm_page_t m); 335 void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); 336 337 /* 338 * Function to set the debug level of the pmap code. 339 */ 340 #ifdef PMAP_DEBUG 341 void 342 pmap_debug(int level) 343 { 344 345 pmap_debug_level = level; 346 dprintf("pmap_debug: level=%d\n", pmap_debug_level); 347 } 348 #endif /* PMAP_DEBUG */ 349 350 /* 351 * This table must corespond with memory attribute configuration in vm.h. 352 * First entry is used for normal system mapping. 353 * 354 * Device memory is always marked as shared. 
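 * (An illustrative note on the encoding, following the TEXDEF_* layout
 * defined below: each tex_class[] entry packs the memory type into bits
 * [1:0], the inner cache policy into bits [3:2], the outer cache policy
 * into bits [5:4] and the NOS bit into bit [6].)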
355 * Normal memory is shared only in SMP . 356 * Not outer shareable bits are not used yet. 357 * Class 6 cannot be used on ARM11. 358 */ 359 #define TEXDEF_TYPE_SHIFT 0 360 #define TEXDEF_TYPE_MASK 0x3 361 #define TEXDEF_INNER_SHIFT 2 362 #define TEXDEF_INNER_MASK 0x3 363 #define TEXDEF_OUTER_SHIFT 4 364 #define TEXDEF_OUTER_MASK 0x3 365 #define TEXDEF_NOS_SHIFT 6 366 #define TEXDEF_NOS_MASK 0x1 367 368 #define TEX(t, i, o, s) \ 369 ((t) << TEXDEF_TYPE_SHIFT) | \ 370 ((i) << TEXDEF_INNER_SHIFT) | \ 371 ((o) << TEXDEF_OUTER_SHIFT | \ 372 ((s) << TEXDEF_NOS_SHIFT)) 373 374 static uint32_t tex_class[8] = { 375 /* type inner cache outer cache */ 376 TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ 377 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ 378 TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ 379 TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ 380 TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ 381 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ 382 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ 383 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ 384 }; 385 #undef TEX 386 387 static uint32_t pte2_attr_tab[8] = { 388 PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ 389 PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ 390 PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ 391 PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ 392 PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 393 0, /* 5 - NOT USED YET */ 394 0, /* 6 - NOT USED YET */ 395 0 /* 7 - NOT USED YET */ 396 }; 397 CTASSERT(VM_MEMATTR_WB_WA == 0); 398 CTASSERT(VM_MEMATTR_NOCACHE == 1); 399 CTASSERT(VM_MEMATTR_DEVICE == 2); 400 CTASSERT(VM_MEMATTR_SO == 3); 401 CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); 402 #define VM_MEMATTR_END (VM_MEMATTR_WRITE_THROUGH + 1) 403 404 boolean_t 405 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 406 { 407 408 return (mode >= 0 && mode < VM_MEMATTR_END); 409 } 410 411 static inline uint32_t 412 vm_memattr_to_pte2(vm_memattr_t ma) 413 { 414 415 KASSERT((u_int)ma < VM_MEMATTR_END, 416 ("%s: bad vm_memattr_t %d", __func__, ma)); 417 return (pte2_attr_tab[(u_int)ma]); 418 } 419 420 static inline uint32_t 421 vm_page_pte2_attr(vm_page_t m) 422 { 423 424 return (vm_memattr_to_pte2(m->md.pat_mode)); 425 } 426 427 /* 428 * Convert TEX definition entry to TTB flags. 429 */ 430 static uint32_t 431 encode_ttb_flags(int idx) 432 { 433 uint32_t inner, outer, nos, reg; 434 435 inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & 436 TEXDEF_INNER_MASK; 437 outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & 438 TEXDEF_OUTER_MASK; 439 nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & 440 TEXDEF_NOS_MASK; 441 442 reg = nos << 5; 443 reg |= outer << 3; 444 if (cpuinfo.coherent_walk) 445 reg |= (inner & 0x1) << 6; 446 reg |= (inner & 0x2) >> 1; 447 #ifdef SMP 448 ARM_SMP_UP( 449 reg |= 1 << 1, 450 ); 451 #endif 452 return reg; 453 } 454 455 /* 456 * Set TEX remapping registers in current CPU. 457 */ 458 void 459 pmap_set_tex(void) 460 { 461 uint32_t prrr, nmrr; 462 uint32_t type, inner, outer, nos; 463 int i; 464 465 #ifdef PMAP_PTE_NOCACHE 466 /* XXX fixme */ 467 if (cpuinfo.coherent_walk) { 468 pt_memattr = VM_MEMATTR_WB_WA; 469 ttb_flags = encode_ttb_flags(0); 470 } 471 else { 472 pt_memattr = VM_MEMATTR_NOCACHE; 473 ttb_flags = encode_ttb_flags(1); 474 } 475 #else 476 pt_memattr = VM_MEMATTR_WB_WA; 477 ttb_flags = encode_ttb_flags(0); 478 #endif 479 480 prrr = 0; 481 nmrr = 0; 482 483 /* Build remapping register from TEX classes. 
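 * For class i, the loop below packs the memory type into PRRR[2i+1:2i],
 * the NOS bit into PRRR[24+i], the inner cache policy into NMRR[2i+1:2i]
 * and the outer cache policy into NMRR[2i+17:2i+16].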
*/ 484 for (i = 0; i < 8; i++) { 485 type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & 486 TEXDEF_TYPE_MASK; 487 inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & 488 TEXDEF_INNER_MASK; 489 outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & 490 TEXDEF_OUTER_MASK; 491 nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & 492 TEXDEF_NOS_MASK; 493 494 prrr |= type << (i * 2); 495 prrr |= nos << (i + 24); 496 nmrr |= inner << (i * 2); 497 nmrr |= outer << (i * 2 + 16); 498 } 499 /* Add shareable bits for device memory. */ 500 prrr |= PRRR_DS0 | PRRR_DS1; 501 502 /* Add shareable bits for normal memory in SMP case. */ 503 #ifdef SMP 504 ARM_SMP_UP( 505 prrr |= PRRR_NS1, 506 ); 507 #endif 508 cp15_prrr_set(prrr); 509 cp15_nmrr_set(nmrr); 510 511 /* Caches are disabled, so full TLB flush should be enough. */ 512 tlb_flush_all_local(); 513 } 514 515 /* 516 * Remap one vm_meattr class to another one. This can be useful as 517 * workaround for SOC errata, e.g. if devices must be accessed using 518 * SO memory class. 519 * 520 * !!! Please note that this function is absolutely last resort thing. 521 * It should not be used under normal circumstances. !!! 522 * 523 * Usage rules: 524 * - it shall be called after pmap_bootstrap_prepare() and before 525 * cpu_mp_start() (thus only on boot CPU). In practice, it's expected 526 * to be called from platform_attach() or platform_late_init(). 527 * 528 * - if remapping doesn't change caching mode, or until uncached class 529 * is remapped to any kind of cached one, then no other restriction exists. 530 * 531 * - if pmap_remap_vm_attr() changes caching mode, but both (original and 532 * remapped) remain cached, then caller is resposible for calling 533 * of dcache_wbinv_poc_all(). 534 * 535 * - remapping of any kind of cached class to uncached is not permitted. 536 */ 537 void 538 pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr) 539 { 540 int old_idx, new_idx; 541 542 /* Map VM memattrs to indexes to tex_class table. */ 543 old_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)old_attr]); 544 new_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)new_attr]); 545 546 /* Replace TEX attribute and apply it. */ 547 tex_class[old_idx] = tex_class[new_idx]; 548 pmap_set_tex(); 549 } 550 551 /* 552 * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words, 553 * KERNBASE is mapped by first L2 page table in L2 page table page. It 554 * meets same constrain due to PT2MAP being placed just under KERNBASE. 555 */ 556 CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0); 557 CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE); 558 559 /* 560 * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general. 561 * For now, anyhow, the following check must be fulfilled. 562 */ 563 CTASSERT(PAGE_SIZE == PTE2_SIZE); 564 /* 565 * We don't want to mess up MI code with all MMU and PMAP definitions, 566 * so some things, which depend on other ones, are defined independently. 567 * Now, it is time to check that we don't screw up something. 568 */ 569 CTASSERT(PDRSHIFT == PTE1_SHIFT); 570 /* 571 * Check L1 and L2 page table entries definitions consistency. 572 */ 573 CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1)); 574 CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2)); 575 /* 576 * Check L2 page tables page consistency. 577 */ 578 CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2)); 579 CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG); 580 /* 581 * Check PT2TAB consistency. 582 * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG. 
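 * (As a worked example, with the usual ARMv6/v7 short-descriptor sizes
 * this is 4096 / 4 = 1024 entries.)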
 * This should be done without remainder.
 */
CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG));

/*
 * A PT2MAP magic.
 *
 * All level 2 page tables (PT2s) are mapped continuously and accordingly
 * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can
 * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page
 * must be used together, but not necessarily all at once. The first PT2 in
 * a page must map things on a correctly aligned address and the others must
 * follow in the right order.
 */
#define NB_IN_PT2TAB	(PT2TAB_ENTRIES * sizeof(pt2_entry_t))
#define NPT2_IN_PT2TAB	(NB_IN_PT2TAB / NB_IN_PT2)
#define NPG_IN_PT2TAB	(NB_IN_PT2TAB / PAGE_SIZE)

/*
 * Check PT2TAB consistency.
 * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2.
 * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE.
 * Both divisions should be without remainder.
 */
CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2));
CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE));
/*
 * The implementation was made general, however, with the assumption
 * below in mind. In case of another value of NPG_IN_PT2TAB,
 * the code should be rechecked once more.
 */
CTASSERT(NPG_IN_PT2TAB == 1);

/*
 * Get offset of PT2 in a page
 * associated with given PT1 index.
 */
static __inline u_int
page_pt2off(u_int pt1_idx)
{

	return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2);
}

/*
 * Get physical address of PT2
 * associated with given PT2s page and PT1 index.
 */
static __inline vm_paddr_t
page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx)
{

	return (pgpa + page_pt2off(pt1_idx));
}

/*
 * Get first entry of PT2
 * associated with given PT2s page and PT1 index.
 */
static __inline pt2_entry_t *
page_pt2(vm_offset_t pgva, u_int pt1_idx)
{

	return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx)));
}

/*
 * Get virtual address of PT2s page (mapped in PT2MAP)
 * which holds PT2 which holds entry which maps given virtual address.
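 * (Sketch of the computation: one PT2s page covers NPT2_IN_PG PT2s, i.e.
 * NPT2_IN_PG * PTE1_SIZE of virtual space, so the body below just rounds
 * va down to that boundary and looks the result up in PT2MAP.)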
652 */ 653 static __inline vm_offset_t 654 pt2map_pt2pg(vm_offset_t va) 655 { 656 657 va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); 658 return ((vm_offset_t)pt2map_entry(va)); 659 } 660 661 /***************************************************************************** 662 * 663 * THREE pmap initialization milestones exist: 664 * 665 * locore.S 666 * -> fundamental init (including MMU) in ASM 667 * 668 * initarm() 669 * -> fundamental init continues in C 670 * -> first available physical address is known 671 * 672 * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) 673 * -> basic (safe) interface for physical address allocation is made 674 * -> basic (safe) interface for virtual mapping is made 675 * -> limited not SMP coherent work is possible 676 * 677 * -> more fundamental init continues in C 678 * -> locks and some more things are available 679 * -> all fundamental allocations and mappings are done 680 * 681 * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) 682 * -> phys_avail[] and virtual_avail is set 683 * -> control is passed to vm subsystem 684 * -> physical and virtual address allocation are off limit 685 * -> low level mapping functions, some SMP coherent, 686 * are available, which cannot be used before vm subsystem 687 * is being inited 688 * 689 * mi_startup() 690 * -> vm subsystem is being inited 691 * 692 * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) 693 * -> pmap is fully inited 694 * 695 *****************************************************************************/ 696 697 /***************************************************************************** 698 * 699 * PMAP first stage initialization and utility functions 700 * for pre-bootstrap epoch. 701 * 702 * After pmap_bootstrap_prepare() is called, the following functions 703 * can be used: 704 * 705 * (1) strictly only for this stage functions for physical page allocations, 706 * virtual space allocations, and mappings: 707 * 708 * vm_paddr_t pmap_preboot_get_pages(u_int num); 709 * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); 710 * vm_offset_t pmap_preboot_reserve_pages(u_int num); 711 * vm_offset_t pmap_preboot_get_vpages(u_int num); 712 * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 713 * vm_prot_t prot, vm_memattr_t attr); 714 * 715 * (2) for all stages: 716 * 717 * vm_paddr_t pmap_kextract(vm_offset_t va); 718 * 719 * NOTE: This is not SMP coherent stage. 720 * 721 *****************************************************************************/ 722 723 #define KERNEL_P2V(pa) \ 724 ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) 725 #define KERNEL_V2P(va) \ 726 ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) 727 728 static vm_paddr_t last_paddr; 729 730 /* 731 * Pre-bootstrap epoch page allocator. 732 */ 733 vm_paddr_t 734 pmap_preboot_get_pages(u_int num) 735 { 736 vm_paddr_t ret; 737 738 ret = last_paddr; 739 last_paddr += num * PAGE_SIZE; 740 741 return (ret); 742 } 743 744 /* 745 * The fundamental initialization of PMAP stuff. 746 * 747 * Some things already happened in locore.S and some things could happen 748 * before pmap_bootstrap_prepare() is called, so let's recall what is done: 749 * 1. Caches are disabled. 750 * 2. We are running on virtual addresses already with 'boot_pt1' 751 * as L1 page table. 752 * 3. So far, all virtual addresses can be converted to physical ones and 753 * vice versa by the following macros: 754 * KERNEL_P2V(pa) .... physical to virtual ones, 755 * KERNEL_V2P(va) .... 
virtual to physical ones. 756 * 757 * What is done herein: 758 * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. 759 * 2. PT2MAP magic is brought to live. 760 * 3. Basic preboot functions for page allocations and mappings can be used. 761 * 4. Everything is prepared for L1 cache enabling. 762 * 763 * Variations: 764 * 1. To use second TTB register, so kernel and users page tables will be 765 * separated. This way process forking - pmap_pinit() - could be faster, 766 * it saves physical pages and KVA per a process, and it's simple change. 767 * However, it will lead, due to hardware matter, to the following: 768 * (a) 2G space for kernel and 2G space for users. 769 * (b) 1G space for kernel in low addresses and 3G for users above it. 770 * A question is: Is the case (b) really an option? Note that case (b) 771 * does save neither physical memory and KVA. 772 */ 773 void 774 pmap_bootstrap_prepare(vm_paddr_t last) 775 { 776 vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; 777 vm_offset_t pt2pg_va; 778 pt1_entry_t *pte1p; 779 pt2_entry_t *pte2p; 780 u_int i; 781 uint32_t l1_attr; 782 783 /* 784 * Now, we are going to make real kernel mapping. Note that we are 785 * already running on some mapping made in locore.S and we expect 786 * that it's large enough to ensure nofault access to physical memory 787 * allocated herein before switch. 788 * 789 * As kernel image and everything needed before are and will be mapped 790 * by section mappings, we align last physical address to PTE1_SIZE. 791 */ 792 last_paddr = pte1_roundup(last); 793 794 /* 795 * Allocate and zero page(s) for kernel L1 page table. 796 * 797 * Note that it's first allocation on space which was PTE1_SIZE 798 * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. 799 */ 800 base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); 801 kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); 802 bzero((void*)kern_pt1, NB_IN_PT1); 803 pte1_sync_range(kern_pt1, NB_IN_PT1); 804 805 /* Allocate and zero page(s) for kernel PT2TAB. */ 806 pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); 807 kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); 808 bzero(kern_pt2tab, NB_IN_PT2TAB); 809 pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); 810 811 /* Allocate and zero page(s) for kernel L2 page tables. */ 812 pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); 813 pt2pg_va = KERNEL_P2V(pt2pg_pa); 814 size = NKPT2PG * PAGE_SIZE; 815 bzero((void*)pt2pg_va, size); 816 pte2_sync_range((pt2_entry_t *)pt2pg_va, size); 817 818 /* 819 * Add a physical memory segment (vm_phys_seg) corresponding to the 820 * preallocated pages for kernel L2 page tables so that vm_page 821 * structures representing these pages will be created. The vm_page 822 * structures are required for promotion of the corresponding kernel 823 * virtual addresses to section mappings. 824 */ 825 vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); 826 827 /* 828 * Insert allocated L2 page table pages to PT2TAB and make 829 * link to all PT2s in L1 page table. See how kernel_vm_end 830 * is initialized. 831 * 832 * We play simple and safe. So every KVA will have underlaying 833 * L2 page table, even kernel image mapped by sections. 834 */ 835 pte2p = kern_pt2tab_entry(KERNBASE); 836 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) 837 pt2tab_store(pte2p++, PTE2_KPT(pa)); 838 839 pte1p = kern_pte1(KERNBASE); 840 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) 841 pte1_store(pte1p++, PTE1_LINK(pa)); 842 843 /* Make section mappings for kernel. 
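 * Each PTE1 entry written below maps a full 1 MB section, so the loop
 * advances pa (and the implied va) by PTE1_SIZE per iteration.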
*/ 844 l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); 845 pte1p = kern_pte1(KERNBASE); 846 for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) 847 pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); 848 849 /* 850 * Get free and aligned space for PT2MAP and make L1 page table links 851 * to L2 page tables held in PT2TAB. 852 * 853 * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t 854 * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus 855 * each entry in PT2TAB maps all PT2s in a page. This implies that 856 * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. 857 */ 858 PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); 859 pte1p = kern_pte1((vm_offset_t)PT2MAP); 860 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 861 pte1_store(pte1p++, PTE1_LINK(pa)); 862 } 863 864 /* 865 * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. 866 * Each pmap will hold own PT2TAB, so the mapping should be not global. 867 */ 868 pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); 869 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 870 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 871 } 872 873 /* 874 * Choose correct L2 page table and make mappings for allocations 875 * made herein which replaces temporary locore.S mappings after a while. 876 * Note that PT2MAP cannot be used until we switch to kern_pt1. 877 * 878 * Note, that these allocations started aligned on 1M section and 879 * kernel PT1 was allocated first. Making of mappings must follow 880 * order of physical allocations as we've used KERNEL_P2V() macro 881 * for virtual addresses resolution. 882 */ 883 pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); 884 pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); 885 886 pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); 887 888 /* Make mapping for kernel L1 page table. */ 889 for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) 890 pte2_store(pte2p++, PTE2_KPT(pa)); 891 892 /* Make mapping for kernel PT2TAB. */ 893 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) 894 pte2_store(pte2p++, PTE2_KPT(pa)); 895 896 /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ 897 pmap_kern_ttb = base_pt1 | ttb_flags; 898 cpuinfo_reinit_mmu(pmap_kern_ttb); 899 /* 900 * Initialize the first available KVA. As kernel image is mapped by 901 * sections, we are leaving some gap behind. 902 */ 903 virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; 904 } 905 906 /* 907 * Setup L2 page table page for given KVA. 908 * Used in pre-bootstrap epoch. 909 * 910 * Note that we have allocated NKPT2PG pages for L2 page tables in advance 911 * and used them for mapping KVA starting from KERNBASE. However, this is not 912 * enough. Vectors and devices need L2 page tables too. Note that they are 913 * even above VM_MAX_KERNEL_ADDRESS. 914 */ 915 static __inline vm_paddr_t 916 pmap_preboot_pt2pg_setup(vm_offset_t va) 917 { 918 pt2_entry_t *pte2p, pte2; 919 vm_paddr_t pt2pg_pa; 920 921 /* Get associated entry in PT2TAB. */ 922 pte2p = kern_pt2tab_entry(va); 923 924 /* Just return, if PT2s page exists already. */ 925 pte2 = pt2tab_load(pte2p); 926 if (pte2_is_valid(pte2)) 927 return (pte2_pa(pte2)); 928 929 KASSERT(va >= VM_MAX_KERNEL_ADDRESS, 930 ("%s: NKPT2PG too small", __func__)); 931 932 /* 933 * Allocate page for PT2s and insert it to PT2TAB. 934 * In other words, map it into PT2MAP space. 
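 * The PT2TAB entry is stored first, and only then is the new page zeroed
 * through its PT2MAP window below.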
935 */ 936 pt2pg_pa = pmap_preboot_get_pages(1); 937 pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); 938 939 /* Zero all PT2s in allocated page. */ 940 bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); 941 pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); 942 943 return (pt2pg_pa); 944 } 945 946 /* 947 * Setup L2 page table for given KVA. 948 * Used in pre-bootstrap epoch. 949 */ 950 static void 951 pmap_preboot_pt2_setup(vm_offset_t va) 952 { 953 pt1_entry_t *pte1p; 954 vm_paddr_t pt2pg_pa, pt2_pa; 955 956 /* Setup PT2's page. */ 957 pt2pg_pa = pmap_preboot_pt2pg_setup(va); 958 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); 959 960 /* Insert PT2 to PT1. */ 961 pte1p = kern_pte1(va); 962 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 963 } 964 965 /* 966 * Get L2 page entry associated with given KVA. 967 * Used in pre-bootstrap epoch. 968 */ 969 static __inline pt2_entry_t* 970 pmap_preboot_vtopte2(vm_offset_t va) 971 { 972 pt1_entry_t *pte1p; 973 974 /* Setup PT2 if needed. */ 975 pte1p = kern_pte1(va); 976 if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ 977 pmap_preboot_pt2_setup(va); 978 979 return (pt2map_entry(va)); 980 } 981 982 /* 983 * Pre-bootstrap epoch page(s) mapping(s). 984 */ 985 void 986 pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) 987 { 988 u_int i; 989 pt2_entry_t *pte2p; 990 991 /* Map all the pages. */ 992 for (i = 0; i < num; i++) { 993 pte2p = pmap_preboot_vtopte2(va); 994 pte2_store(pte2p, PTE2_KRW(pa)); 995 va += PAGE_SIZE; 996 pa += PAGE_SIZE; 997 } 998 } 999 1000 /* 1001 * Pre-bootstrap epoch virtual space alocator. 1002 */ 1003 vm_offset_t 1004 pmap_preboot_reserve_pages(u_int num) 1005 { 1006 u_int i; 1007 vm_offset_t start, va; 1008 pt2_entry_t *pte2p; 1009 1010 /* Allocate virtual space. */ 1011 start = va = virtual_avail; 1012 virtual_avail += num * PAGE_SIZE; 1013 1014 /* Zero the mapping. */ 1015 for (i = 0; i < num; i++) { 1016 pte2p = pmap_preboot_vtopte2(va); 1017 pte2_store(pte2p, 0); 1018 va += PAGE_SIZE; 1019 } 1020 1021 return (start); 1022 } 1023 1024 /* 1025 * Pre-bootstrap epoch page(s) allocation and mapping(s). 1026 */ 1027 vm_offset_t 1028 pmap_preboot_get_vpages(u_int num) 1029 { 1030 vm_paddr_t pa; 1031 vm_offset_t va; 1032 1033 /* Allocate physical page(s). */ 1034 pa = pmap_preboot_get_pages(num); 1035 1036 /* Allocate virtual space. */ 1037 va = virtual_avail; 1038 virtual_avail += num * PAGE_SIZE; 1039 1040 /* Map and zero all. */ 1041 pmap_preboot_map_pages(pa, va, num); 1042 bzero((void *)va, num * PAGE_SIZE); 1043 1044 return (va); 1045 } 1046 1047 /* 1048 * Pre-bootstrap epoch page mapping(s) with attributes. 1049 */ 1050 void 1051 pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 1052 vm_prot_t prot, vm_memattr_t attr) 1053 { 1054 u_int num; 1055 u_int l1_attr, l1_prot, l2_prot, l2_attr; 1056 pt1_entry_t *pte1p; 1057 pt2_entry_t *pte2p; 1058 1059 l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; 1060 l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1061 l2_attr = vm_memattr_to_pte2(attr); 1062 l1_prot = ATTR_TO_L1(l2_prot); 1063 l1_attr = ATTR_TO_L1(l2_attr); 1064 1065 /* Map all the pages. 
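 * Prefer 1 MB PTE1 section mappings whenever both va and pa are
 * PTE1-aligned and at least PTE1_SIZE remains; otherwise fall back to
 * 4 KB PTE2 mappings.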
*/ 1066 num = round_page(size); 1067 while (num > 0) { 1068 if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { 1069 pte1p = kern_pte1(va); 1070 pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); 1071 va += PTE1_SIZE; 1072 pa += PTE1_SIZE; 1073 num -= PTE1_SIZE; 1074 } else { 1075 pte2p = pmap_preboot_vtopte2(va); 1076 pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); 1077 va += PAGE_SIZE; 1078 pa += PAGE_SIZE; 1079 num -= PAGE_SIZE; 1080 } 1081 } 1082 } 1083 1084 /* 1085 * Extract from the kernel page table the physical address 1086 * that is mapped by the given virtual address "va". 1087 */ 1088 vm_paddr_t 1089 pmap_kextract(vm_offset_t va) 1090 { 1091 vm_paddr_t pa; 1092 pt1_entry_t pte1; 1093 pt2_entry_t pte2; 1094 1095 pte1 = pte1_load(kern_pte1(va)); 1096 if (pte1_is_section(pte1)) { 1097 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1098 } else if (pte1_is_link(pte1)) { 1099 /* 1100 * We should beware of concurrent promotion that changes 1101 * pte1 at this point. However, it's not a problem as PT2 1102 * page is preserved by promotion in PT2TAB. So even if 1103 * it happens, using of PT2MAP is still safe. 1104 * 1105 * QQQ: However, concurrent removing is a problem which 1106 * ends in abort on PT2MAP space. Locking must be used 1107 * to deal with this. 1108 */ 1109 pte2 = pte2_load(pt2map_entry(va)); 1110 pa = pte2_pa(pte2) | (va & PTE2_OFFSET); 1111 } 1112 else { 1113 panic("%s: va %#x pte1 %#x", __func__, va, pte1); 1114 } 1115 return (pa); 1116 } 1117 1118 /* 1119 * Extract from the kernel page table the physical address 1120 * that is mapped by the given virtual address "va". Also 1121 * return L2 page table entry which maps the address. 1122 * 1123 * This is only intended to be used for panic dumps. 1124 */ 1125 vm_paddr_t 1126 pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) 1127 { 1128 vm_paddr_t pa; 1129 pt1_entry_t pte1; 1130 pt2_entry_t pte2; 1131 1132 pte1 = pte1_load(kern_pte1(va)); 1133 if (pte1_is_section(pte1)) { 1134 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1135 pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; 1136 } else if (pte1_is_link(pte1)) { 1137 pte2 = pte2_load(pt2map_entry(va)); 1138 pa = pte2_pa(pte2); 1139 } else { 1140 pte2 = 0; 1141 pa = 0; 1142 } 1143 if (pte2p != NULL) 1144 *pte2p = pte2; 1145 return (pa); 1146 } 1147 1148 /***************************************************************************** 1149 * 1150 * PMAP second stage initialization and utility functions 1151 * for bootstrap epoch. 1152 * 1153 * After pmap_bootstrap() is called, the following functions for 1154 * mappings can be used: 1155 * 1156 * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); 1157 * void pmap_kremove(vm_offset_t va); 1158 * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, 1159 * int prot); 1160 * 1161 * NOTE: This is not SMP coherent stage. And physical page allocation is not 1162 * allowed during this stage. 1163 * 1164 *****************************************************************************/ 1165 1166 /* 1167 * Initialize kernel PMAP locks and lists, kernel_pmap itself, and 1168 * reserve various virtual spaces for temporary mappings. 1169 */ 1170 void 1171 pmap_bootstrap(vm_offset_t firstaddr) 1172 { 1173 pt2_entry_t *unused __unused; 1174 struct pcpu *pc; 1175 1176 /* 1177 * Initialize the kernel pmap (which is statically allocated). 
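 * Its L1 page table and PT2TAB were already built by
 * pmap_bootstrap_prepare(); only the pointers and bookkeeping are set
 * up here.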
1178 */ 1179 PMAP_LOCK_INIT(kernel_pmap); 1180 kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ 1181 kernel_pmap->pm_pt1 = kern_pt1; 1182 kernel_pmap->pm_pt2tab = kern_pt2tab; 1183 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1184 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1185 1186 /* 1187 * Initialize the global pv list lock. 1188 */ 1189 rw_init(&pvh_global_lock, "pmap pv global"); 1190 1191 LIST_INIT(&allpmaps); 1192 1193 /* 1194 * Request a spin mutex so that changes to allpmaps cannot be 1195 * preempted by smp_rendezvous_cpus(). 1196 */ 1197 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 1198 mtx_lock_spin(&allpmaps_lock); 1199 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 1200 mtx_unlock_spin(&allpmaps_lock); 1201 1202 /* 1203 * Reserve some special page table entries/VA space for temporary 1204 * mapping of pages. 1205 */ 1206 #define SYSMAP(c, p, v, n) do { \ 1207 v = (c)pmap_preboot_reserve_pages(n); \ 1208 p = pt2map_entry((vm_offset_t)v); \ 1209 } while (0) 1210 1211 /* 1212 * Local CMAP1/CMAP2 are used for zeroing and copying pages. 1213 * Local CMAP2 is also used for data cache cleaning. 1214 */ 1215 pc = get_pcpu(); 1216 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1217 SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); 1218 SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); 1219 SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); 1220 1221 /* 1222 * Crashdump maps. 1223 */ 1224 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); 1225 1226 /* 1227 * _tmppt is used for reading arbitrary physical pages via /dev/mem. 1228 */ 1229 SYSMAP(caddr_t, unused, _tmppt, 1); 1230 1231 /* 1232 * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), 1233 * respectively. PADDR3 is used by pmap_pte2_ddb(). 1234 */ 1235 SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); 1236 SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); 1237 #ifdef DDB 1238 SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); 1239 #endif 1240 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 1241 1242 /* 1243 * Note that in very short time in initarm(), we are going to 1244 * initialize phys_avail[] array and no further page allocation 1245 * can happen after that until vm subsystem will be initialized. 1246 */ 1247 kernel_vm_end_new = kernel_vm_end; 1248 virtual_end = vm_max_kernel_address; 1249 } 1250 1251 static void 1252 pmap_init_reserved_pages(void) 1253 { 1254 struct pcpu *pc; 1255 vm_offset_t pages; 1256 int i; 1257 1258 CPU_FOREACH(i) { 1259 pc = pcpu_find(i); 1260 /* 1261 * Skip if the mapping has already been initialized, 1262 * i.e. this is the BSP. 1263 */ 1264 if (pc->pc_cmap1_addr != 0) 1265 continue; 1266 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1267 pages = kva_alloc(PAGE_SIZE * 3); 1268 if (pages == 0) 1269 panic("%s: unable to allocate KVA", __func__); 1270 pc->pc_cmap1_pte2p = pt2map_entry(pages); 1271 pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE); 1272 pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2)); 1273 pc->pc_cmap1_addr = (caddr_t)pages; 1274 pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE); 1275 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); 1276 } 1277 } 1278 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 1279 1280 /* 1281 * The function can already be use in second initialization stage. 1282 * As such, the function DOES NOT call pmap_growkernel() where PT2 1283 * allocation can happen. So if used, be sure that PT2 for given 1284 * virtual address is allocated already! 
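 *
 * A minimal usage sketch (compare pmap_kenter_temporary() below): as long
 * as the PT2 behind 'va' already exists, e.g. because 'va' came from
 * pmap_preboot_reserve_pages() or lies below kernel_vm_end, a temporary
 * kernel mapping can be made and flushed on the local CPU with:
 *
 *	pmap_kenter(va, pa);
 *	tlb_flush_local(va);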
1285 * 1286 * Add a wired page to the kva. 1287 * Note: not SMP coherent. 1288 */ 1289 static __inline void 1290 pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, 1291 uint32_t attr) 1292 { 1293 pt1_entry_t *pte1p; 1294 pt2_entry_t *pte2p; 1295 1296 pte1p = kern_pte1(va); 1297 if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ 1298 /* 1299 * This is a very low level function, so PT2 and particularly 1300 * PT2PG associated with given virtual address must be already 1301 * allocated. It's a pain mainly during pmap initialization 1302 * stage. However, called after pmap initialization with 1303 * virtual address not under kernel_vm_end will lead to 1304 * the same misery. 1305 */ 1306 if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) 1307 panic("%s: kernel PT2 not allocated!", __func__); 1308 } 1309 1310 pte2p = pt2map_entry(va); 1311 pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); 1312 } 1313 1314 PMAP_INLINE void 1315 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1316 { 1317 1318 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); 1319 } 1320 1321 /* 1322 * Remove a page from the kernel pagetables. 1323 * Note: not SMP coherent. 1324 */ 1325 PMAP_INLINE void 1326 pmap_kremove(vm_offset_t va) 1327 { 1328 pt1_entry_t *pte1p; 1329 pt2_entry_t *pte2p; 1330 1331 pte1p = kern_pte1(va); 1332 if (pte1_is_section(pte1_load(pte1p))) { 1333 pte1_clear(pte1p); 1334 } else { 1335 pte2p = pt2map_entry(va); 1336 pte2_clear(pte2p); 1337 } 1338 } 1339 1340 /* 1341 * Share new kernel PT2PG with all pmaps. 1342 * The caller is responsible for maintaining TLB consistency. 1343 */ 1344 static void 1345 pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) 1346 { 1347 pmap_t pmap; 1348 pt2_entry_t *pte2p; 1349 1350 mtx_lock_spin(&allpmaps_lock); 1351 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1352 pte2p = pmap_pt2tab_entry(pmap, va); 1353 pt2tab_store(pte2p, npte2); 1354 } 1355 mtx_unlock_spin(&allpmaps_lock); 1356 } 1357 1358 /* 1359 * Share new kernel PTE1 with all pmaps. 1360 * The caller is responsible for maintaining TLB consistency. 1361 */ 1362 static void 1363 pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) 1364 { 1365 pmap_t pmap; 1366 pt1_entry_t *pte1p; 1367 1368 mtx_lock_spin(&allpmaps_lock); 1369 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1370 pte1p = pmap_pte1(pmap, va); 1371 pte1_store(pte1p, npte1); 1372 } 1373 mtx_unlock_spin(&allpmaps_lock); 1374 } 1375 1376 /* 1377 * Used to map a range of physical addresses into kernel 1378 * virtual address space. 1379 * 1380 * The value passed in '*virt' is a suggested virtual address for 1381 * the mapping. Architectures which can support a direct-mapped 1382 * physical to virtual region can return the appropriate address 1383 * within that region, leaving '*virt' unchanged. Other 1384 * architectures should map the pages starting at '*virt' and 1385 * update '*virt' with the first usable address after the mapped 1386 * region. 1387 * 1388 * NOTE: Read the comments above pmap_kenter_prot_attr() as 1389 * the function is used herein! 1390 */ 1391 vm_offset_t 1392 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1393 { 1394 vm_offset_t va, sva; 1395 vm_paddr_t pte1_offset; 1396 pt1_entry_t npte1; 1397 uint32_t l1prot, l2prot; 1398 uint32_t l1attr, l2attr; 1399 1400 PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," 1401 " prot = %d\n", __func__, *virt, start, end, end - start, prot)); 1402 1403 l2prot = (prot & VM_PROT_WRITE) ? 
PTE2_AP_KRW : PTE2_AP_KR; 1404 l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1405 l1prot = ATTR_TO_L1(l2prot); 1406 1407 l2attr = PTE2_ATTR_DEFAULT; 1408 l1attr = ATTR_TO_L1(l2attr); 1409 1410 va = *virt; 1411 /* 1412 * Does the physical address range's size and alignment permit at 1413 * least one section mapping to be created? 1414 */ 1415 pte1_offset = start & PTE1_OFFSET; 1416 if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= 1417 PTE1_SIZE) { 1418 /* 1419 * Increase the starting virtual address so that its alignment 1420 * does not preclude the use of section mappings. 1421 */ 1422 if ((va & PTE1_OFFSET) < pte1_offset) 1423 va = pte1_trunc(va) + pte1_offset; 1424 else if ((va & PTE1_OFFSET) > pte1_offset) 1425 va = pte1_roundup(va) + pte1_offset; 1426 } 1427 sva = va; 1428 while (start < end) { 1429 if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { 1430 KASSERT((va & PTE1_OFFSET) == 0, 1431 ("%s: misaligned va %#x", __func__, va)); 1432 npte1 = PTE1_KERN(start, l1prot, l1attr); 1433 pmap_kenter_pte1(va, npte1); 1434 va += PTE1_SIZE; 1435 start += PTE1_SIZE; 1436 } else { 1437 pmap_kenter_prot_attr(va, start, l2prot, l2attr); 1438 va += PAGE_SIZE; 1439 start += PAGE_SIZE; 1440 } 1441 } 1442 tlb_flush_range(sva, va - sva); 1443 *virt = va; 1444 return (sva); 1445 } 1446 1447 /* 1448 * Make a temporary mapping for a physical address. 1449 * This is only intended to be used for panic dumps. 1450 */ 1451 void * 1452 pmap_kenter_temporary(vm_paddr_t pa, int i) 1453 { 1454 vm_offset_t va; 1455 1456 /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ 1457 1458 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 1459 pmap_kenter(va, pa); 1460 tlb_flush_local(va); 1461 return ((void *)crashdumpmap); 1462 } 1463 1464 /************************************* 1465 * 1466 * TLB & cache maintenance routines. 1467 * 1468 *************************************/ 1469 1470 /* 1471 * We inline these within pmap.c for speed. 1472 */ 1473 PMAP_INLINE void 1474 pmap_tlb_flush(pmap_t pmap, vm_offset_t va) 1475 { 1476 1477 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1478 tlb_flush(va); 1479 } 1480 1481 PMAP_INLINE void 1482 pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) 1483 { 1484 1485 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1486 tlb_flush_range(sva, size); 1487 } 1488 1489 /* 1490 * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. 1491 * Requirements: 1492 * - Must deal with pages in order to ensure that none of the PTE2_* bits 1493 * are ever set, PTE2_V in particular. 1494 * - Assumes we can write to pte2s without pte2_store() atomic ops. 1495 * - Assumes nothing will ever test these addresses for 0 to indicate 1496 * no mapping instead of correctly checking PTE2_V. 1497 * - Assumes a vm_offset_t will fit in a pte2 (true for arm). 1498 * Because PTE2_V is never set, there can be no mappings to invalidate. 
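 *
 * A sketch of the chain after pmap_pte2list_init(&head, base, 3), with a
 * hypothetical 'base' shown only to illustrate the threading:
 *
 *	head == base
 *	*pt2map_entry(base)                  == base + PAGE_SIZE
 *	*pt2map_entry(base + PAGE_SIZE)      == base + 2 * PAGE_SIZE
 *	*pt2map_entry(base + 2 * PAGE_SIZE)  == 0	(end of list)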
1499 */ 1500 static vm_offset_t 1501 pmap_pte2list_alloc(vm_offset_t *head) 1502 { 1503 pt2_entry_t *pte2p; 1504 vm_offset_t va; 1505 1506 va = *head; 1507 if (va == 0) 1508 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 1509 pte2p = pt2map_entry(va); 1510 *head = *pte2p; 1511 if (*head & PTE2_V) 1512 panic("%s: va with PTE2_V set!", __func__); 1513 *pte2p = 0; 1514 return (va); 1515 } 1516 1517 static void 1518 pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) 1519 { 1520 pt2_entry_t *pte2p; 1521 1522 if (va & PTE2_V) 1523 panic("%s: freeing va with PTE2_V set!", __func__); 1524 pte2p = pt2map_entry(va); 1525 *pte2p = *head; /* virtual! PTE2_V is 0 though */ 1526 *head = va; 1527 } 1528 1529 static void 1530 pmap_pte2list_init(vm_offset_t *head, void *base, int npages) 1531 { 1532 int i; 1533 vm_offset_t va; 1534 1535 *head = 0; 1536 for (i = npages - 1; i >= 0; i--) { 1537 va = (vm_offset_t)base + i * PAGE_SIZE; 1538 pmap_pte2list_free(head, va); 1539 } 1540 } 1541 1542 /***************************************************************************** 1543 * 1544 * PMAP third and final stage initialization. 1545 * 1546 * After pmap_init() is called, PMAP subsystem is fully initialized. 1547 * 1548 *****************************************************************************/ 1549 1550 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1551 "VM/pmap parameters"); 1552 1553 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 1554 "Max number of PV entries"); 1555 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 1556 "Page share factor per proc"); 1557 1558 static u_long nkpt2pg = NKPT2PG; 1559 SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, 1560 &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); 1561 1562 static int sp_enabled = 1; 1563 SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 1564 &sp_enabled, 0, "Are large page mappings enabled?"); 1565 1566 bool 1567 pmap_ps_enabled(pmap_t pmap __unused) 1568 { 1569 1570 return (sp_enabled != 0); 1571 } 1572 1573 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1574 "1MB page mapping counters"); 1575 1576 static u_long pmap_pte1_demotions; 1577 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, 1578 &pmap_pte1_demotions, 0, "1MB page demotions"); 1579 1580 static u_long pmap_pte1_mappings; 1581 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, 1582 &pmap_pte1_mappings, 0, "1MB page mappings"); 1583 1584 static u_long pmap_pte1_p_failures; 1585 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, 1586 &pmap_pte1_p_failures, 0, "1MB page promotion failures"); 1587 1588 static u_long pmap_pte1_promotions; 1589 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, 1590 &pmap_pte1_promotions, 0, "1MB page promotions"); 1591 1592 static u_long pmap_pte1_kern_demotions; 1593 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, 1594 &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); 1595 1596 static u_long pmap_pte1_kern_promotions; 1597 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, 1598 &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); 1599 1600 static __inline ttb_entry_t 1601 pmap_ttb_get(pmap_t pmap) 1602 { 1603 1604 return (vtophys(pmap->pm_pt1) | ttb_flags); 1605 } 1606 1607 /* 1608 * Initialize a vm_page's machine-dependent fields. 1609 * 1610 * Variations: 1611 * 1. Pages for L2 page tables are always not managed. 
So, pv_list and 1612 * pt2_wirecount can share same physical space. However, proper 1613 * initialization on a page alloc for page tables and reinitialization 1614 * on the page free must be ensured. 1615 */ 1616 void 1617 pmap_page_init(vm_page_t m) 1618 { 1619 1620 TAILQ_INIT(&m->md.pv_list); 1621 pt2_wirecount_init(m); 1622 m->md.pat_mode = VM_MEMATTR_DEFAULT; 1623 } 1624 1625 /* 1626 * Virtualization for faster way how to zero whole page. 1627 */ 1628 static __inline void 1629 pagezero(void *page) 1630 { 1631 1632 bzero(page, PAGE_SIZE); 1633 } 1634 1635 /* 1636 * Zero L2 page table page. 1637 * Use same KVA as in pmap_zero_page(). 1638 */ 1639 static __inline vm_paddr_t 1640 pmap_pt2pg_zero(vm_page_t m) 1641 { 1642 pt2_entry_t *cmap2_pte2p; 1643 vm_paddr_t pa; 1644 struct pcpu *pc; 1645 1646 pa = VM_PAGE_TO_PHYS(m); 1647 1648 /* 1649 * XXX: For now, we map whole page even if it's already zero, 1650 * to sync it even if the sync is only DSB. 1651 */ 1652 sched_pin(); 1653 pc = get_pcpu(); 1654 cmap2_pte2p = pc->pc_cmap2_pte2p; 1655 mtx_lock(&pc->pc_cmap_lock); 1656 if (pte2_load(cmap2_pte2p) != 0) 1657 panic("%s: CMAP2 busy", __func__); 1658 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 1659 vm_page_pte2_attr(m))); 1660 /* Even VM_ALLOC_ZERO request is only advisory. */ 1661 if ((m->flags & PG_ZERO) == 0) 1662 pagezero(pc->pc_cmap2_addr); 1663 pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); 1664 pte2_clear(cmap2_pte2p); 1665 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 1666 1667 /* 1668 * Unpin the thread before releasing the lock. Otherwise the thread 1669 * could be rescheduled while still bound to the current CPU, only 1670 * to unpin itself immediately upon resuming execution. 1671 */ 1672 sched_unpin(); 1673 mtx_unlock(&pc->pc_cmap_lock); 1674 1675 return (pa); 1676 } 1677 1678 /* 1679 * Init just allocated page as L2 page table(s) holder 1680 * and return its physical address. 1681 */ 1682 static __inline vm_paddr_t 1683 pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) 1684 { 1685 vm_paddr_t pa; 1686 pt2_entry_t *pte2p; 1687 1688 /* Check page attributes. */ 1689 if (m->md.pat_mode != pt_memattr) 1690 pmap_page_set_memattr(m, pt_memattr); 1691 1692 /* Zero page and init wire counts. */ 1693 pa = pmap_pt2pg_zero(m); 1694 pt2_wirecount_init(m); 1695 1696 /* 1697 * Map page to PT2MAP address space for given pmap. 1698 * Note that PT2MAP space is shared with all pmaps. 1699 */ 1700 if (pmap == kernel_pmap) 1701 pmap_kenter_pt2tab(va, PTE2_KPT(pa)); 1702 else { 1703 pte2p = pmap_pt2tab_entry(pmap, va); 1704 pt2tab_store(pte2p, PTE2_KPT_NG(pa)); 1705 } 1706 1707 return (pa); 1708 } 1709 1710 /* 1711 * Initialize the pmap module. 1712 * Called by vm_init, to initialize any structures that the pmap 1713 * system needs to map virtual memory. 1714 */ 1715 void 1716 pmap_init(void) 1717 { 1718 vm_size_t s; 1719 pt2_entry_t *pte2p, pte2; 1720 u_int i, pte1_idx, pv_npg; 1721 1722 PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); 1723 1724 /* 1725 * Initialize the vm page array entries for kernel pmap's 1726 * L2 page table pages allocated in advance. 
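 * The vm_page structures themselves were created thanks to the
 * vm_phys_add_seg() call in pmap_bootstrap_prepare(); the loop below
 * only fixes up their pindex and phys_addr fields.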
1727 */ 1728 pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); 1729 pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); 1730 for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { 1731 vm_paddr_t pa; 1732 vm_page_t m; 1733 1734 pte2 = pte2_load(pte2p); 1735 KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); 1736 1737 pa = pte2_pa(pte2); 1738 m = PHYS_TO_VM_PAGE(pa); 1739 KASSERT(m >= vm_page_array && 1740 m < &vm_page_array[vm_page_array_size], 1741 ("%s: L2 page table page is out of range", __func__)); 1742 1743 m->pindex = pte1_idx; 1744 m->phys_addr = pa; 1745 pte1_idx += NPT2_IN_PG; 1746 } 1747 1748 /* 1749 * Initialize the address space (zone) for the pv entries. Set a 1750 * high water mark so that the system can recover from excessive 1751 * numbers of pv entries. 1752 */ 1753 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1754 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 1755 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1756 pv_entry_max = roundup(pv_entry_max, _NPCPV); 1757 pv_entry_high_water = 9 * (pv_entry_max / 10); 1758 1759 /* 1760 * Are large page mappings enabled? 1761 */ 1762 TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); 1763 if (sp_enabled) { 1764 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1765 ("%s: can't assign to pagesizes[1]", __func__)); 1766 pagesizes[1] = PTE1_SIZE; 1767 } 1768 1769 /* 1770 * Calculate the size of the pv head table for sections. 1771 * Handle the possibility that "vm_phys_segs[...].end" is zero. 1772 * Note that the table is only for sections which could be promoted. 1773 */ 1774 first_managed_pa = pte1_trunc(vm_phys_segs[0].start); 1775 pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) 1776 - first_managed_pa) / PTE1_SIZE + 1; 1777 1778 /* 1779 * Allocate memory for the pv head table for sections. 1780 */ 1781 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1782 s = round_page(s); 1783 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 1784 for (i = 0; i < pv_npg; i++) 1785 TAILQ_INIT(&pv_table[i].pv_list); 1786 1787 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 1788 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 1789 if (pv_chunkbase == NULL) 1790 panic("%s: not enough kvm for pv chunks", __func__); 1791 pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 1792 } 1793 1794 /* 1795 * Add a list of wired pages to the kva 1796 * this routine is only used for temporary 1797 * kernel mappings that do not need to have 1798 * page modification or references recorded. 1799 * Note that old mappings are simply written 1800 * over. The page *must* be wired. 1801 * Note: SMP coherent. Uses a ranged shootdown IPI. 1802 */ 1803 void 1804 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1805 { 1806 u_int anychanged; 1807 pt2_entry_t *epte2p, *pte2p, pte2; 1808 vm_page_t m; 1809 vm_paddr_t pa; 1810 1811 anychanged = 0; 1812 pte2p = pt2map_entry(sva); 1813 epte2p = pte2p + count; 1814 while (pte2p < epte2p) { 1815 m = *ma++; 1816 pa = VM_PAGE_TO_PHYS(m); 1817 pte2 = pte2_load(pte2p); 1818 if ((pte2_pa(pte2) != pa) || 1819 (pte2_attr(pte2) != vm_page_pte2_attr(m))) { 1820 anychanged++; 1821 pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, 1822 vm_page_pte2_attr(m))); 1823 } 1824 pte2p++; 1825 } 1826 if (__predict_false(anychanged)) 1827 tlb_flush_range(sva, count * PAGE_SIZE); 1828 } 1829 1830 /* 1831 * This routine tears out page mappings from the 1832 * kernel -- it is meant only for temporary mappings. 1833 * Note: SMP coherent. 
Uses a ranged shootdown IPI. 1834 */ 1835 void 1836 pmap_qremove(vm_offset_t sva, int count) 1837 { 1838 vm_offset_t va; 1839 1840 va = sva; 1841 while (count-- > 0) { 1842 pmap_kremove(va); 1843 va += PAGE_SIZE; 1844 } 1845 tlb_flush_range(sva, va - sva); 1846 } 1847 1848 /* 1849 * Are we current address space or kernel? 1850 */ 1851 static __inline int 1852 pmap_is_current(pmap_t pmap) 1853 { 1854 1855 return (pmap == kernel_pmap || 1856 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); 1857 } 1858 1859 /* 1860 * If the given pmap is not the current or kernel pmap, the returned 1861 * pte2 must be released by passing it to pmap_pte2_release(). 1862 */ 1863 static pt2_entry_t * 1864 pmap_pte2(pmap_t pmap, vm_offset_t va) 1865 { 1866 pt1_entry_t pte1; 1867 vm_paddr_t pt2pg_pa; 1868 1869 pte1 = pte1_load(pmap_pte1(pmap, va)); 1870 if (pte1_is_section(pte1)) 1871 panic("%s: attempt to map PTE1", __func__); 1872 if (pte1_is_link(pte1)) { 1873 /* Are we current address space or kernel? */ 1874 if (pmap_is_current(pmap)) 1875 return (pt2map_entry(va)); 1876 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1877 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1878 mtx_lock(&PMAP2mutex); 1879 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 1880 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 1881 tlb_flush((vm_offset_t)PADDR2); 1882 } 1883 return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1884 } 1885 return (NULL); 1886 } 1887 1888 /* 1889 * Releases a pte2 that was obtained from pmap_pte2(). 1890 * Be prepared for the pte2p being NULL. 1891 */ 1892 static __inline void 1893 pmap_pte2_release(pt2_entry_t *pte2p) 1894 { 1895 1896 if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { 1897 mtx_unlock(&PMAP2mutex); 1898 } 1899 } 1900 1901 /* 1902 * Super fast pmap_pte2 routine best used when scanning 1903 * the pv lists. This eliminates many coarse-grained 1904 * invltlb calls. Note that many of the pv list 1905 * scans are across different pmaps. It is very wasteful 1906 * to do an entire tlb flush for checking a single mapping. 1907 * 1908 * If the given pmap is not the current pmap, pvh_global_lock 1909 * must be held and curthread pinned to a CPU. 1910 */ 1911 static pt2_entry_t * 1912 pmap_pte2_quick(pmap_t pmap, vm_offset_t va) 1913 { 1914 pt1_entry_t pte1; 1915 vm_paddr_t pt2pg_pa; 1916 1917 pte1 = pte1_load(pmap_pte1(pmap, va)); 1918 if (pte1_is_section(pte1)) 1919 panic("%s: attempt to map PTE1", __func__); 1920 if (pte1_is_link(pte1)) { 1921 /* Are we current address space or kernel? */ 1922 if (pmap_is_current(pmap)) 1923 return (pt2map_entry(va)); 1924 rw_assert(&pvh_global_lock, RA_WLOCKED); 1925 KASSERT(curthread->td_pinned > 0, 1926 ("%s: curthread not pinned", __func__)); 1927 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1928 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1929 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 1930 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 1931 #ifdef SMP 1932 PMAP1cpu = PCPU_GET(cpuid); 1933 #endif 1934 tlb_flush_local((vm_offset_t)PADDR1); 1935 PMAP1changed++; 1936 } else 1937 #ifdef SMP 1938 if (PMAP1cpu != PCPU_GET(cpuid)) { 1939 PMAP1cpu = PCPU_GET(cpuid); 1940 tlb_flush_local((vm_offset_t)PADDR1); 1941 PMAP1changedcpu++; 1942 } else 1943 #endif 1944 PMAP1unchanged++; 1945 return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1946 } 1947 return (NULL); 1948 } 1949 1950 /* 1951 * Routine: pmap_extract 1952 * Function: 1953 * Extract the physical page address associated 1954 * with the given map/virtual_address pair. 
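 *	A return value of 0 means that no valid mapping exists for the
 *	given virtual address, so a caller can check, e.g. (illustrative
 *	only):
 *
 *		if ((pa = pmap_extract(pmap, va)) == 0)
 *			return (EFAULT);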
1955 */ 1956 vm_paddr_t 1957 pmap_extract(pmap_t pmap, vm_offset_t va) 1958 { 1959 vm_paddr_t pa; 1960 pt1_entry_t pte1; 1961 pt2_entry_t *pte2p; 1962 1963 PMAP_LOCK(pmap); 1964 pte1 = pte1_load(pmap_pte1(pmap, va)); 1965 if (pte1_is_section(pte1)) 1966 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1967 else if (pte1_is_link(pte1)) { 1968 pte2p = pmap_pte2(pmap, va); 1969 pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); 1970 pmap_pte2_release(pte2p); 1971 } else 1972 pa = 0; 1973 PMAP_UNLOCK(pmap); 1974 return (pa); 1975 } 1976 1977 /* 1978 * Routine: pmap_extract_and_hold 1979 * Function: 1980 * Atomically extract and hold the physical page 1981 * with the given pmap and virtual address pair 1982 * if that mapping permits the given protection. 1983 */ 1984 vm_page_t 1985 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1986 { 1987 vm_paddr_t pa; 1988 pt1_entry_t pte1; 1989 pt2_entry_t pte2, *pte2p; 1990 vm_page_t m; 1991 1992 m = NULL; 1993 PMAP_LOCK(pmap); 1994 pte1 = pte1_load(pmap_pte1(pmap, va)); 1995 if (pte1_is_section(pte1)) { 1996 if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { 1997 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1998 m = PHYS_TO_VM_PAGE(pa); 1999 if (!vm_page_wire_mapped(m)) 2000 m = NULL; 2001 } 2002 } else if (pte1_is_link(pte1)) { 2003 pte2p = pmap_pte2(pmap, va); 2004 pte2 = pte2_load(pte2p); 2005 pmap_pte2_release(pte2p); 2006 if (pte2_is_valid(pte2) && 2007 (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { 2008 pa = pte2_pa(pte2); 2009 m = PHYS_TO_VM_PAGE(pa); 2010 if (!vm_page_wire_mapped(m)) 2011 m = NULL; 2012 } 2013 } 2014 PMAP_UNLOCK(pmap); 2015 return (m); 2016 } 2017 2018 /* 2019 * Grow the number of kernel L2 page table entries, if needed. 2020 */ 2021 void 2022 pmap_growkernel(vm_offset_t addr) 2023 { 2024 vm_page_t m; 2025 vm_paddr_t pt2pg_pa, pt2_pa; 2026 pt1_entry_t pte1; 2027 pt2_entry_t pte2; 2028 2029 PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); 2030 /* 2031 * All the time kernel_vm_end is first KVA for which underlying 2032 * L2 page table is either not allocated or linked from L1 page table 2033 * (not considering sections). Except for two possible cases: 2034 * 2035 * (1) in the very beginning as long as pmap_growkernel() was 2036 * not called, it could be first unused KVA (which is not 2037 * rounded up to PTE1_SIZE), 2038 * 2039 * (2) when all KVA space is mapped and vm_map_max(kernel_map) 2040 * address is not rounded up to PTE1_SIZE. (For example, 2041 * it could be 0xFFFFFFFF.) 2042 */ 2043 kernel_vm_end = pte1_roundup(kernel_vm_end); 2044 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2045 addr = roundup2(addr, PTE1_SIZE); 2046 if (addr - 1 >= vm_map_max(kernel_map)) 2047 addr = vm_map_max(kernel_map); 2048 while (kernel_vm_end < addr) { 2049 pte1 = pte1_load(kern_pte1(kernel_vm_end)); 2050 if (pte1_is_valid(pte1)) { 2051 kernel_vm_end += PTE1_SIZE; 2052 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2053 kernel_vm_end = vm_map_max(kernel_map); 2054 break; 2055 } 2056 continue; 2057 } 2058 2059 /* 2060 * kernel_vm_end_new is used in pmap_pinit() when kernel 2061 * mappings are entered to new pmap all at once to avoid race 2062 * between pmap_kenter_pte1() and kernel_vm_end increase. 2063 * The same aplies to pmap_kenter_pt2tab(). 2064 */ 2065 kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; 2066 2067 pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); 2068 if (!pte2_is_valid(pte2)) { 2069 /* 2070 * Install new PT2s page into kernel PT2TAB. 
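 * A fresh wired page is allocated to hold the NPT2_IN_PG L2 page
 * tables; failure to allocate it here is fatal because the kernel
 * map must always be able to grow.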
2071 */ 2072 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 2073 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2074 if (m == NULL) 2075 panic("%s: no memory to grow kernel", __func__); 2076 m->pindex = pte1_index(kernel_vm_end) & ~PT2PG_MASK; 2077 2078 /* 2079 * QQQ: To link all new L2 page tables from L1 page 2080 * table now and so pmap_kenter_pte1() them 2081 * at once together with pmap_kenter_pt2tab() 2082 * could be nice speed up. However, 2083 * pmap_growkernel() does not happen so often... 2084 * QQQ: The other TTBR is another option. 2085 */ 2086 pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, 2087 m); 2088 } else 2089 pt2pg_pa = pte2_pa(pte2); 2090 2091 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); 2092 pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); 2093 2094 kernel_vm_end = kernel_vm_end_new; 2095 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2096 kernel_vm_end = vm_map_max(kernel_map); 2097 break; 2098 } 2099 } 2100 } 2101 2102 static int 2103 kvm_size(SYSCTL_HANDLER_ARGS) 2104 { 2105 unsigned long ksize = vm_max_kernel_address - KERNBASE; 2106 2107 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2108 } 2109 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, 2110 CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_size, "IU", 2111 "Size of KVM"); 2112 2113 static int 2114 kvm_free(SYSCTL_HANDLER_ARGS) 2115 { 2116 unsigned long kfree = vm_max_kernel_address - kernel_vm_end; 2117 2118 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2119 } 2120 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, 2121 CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_free, "IU", 2122 "Amount of KVM free"); 2123 2124 /*********************************************** 2125 * 2126 * Pmap allocation/deallocation routines. 2127 * 2128 ***********************************************/ 2129 2130 /* 2131 * Initialize the pmap for the swapper process. 2132 */ 2133 void 2134 pmap_pinit0(pmap_t pmap) 2135 { 2136 PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); 2137 2138 PMAP_LOCK_INIT(pmap); 2139 2140 /* 2141 * Kernel page table directory and pmap stuff around is already 2142 * initialized, we are using it right now and here. So, finish 2143 * only PMAP structures initialization for process0 ... 2144 * 2145 * Since the L1 page table and PT2TAB is shared with the kernel pmap, 2146 * which is already included in the list "allpmaps", this pmap does 2147 * not need to be inserted into that list. 2148 */ 2149 pmap->pm_pt1 = kern_pt1; 2150 pmap->pm_pt2tab = kern_pt2tab; 2151 CPU_ZERO(&pmap->pm_active); 2152 PCPU_SET(curpmap, pmap); 2153 TAILQ_INIT(&pmap->pm_pvchunk); 2154 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2155 CPU_SET(0, &pmap->pm_active); 2156 } 2157 2158 static __inline void 2159 pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, 2160 vm_offset_t eva) 2161 { 2162 u_int idx, count; 2163 2164 idx = pte1_index(sva); 2165 count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); 2166 bcopy(spte1p + idx, dpte1p + idx, count); 2167 } 2168 2169 static __inline void 2170 pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, 2171 vm_offset_t eva) 2172 { 2173 u_int idx, count; 2174 2175 idx = pt2tab_index(sva); 2176 count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); 2177 bcopy(spte2p + idx, dpte2p + idx, count); 2178 } 2179 2180 /* 2181 * Initialize a preallocated and zeroed pmap structure, 2182 * such as one in a vmspace structure. 
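 * Returns 1 on success and 0 if the L1 page table or the PT2TAB
 * cannot be allocated.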
2183 */ 2184 int 2185 pmap_pinit(pmap_t pmap) 2186 { 2187 pt1_entry_t *pte1p; 2188 pt2_entry_t *pte2p; 2189 vm_paddr_t pa, pt2tab_pa; 2190 u_int i; 2191 2192 PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, 2193 pmap->pm_pt1)); 2194 2195 /* 2196 * No need to allocate L2 page table space yet but we do need 2197 * a valid L1 page table and PT2TAB table. 2198 * 2199 * Install shared kernel mappings to these tables. It's a little 2200 * tricky as some parts of KVA are reserved for vectors, devices, 2201 * and whatever else. These parts are supposed to be above 2202 * vm_max_kernel_address. Thus two regions should be installed: 2203 * 2204 * (1) <KERNBASE, kernel_vm_end), 2205 * (2) <vm_max_kernel_address, 0xFFFFFFFF>. 2206 * 2207 * QQQ: The second region should be stable enough to be installed 2208 * only once in time when the tables are allocated. 2209 * QQQ: Maybe copy of both regions at once could be faster ... 2210 * QQQ: Maybe the other TTBR is an option. 2211 * 2212 * Finally, install own PT2TAB table to these tables. 2213 */ 2214 2215 if (pmap->pm_pt1 == NULL) { 2216 pmap->pm_pt1 = kmem_alloc_contig(NB_IN_PT1, 2217 M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr); 2218 if (pmap->pm_pt1 == NULL) 2219 return (0); 2220 } 2221 if (pmap->pm_pt2tab == NULL) { 2222 /* 2223 * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page 2224 * only, what should be the only size for 32 bit systems, 2225 * then we could allocate it with vm_page_alloc() and all 2226 * the stuff needed as other L2 page table pages. 2227 * (2) Note that a process PT2TAB is special L2 page table 2228 * page. Its mapping in kernel_arena is permanent and can 2229 * be used no matter which process is current. Its mapping 2230 * in PT2MAP can be used only for current process. 2231 */ 2232 pmap->pm_pt2tab = kmem_alloc_attr(NB_IN_PT2TAB, 2233 M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); 2234 if (pmap->pm_pt2tab == NULL) { 2235 /* 2236 * QQQ: As struct pmap is allocated from UMA with 2237 * UMA_ZONE_NOFREE flag, it's important to leave 2238 * no allocation in pmap if initialization failed. 2239 */ 2240 kmem_free(pmap->pm_pt1, NB_IN_PT1); 2241 pmap->pm_pt1 = NULL; 2242 return (0); 2243 } 2244 /* 2245 * QQQ: Each L2 page table page vm_page_t has pindex set to 2246 * pte1 index of virtual address mapped by this page. 2247 * It's not valid for non kernel PT2TABs themselves. 2248 * The pindex of these pages can not be altered because 2249 * of the way how they are allocated now. However, it 2250 * should not be a problem. 2251 */ 2252 } 2253 2254 mtx_lock_spin(&allpmaps_lock); 2255 /* 2256 * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), 2257 * kernel_vm_end_new is used here instead of kernel_vm_end. 2258 */ 2259 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, 2260 kernel_vm_end_new - 1); 2261 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 2262 0xFFFFFFFF); 2263 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, 2264 kernel_vm_end_new - 1); 2265 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 2266 0xFFFFFFFF); 2267 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 2268 mtx_unlock_spin(&allpmaps_lock); 2269 2270 /* 2271 * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. 2272 * I.e. self reference mapping. The PT2TAB is private, however mapped 2273 * into shared PT2MAP space, so the mapping should be not global. 
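 * (PTE2_KPT_NG() below creates exactly such a non-global kernel mapping.)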
2274 */ 2275 pt2tab_pa = vtophys(pmap->pm_pt2tab); 2276 pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); 2277 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 2278 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 2279 } 2280 2281 /* Insert PT2MAP PT2s into pmap PT1. */ 2282 pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); 2283 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 2284 pte1_store(pte1p++, PTE1_LINK(pa)); 2285 } 2286 2287 /* 2288 * Now synchronize new mapping which was made above. 2289 */ 2290 pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); 2291 pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); 2292 2293 CPU_ZERO(&pmap->pm_active); 2294 TAILQ_INIT(&pmap->pm_pvchunk); 2295 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2296 2297 return (1); 2298 } 2299 2300 #ifdef INVARIANTS 2301 static boolean_t 2302 pt2tab_user_is_empty(pt2_entry_t *tab) 2303 { 2304 u_int i, end; 2305 2306 end = pt2tab_index(VM_MAXUSER_ADDRESS); 2307 for (i = 0; i < end; i++) 2308 if (tab[i] != 0) return (FALSE); 2309 return (TRUE); 2310 } 2311 #endif 2312 /* 2313 * Release any resources held by the given physical map. 2314 * Called when a pmap initialized by pmap_pinit is being released. 2315 * Should only be called if the map contains no valid mappings. 2316 */ 2317 void 2318 pmap_release(pmap_t pmap) 2319 { 2320 #ifdef INVARIANTS 2321 vm_offset_t start, end; 2322 #endif 2323 KASSERT(pmap->pm_stats.resident_count == 0, 2324 ("%s: pmap resident count %ld != 0", __func__, 2325 pmap->pm_stats.resident_count)); 2326 KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), 2327 ("%s: has allocated user PT2(s)", __func__)); 2328 KASSERT(CPU_EMPTY(&pmap->pm_active), 2329 ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); 2330 2331 mtx_lock_spin(&allpmaps_lock); 2332 LIST_REMOVE(pmap, pm_list); 2333 mtx_unlock_spin(&allpmaps_lock); 2334 2335 #ifdef INVARIANTS 2336 start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); 2337 end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); 2338 bzero((char *)pmap->pm_pt1 + start, end - start); 2339 2340 start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); 2341 end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); 2342 bzero((char *)pmap->pm_pt2tab + start, end - start); 2343 #endif 2344 /* 2345 * We are leaving PT1 and PT2TAB allocated on released pmap, 2346 * so hopefully UMA vmspace_zone will always be inited with 2347 * UMA_ZONE_NOFREE flag. 2348 */ 2349 } 2350 2351 /********************************************************* 2352 * 2353 * L2 table pages and their pages management routines. 2354 * 2355 *********************************************************/ 2356 2357 /* 2358 * Virtual interface for L2 page table wire counting. 2359 * 2360 * Each L2 page table in a page has own counter which counts a number of 2361 * valid mappings in a table. Global page counter counts mappings in all 2362 * tables in a page plus a single itself mapping in PT2TAB. 2363 * 2364 * During a promotion we leave the associated L2 page table counter 2365 * untouched, so the table (strictly speaking a page which holds it) 2366 * is never freed if promoted. 2367 * 2368 * If a page m->ref_count == 1 then no valid mappings exist in any L2 page 2369 * table in the page and the page itself is only mapped in PT2TAB. 2370 */ 2371 2372 static __inline void 2373 pt2_wirecount_init(vm_page_t m) 2374 { 2375 u_int i; 2376 2377 /* 2378 * Note: A page m is allocated with VM_ALLOC_WIRED flag and 2379 * m->ref_count should be already set correctly. 
2380 * So, there is no need to set it again herein. 2381 */ 2382 for (i = 0; i < NPT2_IN_PG; i++) 2383 m->md.pt2_wirecount[i] = 0; 2384 } 2385 2386 static __inline void 2387 pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) 2388 { 2389 2390 /* 2391 * Note: A just modificated pte2 (i.e. already allocated) 2392 * is acquiring one extra reference which must be 2393 * explicitly cleared. It influences the KASSERTs herein. 2394 * All L2 page tables in a page always belong to the same 2395 * pmap, so we allow only one extra reference for the page. 2396 */ 2397 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), 2398 ("%s: PT2 is overflowing ...", __func__)); 2399 KASSERT(m->ref_count <= (NPTE2_IN_PG + 1), 2400 ("%s: PT2PG is overflowing ...", __func__)); 2401 2402 m->ref_count++; 2403 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; 2404 } 2405 2406 static __inline void 2407 pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) 2408 { 2409 2410 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, 2411 ("%s: PT2 is underflowing ...", __func__)); 2412 KASSERT(m->ref_count > 1, 2413 ("%s: PT2PG is underflowing ...", __func__)); 2414 2415 m->ref_count--; 2416 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; 2417 } 2418 2419 static __inline void 2420 pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) 2421 { 2422 2423 KASSERT(count <= NPTE2_IN_PT2, 2424 ("%s: invalid count %u", __func__, count)); 2425 KASSERT(m->ref_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], 2426 ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->ref_count, 2427 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); 2428 2429 m->ref_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; 2430 m->ref_count += count; 2431 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; 2432 2433 KASSERT(m->ref_count <= (NPTE2_IN_PG + 1), 2434 ("%s: PT2PG is overflowed (%u) ...", __func__, m->ref_count)); 2435 } 2436 2437 static __inline uint32_t 2438 pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) 2439 { 2440 2441 return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); 2442 } 2443 2444 static __inline boolean_t 2445 pt2_is_empty(vm_page_t m, vm_offset_t va) 2446 { 2447 2448 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); 2449 } 2450 2451 static __inline boolean_t 2452 pt2_is_full(vm_page_t m, vm_offset_t va) 2453 { 2454 2455 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 2456 NPTE2_IN_PT2); 2457 } 2458 2459 static __inline boolean_t 2460 pt2pg_is_empty(vm_page_t m) 2461 { 2462 2463 return (m->ref_count == 1); 2464 } 2465 2466 /* 2467 * This routine is called if the L2 page table 2468 * is not mapped correctly. 2469 */ 2470 static vm_page_t 2471 _pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2472 { 2473 uint32_t pte1_idx; 2474 pt1_entry_t *pte1p; 2475 pt2_entry_t pte2; 2476 vm_page_t m; 2477 vm_paddr_t pt2pg_pa, pt2_pa; 2478 2479 pte1_idx = pte1_index(va); 2480 pte1p = pmap->pm_pt1 + pte1_idx; 2481 2482 KASSERT(pte1_load(pte1p) == 0, 2483 ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, 2484 pte1_load(pte1p))); 2485 2486 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); 2487 if (!pte2_is_valid(pte2)) { 2488 /* 2489 * Install new PT2s page into pmap PT2TAB. 2490 */ 2491 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2492 if (m == NULL) { 2493 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 2494 PMAP_UNLOCK(pmap); 2495 rw_wunlock(&pvh_global_lock); 2496 vm_wait(NULL); 2497 rw_wlock(&pvh_global_lock); 2498 PMAP_LOCK(pmap); 2499 } 2500 2501 /* 2502 * Indicate the need to retry. 
While waiting,
2503 * the L2 page table page may have been allocated.
2504 */
2505 return (NULL);
2506 }
2507 m->pindex = pte1_idx & ~PT2PG_MASK;
2508 pmap->pm_stats.resident_count++;
2509 pt2pg_pa = pmap_pt2pg_init(pmap, va, m);
2510 } else {
2511 pt2pg_pa = pte2_pa(pte2);
2512 m = PHYS_TO_VM_PAGE(pt2pg_pa);
2513 }
2514
2515 pt2_wirecount_inc(m, pte1_idx);
2516 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx);
2517 pte1_store(pte1p, PTE1_LINK(pt2_pa));
2518
2519 return (m);
2520 }
2521
2522 static vm_page_t
2523 pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags)
2524 {
2525 u_int pte1_idx;
2526 pt1_entry_t *pte1p, pte1;
2527 vm_page_t m;
2528
2529 pte1_idx = pte1_index(va);
2530 retry:
2531 pte1p = pmap->pm_pt1 + pte1_idx;
2532 pte1 = pte1_load(pte1p);
2533
2534 /*
2535 * This supports switching from a 1MB page to a
2536 * normal 4K page.
2537 */
2538 if (pte1_is_section(pte1)) {
2539 (void)pmap_demote_pte1(pmap, pte1p, va);
2540 /*
2541 * Reload pte1 after demotion.
2542 *
2543 * Note: Demotion can even fail, either because no PT2 is found
2544 * for the virtual address or because a PT2PG cannot be allocated.
2545 */
2546 pte1 = pte1_load(pte1p);
2547 }
2548
2549 /*
2550 * If the L2 page table page is mapped, we just increment the
2551 * hold count, and activate it.
2552 */
2553 if (pte1_is_link(pte1)) {
2554 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
2555 pt2_wirecount_inc(m, pte1_idx);
2556 } else {
2557 /*
2558 * We get here if the PT2 isn't mapped, or if it has
2559 * been deallocated.
2560 */
2561 m = _pmap_allocpte2(pmap, va, flags);
2562 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
2563 goto retry;
2564 }
2565
2566 return (m);
2567 }
2568
2569 /*
2570 * Schedule the specified unused L2 page table page to be freed. Specifically,
2571 * add the page to the specified list of pages that will be released to the
2572 * physical memory manager after the TLB has been updated.
2573 */
2574 static __inline void
2575 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free)
2576 {
2577
2578 /*
2579 * Put the page on a list so that it is released only after
2580 * *ALL* TLB shootdown is done.
2581 */
2582 #ifdef PMAP_DEBUG
2583 pmap_zero_page_check(m);
2584 #endif
2585 m->flags |= PG_ZERO;
2586 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2587 }
2588
2589 /*
2590 * Unwire a page holding L2 page tables.
2591 */
2592 static void
2593 pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m)
2594 {
2595 pt1_entry_t *pte1p, opte1 __unused;
2596 pt2_entry_t *pte2p;
2597 uint32_t i;
2598
2599 KASSERT(pt2pg_is_empty(m),
2600 ("%s: pmap %p PT2PG %p wired", __func__, pmap, m));
2601
2602 /*
2603 * Unmap all L2 page tables in the page from the L1 page table.
2604 *
2605 * QQQ: Individual L2 page tables (except the last one) could be
2606 * unmapped earlier. However, we do them all at once here.
2607 */
2608 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK),
2609 ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m));
2610 pte1p = pmap->pm_pt1 + m->pindex;
2611 for (i = 0; i < NPT2_IN_PG; i++, pte1p++) {
2612 KASSERT(m->md.pt2_wirecount[i] == 0,
2613 ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m));
2614 opte1 = pte1_load(pte1p);
2615 if (pte1_is_link(opte1)) {
2616 pte1_clear(pte1p);
2617 /*
2618 * Flush the intermediate TLB cache.
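 * The flush targets the base address of the 1MB region whose
 * PTE1 link was just cleared.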
*/
2620 pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT);
2621 }
2622 #ifdef INVARIANTS
2623 else
2624 KASSERT((opte1 == 0) || pte1_is_section(opte1),
2625 ("%s: pmap %p va %#x bad pte1 %x at %u", __func__,
2626 pmap, va, opte1, i));
2627 #endif
2628 }
2629
2630 /*
2631 * Unmap the page from PT2TAB.
2632 */
2633 pte2p = pmap_pt2tab_entry(pmap, va);
2634 (void)pt2tab_load_clear(pte2p);
2635 pmap_tlb_flush(pmap, pt2map_pt2pg(va));
2636
2637 m->ref_count = 0;
2638 pmap->pm_stats.resident_count--;
2639
2640 /*
2641 * This barrier ensures that the ordinary store unmapping
2642 * the L2 page table page is globally performed before the TLB
2643 * shootdown is begun.
2644 */
2645 wmb();
2646 vm_wire_sub(1);
2647 }
2648
2649 /*
2650 * Decrements an L2 page table page's wire count, which is used to record the
2651 * number of valid page table entries within the page. If the wire count
2652 * drops to zero, then the page table page is unmapped. Returns TRUE if the
2653 * page table page was unmapped and FALSE otherwise.
2654 */
2655 static __inline boolean_t
2656 pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2657 {
2658 pt2_wirecount_dec(m, pte1_index(va));
2659 if (pt2pg_is_empty(m)) {
2660 /*
2661 * QQQ: The wire count is zero, so the whole page should be
2662 * zero and we can set the PG_ZERO flag on it.
2663 * Note that when promotion is enabled, it takes some
2664 * more effort. See pmap_unwire_pt2_all() below.
2665 */
2666 pmap_unwire_pt2pg(pmap, va, m);
2667 pmap_add_delayed_free_list(m, free);
2668 return (TRUE);
2669 } else
2670 return (FALSE);
2671 }
2672
2673 /*
2674 * Drops an L2 page table page's wire count all at once. The wire count
2675 * records the number of valid L2 page table entries within the page. If
2676 * the wire count drops to zero, then the L2 page table page is unmapped.
2677 */
2678 static __inline void
2679 pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m,
2680 struct spglist *free)
2681 {
2682 u_int pte1_idx = pte1_index(va);
2683
2684 KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK),
2685 ("%s: PT2 page's pindex is wrong", __func__));
2686 KASSERT(m->ref_count > pt2_wirecount_get(m, pte1_idx),
2687 ("%s: bad pt2 wire count %u > %u", __func__, m->ref_count,
2688 pt2_wirecount_get(m, pte1_idx)));
2689
2690 /*
2691 * It's possible that the L2 page table was never used.
2692 * This happens when a section was created without promotion.
2693 */
2694 if (pt2_is_full(m, va)) {
2695 pt2_wirecount_set(m, pte1_idx, 0);
2696
2697 /*
2698 * QQQ: We clear the L2 page table now, so that the PG_ZERO
2699 * flag can be set when the L2 page table page is freed ...
2700 * This function is called only on section mappings, so
2701 * hopefully it's not too big an overhead.
2702 *
2703 * XXX: If pmap is current, the existing PT2MAP mapping could
2704 * be used for zeroing.
2705 */
2706 pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2);
2707 }
2708 #ifdef INVARIANTS
2709 else
2710 KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)",
2711 __func__, pt2_wirecount_get(m, pte1_idx)));
2712 #endif
2713 if (pt2pg_is_empty(m)) {
2714 pmap_unwire_pt2pg(pmap, va, m);
2715 pmap_add_delayed_free_list(m, free);
2716 }
2717 }
2718
2719 /*
2720 * After removing an L2 page table entry, this routine is used to
2721 * conditionally free the page and manage the hold/wire counts.
2722 */ 2723 static boolean_t 2724 pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) 2725 { 2726 pt1_entry_t pte1; 2727 vm_page_t mpte; 2728 2729 if (va >= VM_MAXUSER_ADDRESS) 2730 return (FALSE); 2731 pte1 = pte1_load(pmap_pte1(pmap, va)); 2732 mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2733 return (pmap_unwire_pt2(pmap, va, mpte, free)); 2734 } 2735 2736 /************************************* 2737 * 2738 * Page management routines. 2739 * 2740 *************************************/ 2741 2742 static const uint32_t pc_freemask[_NPCM] = { 2743 [0 ... _NPCM - 2] = PC_FREEN, 2744 [_NPCM - 1] = PC_FREEL 2745 }; 2746 2747 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2748 "Current number of pv entries"); 2749 2750 #ifdef PV_STATS 2751 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2752 2753 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2754 "Current number of pv entry chunks"); 2755 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2756 "Current number of pv entry chunks allocated"); 2757 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2758 "Current number of pv entry chunks frees"); 2759 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 2760 0, "Number of times tried to get a chunk page but failed."); 2761 2762 static long pv_entry_frees, pv_entry_allocs; 2763 static int pv_entry_spare; 2764 2765 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2766 "Current number of pv entry frees"); 2767 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 2768 0, "Current number of pv entry allocs"); 2769 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2770 "Current number of spare pv entries"); 2771 #endif 2772 2773 /* 2774 * Is given page managed? 2775 */ 2776 static __inline bool 2777 is_managed(vm_paddr_t pa) 2778 { 2779 vm_page_t m; 2780 2781 m = PHYS_TO_VM_PAGE(pa); 2782 if (m == NULL) 2783 return (false); 2784 return ((m->oflags & VPO_UNMANAGED) == 0); 2785 } 2786 2787 static __inline bool 2788 pte1_is_managed(pt1_entry_t pte1) 2789 { 2790 2791 return (is_managed(pte1_pa(pte1))); 2792 } 2793 2794 static __inline bool 2795 pte2_is_managed(pt2_entry_t pte2) 2796 { 2797 2798 return (is_managed(pte2_pa(pte2))); 2799 } 2800 2801 /* 2802 * We are in a serious low memory condition. Resort to 2803 * drastic measures to free some pages so we can allocate 2804 * another pv entry chunk. 2805 */ 2806 static vm_page_t 2807 pmap_pv_reclaim(pmap_t locked_pmap) 2808 { 2809 struct pch newtail; 2810 struct pv_chunk *pc; 2811 struct md_page *pvh; 2812 pt1_entry_t *pte1p; 2813 pmap_t pmap; 2814 pt2_entry_t *pte2p, tpte2; 2815 pv_entry_t pv; 2816 vm_offset_t va; 2817 vm_page_t m, m_pc; 2818 struct spglist free; 2819 uint32_t inuse; 2820 int bit, field, freed; 2821 2822 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2823 pmap = NULL; 2824 m_pc = NULL; 2825 SLIST_INIT(&free); 2826 TAILQ_INIT(&newtail); 2827 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2828 SLIST_EMPTY(&free))) { 2829 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2830 if (pmap != pc->pc_pmap) { 2831 if (pmap != NULL) { 2832 if (pmap != locked_pmap) 2833 PMAP_UNLOCK(pmap); 2834 } 2835 pmap = pc->pc_pmap; 2836 /* Avoid deadlock and lock recursion. 
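 * Pmap locks are taken in ascending address order; when that order
 * cannot be honored, only a trylock is attempted.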
*/ 2837 if (pmap > locked_pmap) 2838 PMAP_LOCK(pmap); 2839 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2840 pmap = NULL; 2841 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2842 continue; 2843 } 2844 } 2845 2846 /* 2847 * Destroy every non-wired, 4 KB page mapping in the chunk. 2848 */ 2849 freed = 0; 2850 for (field = 0; field < _NPCM; field++) { 2851 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2852 inuse != 0; inuse &= ~(1UL << bit)) { 2853 bit = ffs(inuse) - 1; 2854 pv = &pc->pc_pventry[field * 32 + bit]; 2855 va = pv->pv_va; 2856 pte1p = pmap_pte1(pmap, va); 2857 if (pte1_is_section(pte1_load(pte1p))) 2858 continue; 2859 pte2p = pmap_pte2(pmap, va); 2860 tpte2 = pte2_load(pte2p); 2861 if ((tpte2 & PTE2_W) == 0) 2862 tpte2 = pte2_load_clear(pte2p); 2863 pmap_pte2_release(pte2p); 2864 if ((tpte2 & PTE2_W) != 0) 2865 continue; 2866 KASSERT(tpte2 != 0, 2867 ("pmap_pv_reclaim: pmap %p va %#x zero pte", 2868 pmap, va)); 2869 pmap_tlb_flush(pmap, va); 2870 m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); 2871 if (pte2_is_dirty(tpte2)) 2872 vm_page_dirty(m); 2873 if ((tpte2 & PTE2_A) != 0) 2874 vm_page_aflag_set(m, PGA_REFERENCED); 2875 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2876 if (TAILQ_EMPTY(&m->md.pv_list) && 2877 (m->flags & PG_FICTITIOUS) == 0) { 2878 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2879 if (TAILQ_EMPTY(&pvh->pv_list)) { 2880 vm_page_aflag_clear(m, 2881 PGA_WRITEABLE); 2882 } 2883 } 2884 pc->pc_map[field] |= 1UL << bit; 2885 pmap_unuse_pt2(pmap, va, &free); 2886 freed++; 2887 } 2888 } 2889 if (freed == 0) { 2890 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2891 continue; 2892 } 2893 /* Every freed mapping is for a 4 KB page. */ 2894 pmap->pm_stats.resident_count -= freed; 2895 PV_STAT(pv_entry_frees += freed); 2896 PV_STAT(pv_entry_spare += freed); 2897 pv_entry_count -= freed; 2898 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2899 for (field = 0; field < _NPCM; field++) 2900 if (pc->pc_map[field] != pc_freemask[field]) { 2901 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2902 pc_list); 2903 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2904 2905 /* 2906 * One freed pv entry in locked_pmap is 2907 * sufficient. 2908 */ 2909 if (pmap == locked_pmap) 2910 goto out; 2911 break; 2912 } 2913 if (field == _NPCM) { 2914 PV_STAT(pv_entry_spare -= _NPCPV); 2915 PV_STAT(pc_chunk_count--); 2916 PV_STAT(pc_chunk_frees++); 2917 /* Entire chunk is free; return it. */ 2918 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2919 pmap_qremove((vm_offset_t)pc, 1); 2920 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2921 break; 2922 } 2923 } 2924 out: 2925 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2926 if (pmap != NULL) { 2927 if (pmap != locked_pmap) 2928 PMAP_UNLOCK(pmap); 2929 } 2930 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2931 m_pc = SLIST_FIRST(&free); 2932 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2933 /* Recycle a freed page table page. 
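 * The page is handed back to the caller wired and with a reference
 * count of one so that it can be reused as a pv chunk page.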
*/ 2934 m_pc->ref_count = 1; 2935 vm_wire_add(1); 2936 } 2937 vm_page_free_pages_toq(&free, false); 2938 return (m_pc); 2939 } 2940 2941 static void 2942 free_pv_chunk(struct pv_chunk *pc) 2943 { 2944 vm_page_t m; 2945 2946 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2947 PV_STAT(pv_entry_spare -= _NPCPV); 2948 PV_STAT(pc_chunk_count--); 2949 PV_STAT(pc_chunk_frees++); 2950 /* entire chunk is free, return it */ 2951 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2952 pmap_qremove((vm_offset_t)pc, 1); 2953 vm_page_unwire_noq(m); 2954 vm_page_free(m); 2955 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2956 } 2957 2958 /* 2959 * Free the pv_entry back to the free list. 2960 */ 2961 static void 2962 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2963 { 2964 struct pv_chunk *pc; 2965 int idx, field, bit; 2966 2967 rw_assert(&pvh_global_lock, RA_WLOCKED); 2968 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2969 PV_STAT(pv_entry_frees++); 2970 PV_STAT(pv_entry_spare++); 2971 pv_entry_count--; 2972 pc = pv_to_chunk(pv); 2973 idx = pv - &pc->pc_pventry[0]; 2974 field = idx / 32; 2975 bit = idx % 32; 2976 pc->pc_map[field] |= 1ul << bit; 2977 for (idx = 0; idx < _NPCM; idx++) 2978 if (pc->pc_map[idx] != pc_freemask[idx]) { 2979 /* 2980 * 98% of the time, pc is already at the head of the 2981 * list. If it isn't already, move it to the head. 2982 */ 2983 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2984 pc)) { 2985 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2986 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2987 pc_list); 2988 } 2989 return; 2990 } 2991 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2992 free_pv_chunk(pc); 2993 } 2994 2995 /* 2996 * Get a new pv_entry, allocating a block from the system 2997 * when needed. 2998 */ 2999 static pv_entry_t 3000 get_pv_entry(pmap_t pmap, boolean_t try) 3001 { 3002 static const struct timeval printinterval = { 60, 0 }; 3003 static struct timeval lastprint; 3004 int bit, field; 3005 pv_entry_t pv; 3006 struct pv_chunk *pc; 3007 vm_page_t m; 3008 3009 rw_assert(&pvh_global_lock, RA_WLOCKED); 3010 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3011 PV_STAT(pv_entry_allocs++); 3012 pv_entry_count++; 3013 if (pv_entry_count > pv_entry_high_water) 3014 if (ratecheck(&lastprint, &printinterval)) 3015 printf("Approaching the limit on PV entries, consider " 3016 "increasing either the vm.pmap.shpgperproc or the " 3017 "vm.pmap.pv_entries tunable.\n"); 3018 retry: 3019 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3020 if (pc != NULL) { 3021 for (field = 0; field < _NPCM; field++) { 3022 if (pc->pc_map[field]) { 3023 bit = ffs(pc->pc_map[field]) - 1; 3024 break; 3025 } 3026 } 3027 if (field < _NPCM) { 3028 pv = &pc->pc_pventry[field * 32 + bit]; 3029 pc->pc_map[field] &= ~(1ul << bit); 3030 /* If this was the last item, move it to tail */ 3031 for (field = 0; field < _NPCM; field++) 3032 if (pc->pc_map[field] != 0) { 3033 PV_STAT(pv_entry_spare--); 3034 return (pv); /* not full, return */ 3035 } 3036 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3037 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3038 PV_STAT(pv_entry_spare--); 3039 return (pv); 3040 } 3041 } 3042 /* 3043 * Access to the pte2list "pv_vafree" is synchronized by the pvh 3044 * global lock. If "pv_vafree" is currently non-empty, it will 3045 * remain non-empty until pmap_pte2list_alloc() completes. 
3046 */ 3047 if (pv_vafree == 0 || 3048 (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { 3049 if (try) { 3050 pv_entry_count--; 3051 PV_STAT(pc_chunk_tryfail++); 3052 return (NULL); 3053 } 3054 m = pmap_pv_reclaim(pmap); 3055 if (m == NULL) 3056 goto retry; 3057 } 3058 PV_STAT(pc_chunk_count++); 3059 PV_STAT(pc_chunk_allocs++); 3060 pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); 3061 pmap_qenter((vm_offset_t)pc, &m, 1); 3062 pc->pc_pmap = pmap; 3063 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 3064 for (field = 1; field < _NPCM; field++) 3065 pc->pc_map[field] = pc_freemask[field]; 3066 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3067 pv = &pc->pc_pventry[0]; 3068 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3069 PV_STAT(pv_entry_spare += _NPCPV - 1); 3070 return (pv); 3071 } 3072 3073 /* 3074 * Create a pv entry for page at pa for 3075 * (pmap, va). 3076 */ 3077 static void 3078 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3079 { 3080 pv_entry_t pv; 3081 3082 rw_assert(&pvh_global_lock, RA_WLOCKED); 3083 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3084 pv = get_pv_entry(pmap, FALSE); 3085 pv->pv_va = va; 3086 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3087 } 3088 3089 static __inline pv_entry_t 3090 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3091 { 3092 pv_entry_t pv; 3093 3094 rw_assert(&pvh_global_lock, RA_WLOCKED); 3095 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3096 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3097 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3098 break; 3099 } 3100 } 3101 return (pv); 3102 } 3103 3104 static void 3105 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3106 { 3107 pv_entry_t pv; 3108 3109 pv = pmap_pvh_remove(pvh, pmap, va); 3110 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3111 free_pv_entry(pmap, pv); 3112 } 3113 3114 static void 3115 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 3116 { 3117 struct md_page *pvh; 3118 3119 rw_assert(&pvh_global_lock, RA_WLOCKED); 3120 pmap_pvh_free(&m->md, pmap, va); 3121 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 3122 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3123 if (TAILQ_EMPTY(&pvh->pv_list)) 3124 vm_page_aflag_clear(m, PGA_WRITEABLE); 3125 } 3126 } 3127 3128 static void 3129 pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3130 { 3131 struct md_page *pvh; 3132 pv_entry_t pv; 3133 vm_offset_t va_last; 3134 vm_page_t m; 3135 3136 rw_assert(&pvh_global_lock, RA_WLOCKED); 3137 KASSERT((pa & PTE1_OFFSET) == 0, 3138 ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); 3139 3140 /* 3141 * Transfer the 1mpage's pv entry for this mapping to the first 3142 * page's pv list. 3143 */ 3144 pvh = pa_to_pvh(pa); 3145 va = pte1_trunc(va); 3146 pv = pmap_pvh_remove(pvh, pmap, va); 3147 KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); 3148 m = PHYS_TO_VM_PAGE(pa); 3149 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3150 /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. 
*/ 3151 va_last = va + PTE1_SIZE - PAGE_SIZE; 3152 do { 3153 m++; 3154 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3155 ("pmap_pv_demote_pte1: page %p is not managed", m)); 3156 va += PAGE_SIZE; 3157 pmap_insert_entry(pmap, va, m); 3158 } while (va < va_last); 3159 } 3160 3161 #if VM_NRESERVLEVEL > 0 3162 static void 3163 pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3164 { 3165 struct md_page *pvh; 3166 pv_entry_t pv; 3167 vm_offset_t va_last; 3168 vm_page_t m; 3169 3170 rw_assert(&pvh_global_lock, RA_WLOCKED); 3171 KASSERT((pa & PTE1_OFFSET) == 0, 3172 ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); 3173 3174 /* 3175 * Transfer the first page's pv entry for this mapping to the 3176 * 1mpage's pv list. Aside from avoiding the cost of a call 3177 * to get_pv_entry(), a transfer avoids the possibility that 3178 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() 3179 * removes one of the mappings that is being promoted. 3180 */ 3181 m = PHYS_TO_VM_PAGE(pa); 3182 va = pte1_trunc(va); 3183 pv = pmap_pvh_remove(&m->md, pmap, va); 3184 KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); 3185 pvh = pa_to_pvh(pa); 3186 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3187 /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */ 3188 va_last = va + PTE1_SIZE - PAGE_SIZE; 3189 do { 3190 m++; 3191 va += PAGE_SIZE; 3192 pmap_pvh_free(&m->md, pmap, va); 3193 } while (va < va_last); 3194 } 3195 #endif 3196 3197 /* 3198 * Conditionally create a pv entry. 3199 */ 3200 static boolean_t 3201 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3202 { 3203 pv_entry_t pv; 3204 3205 rw_assert(&pvh_global_lock, RA_WLOCKED); 3206 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3207 if (pv_entry_count < pv_entry_high_water && 3208 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 3209 pv->pv_va = va; 3210 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3211 return (TRUE); 3212 } else 3213 return (FALSE); 3214 } 3215 3216 /* 3217 * Create the pv entries for each of the pages within a section. 3218 */ 3219 static bool 3220 pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags) 3221 { 3222 struct md_page *pvh; 3223 pv_entry_t pv; 3224 bool noreclaim; 3225 3226 rw_assert(&pvh_global_lock, RA_WLOCKED); 3227 noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; 3228 if ((noreclaim && pv_entry_count >= pv_entry_high_water) || 3229 (pv = get_pv_entry(pmap, noreclaim)) == NULL) 3230 return (false); 3231 pv->pv_va = va; 3232 pvh = pa_to_pvh(pte1_pa(pte1)); 3233 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3234 return (true); 3235 } 3236 3237 static inline void 3238 pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) 3239 { 3240 3241 /* Kill all the small mappings or the big one only. */ 3242 if (pte1_is_section(npte1)) 3243 pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); 3244 else 3245 pmap_tlb_flush(pmap, pte1_trunc(va)); 3246 } 3247 3248 /* 3249 * Update kernel pte1 on all pmaps. 3250 * 3251 * The following function is called only on one cpu with disabled interrupts. 3252 * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way 3253 * nobody can invoke explicit hardware table walk during the update of pte1. 3254 * Unsolicited hardware table walk can still happen, invoked by speculative 3255 * data or instruction prefetch or even by speculative hardware table walk. 3256 * 3257 * The break-before-make approach should be implemented here. 
However, it's 3258 * not so easy to do that for kernel mappings as it would be unhappy to unmap 3259 * itself unexpectedly but voluntarily. 3260 */ 3261 static void 3262 pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) 3263 { 3264 pmap_t pmap; 3265 pt1_entry_t *pte1p; 3266 3267 /* 3268 * Get current pmap. Interrupts should be disabled here 3269 * so PCPU_GET() is done atomically. 3270 */ 3271 pmap = PCPU_GET(curpmap); 3272 if (pmap == NULL) 3273 pmap = kernel_pmap; 3274 3275 /* 3276 * (1) Change pte1 on current pmap. 3277 * (2) Flush all obsolete TLB entries on current CPU. 3278 * (3) Change pte1 on all pmaps. 3279 * (4) Flush all obsolete TLB entries on all CPUs in SMP case. 3280 */ 3281 3282 pte1p = pmap_pte1(pmap, va); 3283 pte1_store(pte1p, npte1); 3284 3285 /* Kill all the small mappings or the big one only. */ 3286 if (pte1_is_section(npte1)) { 3287 pmap_pte1_kern_promotions++; 3288 tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); 3289 } else { 3290 pmap_pte1_kern_demotions++; 3291 tlb_flush_local(pte1_trunc(va)); 3292 } 3293 3294 /* 3295 * In SMP case, this function is called when all cpus are at smp 3296 * rendezvous, so there is no need to use 'allpmaps_lock' lock here. 3297 * In UP case, the function is called with this lock locked. 3298 */ 3299 LIST_FOREACH(pmap, &allpmaps, pm_list) { 3300 pte1p = pmap_pte1(pmap, va); 3301 pte1_store(pte1p, npte1); 3302 } 3303 3304 #ifdef SMP 3305 /* Kill all the small mappings or the big one only. */ 3306 if (pte1_is_section(npte1)) 3307 tlb_flush_range(pte1_trunc(va), PTE1_SIZE); 3308 else 3309 tlb_flush(pte1_trunc(va)); 3310 #endif 3311 } 3312 3313 #ifdef SMP 3314 struct pte1_action { 3315 vm_offset_t va; 3316 pt1_entry_t npte1; 3317 u_int update; /* CPU that updates the PTE1 */ 3318 }; 3319 3320 static void 3321 pmap_update_pte1_action(void *arg) 3322 { 3323 struct pte1_action *act = arg; 3324 3325 if (act->update == PCPU_GET(cpuid)) 3326 pmap_update_pte1_kernel(act->va, act->npte1); 3327 } 3328 3329 /* 3330 * Change pte1 on current pmap. 3331 * Note that kernel pte1 must be changed on all pmaps. 3332 * 3333 * According to the architecture reference manual published by ARM, 3334 * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. 3335 * According to this manual, UNPREDICTABLE behaviours must never happen in 3336 * a viable system. In contrast, on x86 processors, it is not specified which 3337 * TLB entry mapping the virtual address will be used, but the MMU doesn't 3338 * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone 3339 * Black). 3340 * 3341 * It's a problem when either promotion or demotion is being done. The pte1 3342 * update and appropriate TLB flush must be done atomically in general. 3343 */ 3344 static void 3345 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3346 pt1_entry_t npte1) 3347 { 3348 3349 if (pmap == kernel_pmap) { 3350 struct pte1_action act; 3351 3352 sched_pin(); 3353 act.va = va; 3354 act.npte1 = npte1; 3355 act.update = PCPU_GET(cpuid); 3356 smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier, 3357 pmap_update_pte1_action, NULL, &act); 3358 sched_unpin(); 3359 } else { 3360 register_t cspr; 3361 3362 /* 3363 * Use break-before-make approach for changing userland 3364 * mappings. It can cause L1 translation aborts on other 3365 * cores in SMP case. So, special treatment is implemented 3366 * in pmap_fault(). 
To reduce the likelihood that another core 3367 * will be affected by the broken mapping, disable interrupts 3368 * until the mapping change is completed. 3369 */ 3370 cspr = disable_interrupts(PSR_I | PSR_F); 3371 pte1_clear(pte1p); 3372 pmap_tlb_flush_pte1(pmap, va, npte1); 3373 pte1_store(pte1p, npte1); 3374 restore_interrupts(cspr); 3375 } 3376 } 3377 #else 3378 static void 3379 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3380 pt1_entry_t npte1) 3381 { 3382 3383 if (pmap == kernel_pmap) { 3384 mtx_lock_spin(&allpmaps_lock); 3385 pmap_update_pte1_kernel(va, npte1); 3386 mtx_unlock_spin(&allpmaps_lock); 3387 } else { 3388 register_t cspr; 3389 3390 /* 3391 * Use break-before-make approach for changing userland 3392 * mappings. It's absolutely safe in UP case when interrupts 3393 * are disabled. 3394 */ 3395 cspr = disable_interrupts(PSR_I | PSR_F); 3396 pte1_clear(pte1p); 3397 pmap_tlb_flush_pte1(pmap, va, npte1); 3398 pte1_store(pte1p, npte1); 3399 restore_interrupts(cspr); 3400 } 3401 } 3402 #endif 3403 3404 #if VM_NRESERVLEVEL > 0 3405 /* 3406 * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are 3407 * within a single page table page (PT2) to a single 1MB page mapping. 3408 * For promotion to occur, two conditions must be met: (1) the 4KB page 3409 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3410 * mappings must have identical characteristics. 3411 * 3412 * Managed (PG_MANAGED) mappings within the kernel address space are not 3413 * promoted. The reason is that kernel PTE1s are replicated in each pmap but 3414 * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only 3415 * read the PTE1 from the kernel pmap. 3416 */ 3417 static void 3418 pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3419 { 3420 pt1_entry_t npte1; 3421 pt2_entry_t *fpte2p, fpte2, fpte2_fav; 3422 pt2_entry_t *pte2p, pte2; 3423 vm_offset_t pteva __unused; 3424 vm_page_t m __unused; 3425 3426 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3427 pmap, va, pte1_load(pte1p), pte1p)); 3428 3429 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3430 3431 /* 3432 * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is 3433 * either invalid, unused, or does not map the first 4KB physical page 3434 * within a 1MB page. 3435 */ 3436 fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); 3437 fpte2 = pte2_load(fpte2p); 3438 if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != 3439 (PTE2_A | PTE2_V)) { 3440 pmap_pte1_p_failures++; 3441 CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", 3442 __func__, va, pmap); 3443 return; 3444 } 3445 if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { 3446 pmap_pte1_p_failures++; 3447 CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", 3448 __func__, va, pmap); 3449 return; 3450 } 3451 if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3452 /* 3453 * When page is not modified, PTE2_RO can be set without 3454 * a TLB invalidation. 3455 */ 3456 fpte2 |= PTE2_RO; 3457 pte2_store(fpte2p, fpte2); 3458 } 3459 3460 /* 3461 * Examine each of the other PTE2s in the specified PT2. Abort if this 3462 * PTE2 maps an unexpected 4KB physical page or does not have identical 3463 * characteristics to the first PTE2. 
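 * The scan runs from the last PTE2 in the PT2 down to the second one,
 * comparing each entry's frame address and PTE2_PROMOTE attributes
 * against the values expected from the first PTE2.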
3464 */ 3465 fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); 3466 fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ 3467 for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { 3468 pte2 = pte2_load(pte2p); 3469 if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { 3470 pmap_pte1_p_failures++; 3471 CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", 3472 __func__, va, pmap); 3473 return; 3474 } 3475 if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3476 /* 3477 * When page is not modified, PTE2_RO can be set 3478 * without a TLB invalidation. See note above. 3479 */ 3480 pte2 |= PTE2_RO; 3481 pte2_store(pte2p, pte2); 3482 pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & 3483 PTE2_FRAME); 3484 CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", 3485 __func__, pteva, pmap); 3486 } 3487 if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { 3488 pmap_pte1_p_failures++; 3489 CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", 3490 __func__, va, pmap); 3491 return; 3492 } 3493 3494 fpte2_fav -= PTE2_SIZE; 3495 } 3496 /* 3497 * The page table page in its current state will stay in PT2TAB 3498 * until the PTE1 mapping the section is demoted by pmap_demote_pte1() 3499 * or destroyed by pmap_remove_pte1(). 3500 * 3501 * Note that L2 page table size is not equal to PAGE_SIZE. 3502 */ 3503 m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); 3504 KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], 3505 ("%s: PT2 page is out of range", __func__)); 3506 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 3507 ("%s: PT2 page's pindex is wrong", __func__)); 3508 3509 /* 3510 * Get pte1 from pte2 format. 3511 */ 3512 npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; 3513 3514 /* 3515 * Promote the pv entries. 3516 */ 3517 if (pte2_is_managed(fpte2)) 3518 pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); 3519 3520 /* 3521 * Promote the mappings. 3522 */ 3523 pmap_change_pte1(pmap, pte1p, va, npte1); 3524 3525 pmap_pte1_promotions++; 3526 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3527 __func__, va, pmap); 3528 3529 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3530 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3531 } 3532 #endif /* VM_NRESERVLEVEL > 0 */ 3533 3534 /* 3535 * Zero L2 page table page. 3536 */ 3537 static __inline void 3538 pmap_clear_pt2(pt2_entry_t *fpte2p) 3539 { 3540 pt2_entry_t *pte2p; 3541 3542 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) 3543 pte2_clear(pte2p); 3544 3545 } 3546 3547 /* 3548 * Removes a 1MB page mapping from the kernel pmap. 3549 */ 3550 static void 3551 pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3552 { 3553 vm_page_t m; 3554 uint32_t pte1_idx; 3555 pt2_entry_t *fpte2p; 3556 vm_paddr_t pt2_pa; 3557 3558 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3559 m = pmap_pt2_page(pmap, va); 3560 if (m == NULL) 3561 /* 3562 * QQQ: Is this function called only on promoted pte1? 3563 * We certainly do section mappings directly 3564 * (without promotion) in kernel !!! 3565 */ 3566 panic("%s: missing pt2 page", __func__); 3567 3568 pte1_idx = pte1_index(va); 3569 3570 /* 3571 * Initialize the L2 page table. 3572 */ 3573 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3574 pmap_clear_pt2(fpte2p); 3575 3576 /* 3577 * Remove the mapping. 
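 * The section mapping is replaced by a link to the (now zeroed) L2 page
 * table so that the kernel address range remains covered.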
3578 */ 3579 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); 3580 pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); 3581 3582 /* 3583 * QQQ: We do not need to invalidate PT2MAP mapping 3584 * as we did not change it. I.e. the L2 page table page 3585 * was and still is mapped the same way. 3586 */ 3587 } 3588 3589 /* 3590 * Do the things to unmap a section in a process 3591 */ 3592 static void 3593 pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 3594 struct spglist *free) 3595 { 3596 pt1_entry_t opte1; 3597 struct md_page *pvh; 3598 vm_offset_t eva, va; 3599 vm_page_t m; 3600 3601 PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, 3602 pte1_load(pte1p), pte1p)); 3603 3604 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3605 KASSERT((sva & PTE1_OFFSET) == 0, 3606 ("%s: sva is not 1mpage aligned", __func__)); 3607 3608 /* 3609 * Clear and invalidate the mapping. It should occupy one and only TLB 3610 * entry. So, pmap_tlb_flush() called with aligned address should be 3611 * sufficient. 3612 */ 3613 opte1 = pte1_load_clear(pte1p); 3614 pmap_tlb_flush(pmap, sva); 3615 3616 if (pte1_is_wired(opte1)) 3617 pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; 3618 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 3619 if (pte1_is_managed(opte1)) { 3620 pvh = pa_to_pvh(pte1_pa(opte1)); 3621 pmap_pvh_free(pvh, pmap, sva); 3622 eva = sva + PTE1_SIZE; 3623 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 3624 va < eva; va += PAGE_SIZE, m++) { 3625 if (pte1_is_dirty(opte1)) 3626 vm_page_dirty(m); 3627 if (opte1 & PTE1_A) 3628 vm_page_aflag_set(m, PGA_REFERENCED); 3629 if (TAILQ_EMPTY(&m->md.pv_list) && 3630 TAILQ_EMPTY(&pvh->pv_list)) 3631 vm_page_aflag_clear(m, PGA_WRITEABLE); 3632 } 3633 } 3634 if (pmap == kernel_pmap) { 3635 /* 3636 * L2 page table(s) can't be removed from kernel map as 3637 * kernel counts on it (stuff around pmap_growkernel()). 3638 */ 3639 pmap_remove_kernel_pte1(pmap, pte1p, sva); 3640 } else { 3641 /* 3642 * Get associated L2 page table page. 3643 * It's possible that the page was never allocated. 3644 */ 3645 m = pmap_pt2_page(pmap, sva); 3646 if (m != NULL) 3647 pmap_unwire_pt2_all(pmap, sva, m, free); 3648 } 3649 } 3650 3651 /* 3652 * Fills L2 page table page with mappings to consecutive physical pages. 3653 */ 3654 static __inline void 3655 pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) 3656 { 3657 pt2_entry_t *pte2p; 3658 3659 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { 3660 pte2_store(pte2p, npte2); 3661 npte2 += PTE2_SIZE; 3662 } 3663 } 3664 3665 /* 3666 * Tries to demote a 1MB page mapping. If demotion fails, the 3667 * 1MB page mapping is invalidated. 
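 * Returns TRUE if the demotion succeeded and FALSE if the mapping was
 * destroyed instead.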
3668 */ 3669 static boolean_t 3670 pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3671 { 3672 pt1_entry_t opte1, npte1; 3673 pt2_entry_t *fpte2p, npte2; 3674 vm_paddr_t pt2pg_pa, pt2_pa; 3675 vm_page_t m; 3676 struct spglist free; 3677 uint32_t pte1_idx, isnew = 0; 3678 3679 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3680 pmap, va, pte1_load(pte1p), pte1p)); 3681 3682 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3683 3684 opte1 = pte1_load(pte1p); 3685 KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); 3686 3687 if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { 3688 KASSERT(!pte1_is_wired(opte1), 3689 ("%s: PT2 page for a wired mapping is missing", __func__)); 3690 3691 /* 3692 * Invalidate the 1MB page mapping and return 3693 * "failure" if the mapping was never accessed or the 3694 * allocation of the new page table page fails. 3695 */ 3696 if ((opte1 & PTE1_A) == 0 || 3697 (m = vm_page_alloc_noobj(VM_ALLOC_WIRED)) == NULL) { 3698 SLIST_INIT(&free); 3699 pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); 3700 vm_page_free_pages_toq(&free, false); 3701 CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", 3702 __func__, va, pmap); 3703 return (FALSE); 3704 } 3705 m->pindex = pte1_index(va) & ~PT2PG_MASK; 3706 if (va < VM_MAXUSER_ADDRESS) 3707 pmap->pm_stats.resident_count++; 3708 3709 isnew = 1; 3710 3711 /* 3712 * We init all L2 page tables in the page even if 3713 * we are going to change everything for one L2 page 3714 * table in a while. 3715 */ 3716 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 3717 } else { 3718 if (va < VM_MAXUSER_ADDRESS) { 3719 if (pt2_is_empty(m, va)) 3720 isnew = 1; /* Demoting section w/o promotion. */ 3721 #ifdef INVARIANTS 3722 else 3723 KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" 3724 " count %u", __func__, 3725 pt2_wirecount_get(m, pte1_index(va)))); 3726 #endif 3727 } 3728 } 3729 3730 pt2pg_pa = VM_PAGE_TO_PHYS(m); 3731 pte1_idx = pte1_index(va); 3732 /* 3733 * If the pmap is current, then the PT2MAP can provide access to 3734 * the page table page (promoted L2 page tables are not unmapped). 3735 * Otherwise, temporarily map the L2 page table page (m) into 3736 * the kernel's address space at either PADDR1 or PADDR2. 3737 * 3738 * Note that L2 page table size is not equal to PAGE_SIZE. 3739 */ 3740 if (pmap_is_current(pmap)) 3741 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3742 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 3743 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 3744 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 3745 #ifdef SMP 3746 PMAP1cpu = PCPU_GET(cpuid); 3747 #endif 3748 tlb_flush_local((vm_offset_t)PADDR1); 3749 PMAP1changed++; 3750 } else 3751 #ifdef SMP 3752 if (PMAP1cpu != PCPU_GET(cpuid)) { 3753 PMAP1cpu = PCPU_GET(cpuid); 3754 tlb_flush_local((vm_offset_t)PADDR1); 3755 PMAP1changedcpu++; 3756 } else 3757 #endif 3758 PMAP1unchanged++; 3759 fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); 3760 } else { 3761 mtx_lock(&PMAP2mutex); 3762 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 3763 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 3764 tlb_flush((vm_offset_t)PADDR2); 3765 } 3766 fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); 3767 } 3768 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 3769 npte1 = PTE1_LINK(pt2_pa); 3770 3771 KASSERT((opte1 & PTE1_A) != 0, 3772 ("%s: opte1 is missing PTE1_A", __func__)); 3773 KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, 3774 ("%s: opte1 has PTE1_NM", __func__)); 3775 3776 /* 3777 * Get pte2 from pte1 format. 
3778 */ 3779 npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; 3780 3781 /* 3782 * If the L2 page table page is new, initialize it. If the mapping 3783 * has changed attributes, update the page table entries. 3784 */ 3785 if (isnew != 0) { 3786 pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); 3787 pmap_fill_pt2(fpte2p, npte2); 3788 } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != 3789 (npte2 & PTE2_PROMOTE)) 3790 pmap_fill_pt2(fpte2p, npte2); 3791 3792 KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), 3793 ("%s: fpte2p and npte2 map different physical addresses", 3794 __func__)); 3795 3796 if (fpte2p == PADDR2) 3797 mtx_unlock(&PMAP2mutex); 3798 3799 /* 3800 * Demote the mapping. This pmap is locked. The old PTE1 has 3801 * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also 3802 * has not PTE1_NM set. Thus, there is no danger of a race with 3803 * another processor changing the setting of PTE1_A and/or PTE1_NM 3804 * between the read above and the store below. 3805 */ 3806 pmap_change_pte1(pmap, pte1p, va, npte1); 3807 3808 /* 3809 * Demote the pv entry. This depends on the earlier demotion 3810 * of the mapping. Specifically, the (re)creation of a per- 3811 * page pv entry might trigger the execution of pmap_pv_reclaim(), 3812 * which might reclaim a newly (re)created per-page pv entry 3813 * and destroy the associated mapping. In order to destroy 3814 * the mapping, the PTE1 must have already changed from mapping 3815 * the 1mpage to referencing the page table page. 3816 */ 3817 if (pte1_is_managed(opte1)) 3818 pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); 3819 3820 pmap_pte1_demotions++; 3821 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3822 __func__, va, pmap); 3823 3824 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3825 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3826 return (TRUE); 3827 } 3828 3829 /* 3830 * Insert the given physical page (p) at 3831 * the specified virtual address (v) in the 3832 * target physical map with the protection requested. 3833 * 3834 * If specified, the page will be wired down, meaning 3835 * that the related pte can not be reclaimed. 3836 * 3837 * NB: This is the only routine which MAY NOT lazy-evaluate 3838 * or lose information. That is, this routine must actually 3839 * insert this page into the given map NOW. 
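 *
 * A psind of 1 asks for a 1MB section mapping: va must then be 1MB
 * aligned and m must belong to a fully populated superpage-sized run
 * of pages (m->psind > 0); such requests are handed off to
 * pmap_enter_pte1().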
3840 */ 3841 int 3842 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3843 u_int flags, int8_t psind) 3844 { 3845 pt1_entry_t *pte1p; 3846 pt2_entry_t *pte2p; 3847 pt2_entry_t npte2, opte2; 3848 pv_entry_t pv; 3849 vm_paddr_t opa, pa; 3850 vm_page_t mpte2, om; 3851 int rv; 3852 3853 va = trunc_page(va); 3854 KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); 3855 KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, 3856 ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, 3857 va)); 3858 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 3859 ("%s: managed mapping within the clean submap", __func__)); 3860 if ((m->oflags & VPO_UNMANAGED) == 0) 3861 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3862 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 3863 ("%s: flags %u has reserved bits set", __func__, flags)); 3864 pa = VM_PAGE_TO_PHYS(m); 3865 npte2 = PTE2(pa, PTE2_A, vm_page_pte2_attr(m)); 3866 if ((flags & VM_PROT_WRITE) == 0) 3867 npte2 |= PTE2_NM; 3868 if ((prot & VM_PROT_WRITE) == 0) 3869 npte2 |= PTE2_RO; 3870 KASSERT((npte2 & (PTE2_NM | PTE2_RO)) != PTE2_RO, 3871 ("%s: flags includes VM_PROT_WRITE but prot doesn't", __func__)); 3872 if ((prot & VM_PROT_EXECUTE) == 0) 3873 npte2 |= PTE2_NX; 3874 if ((flags & PMAP_ENTER_WIRED) != 0) 3875 npte2 |= PTE2_W; 3876 if (va < VM_MAXUSER_ADDRESS) 3877 npte2 |= PTE2_U; 3878 if (pmap != kernel_pmap) 3879 npte2 |= PTE2_NG; 3880 3881 rw_wlock(&pvh_global_lock); 3882 PMAP_LOCK(pmap); 3883 sched_pin(); 3884 if (psind == 1) { 3885 /* Assert the required virtual and physical alignment. */ 3886 KASSERT((va & PTE1_OFFSET) == 0, 3887 ("%s: va unaligned", __func__)); 3888 KASSERT(m->psind > 0, ("%s: m->psind < psind", __func__)); 3889 rv = pmap_enter_pte1(pmap, va, PTE1_PA(pa) | ATTR_TO_L1(npte2) | 3890 PTE1_V, flags, m); 3891 goto out; 3892 } 3893 3894 /* 3895 * In the case that a page table page is not 3896 * resident, we are creating it here. 3897 */ 3898 if (va < VM_MAXUSER_ADDRESS) { 3899 mpte2 = pmap_allocpte2(pmap, va, flags); 3900 if (mpte2 == NULL) { 3901 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3902 ("pmap_allocpte2 failed with sleep allowed")); 3903 rv = KERN_RESOURCE_SHORTAGE; 3904 goto out; 3905 } 3906 } else 3907 mpte2 = NULL; 3908 pte1p = pmap_pte1(pmap, va); 3909 if (pte1_is_section(pte1_load(pte1p))) 3910 panic("%s: attempted on 1MB page", __func__); 3911 pte2p = pmap_pte2_quick(pmap, va); 3912 if (pte2p == NULL) 3913 panic("%s: invalid L1 page table entry va=%#x", __func__, va); 3914 3915 om = NULL; 3916 opte2 = pte2_load(pte2p); 3917 opa = pte2_pa(opte2); 3918 /* 3919 * Mapping has not changed, must be protection or wiring change. 3920 */ 3921 if (pte2_is_valid(opte2) && (opa == pa)) { 3922 /* 3923 * Wiring change, just update stats. We don't worry about 3924 * wiring PT2 pages as they remain resident as long as there 3925 * are valid mappings in them. Hence, if a user page is wired, 3926 * the PT2 page will be also. 3927 */ 3928 if (pte2_is_wired(npte2) && !pte2_is_wired(opte2)) 3929 pmap->pm_stats.wired_count++; 3930 else if (!pte2_is_wired(npte2) && pte2_is_wired(opte2)) 3931 pmap->pm_stats.wired_count--; 3932 3933 /* 3934 * Remove extra pte2 reference 3935 */ 3936 if (mpte2) 3937 pt2_wirecount_dec(mpte2, pte1_index(va)); 3938 if ((m->oflags & VPO_UNMANAGED) == 0) 3939 om = m; 3940 goto validate; 3941 } 3942 3943 /* 3944 * QQQ: We think that changing physical address on writeable mapping 3945 * is not safe. 
Well, maybe on kernel address space with correct 3946 * locking, it could make sense. However, we have no idea why 3947 * anyone would do that on user address space. Are we wrong? 3948 */ 3949 KASSERT((opa == 0) || (opa == pa) || 3950 !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), 3951 ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", 3952 __func__, pmap, va, opte2, opa, pa, flags, prot)); 3953 3954 pv = NULL; 3955 3956 /* 3957 * Mapping has changed, invalidate old range and fall through to 3958 * handle validating new mapping. 3959 */ 3960 if (opa) { 3961 if (pte2_is_wired(opte2)) 3962 pmap->pm_stats.wired_count--; 3963 om = PHYS_TO_VM_PAGE(opa); 3964 if (om != NULL && (om->oflags & VPO_UNMANAGED) != 0) 3965 om = NULL; 3966 if (om != NULL) 3967 pv = pmap_pvh_remove(&om->md, pmap, va); 3968 3969 /* 3970 * Remove extra pte2 reference 3971 */ 3972 if (mpte2 != NULL) 3973 pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); 3974 } else 3975 pmap->pm_stats.resident_count++; 3976 3977 /* 3978 * Enter on the PV list if part of our managed memory. 3979 */ 3980 if ((m->oflags & VPO_UNMANAGED) == 0) { 3981 if (pv == NULL) { 3982 pv = get_pv_entry(pmap, FALSE); 3983 pv->pv_va = va; 3984 } 3985 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3986 } else if (pv != NULL) 3987 free_pv_entry(pmap, pv); 3988 3989 /* 3990 * Increment counters 3991 */ 3992 if (pte2_is_wired(npte2)) 3993 pmap->pm_stats.wired_count++; 3994 3995 validate: 3996 /* 3997 * Now validate mapping with desired protection/wiring. 3998 */ 3999 if (prot & VM_PROT_WRITE) { 4000 if ((m->oflags & VPO_UNMANAGED) == 0) 4001 vm_page_aflag_set(m, PGA_WRITEABLE); 4002 } 4003 4004 /* 4005 * If the mapping or permission bits are different, we need 4006 * to update the pte2. 4007 * 4008 * QQQ: Think again and again what to do 4009 * if the mapping is going to be changed! 4010 */ 4011 if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { 4012 /* 4013 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4014 * is set. Do it now, before the mapping is stored and made 4015 * valid for hardware table walk. If done later, there is a race 4016 * for other threads of the current process in the lazy loading case. 4017 * Don't do it for kernel memory which is mapped with exec 4018 * permission even if the memory isn't going to hold executable 4019 * code. The only time when icache sync is needed is after 4020 * a kernel module is loaded and the relocation info is processed. 4021 * And it's done in elf_cpu_load_file(). 4022 * 4023 * QQQ: (1) Is there any better way 4024 * or place to sync the icache? 4025 * (2) Now, we do it on a page basis. 4026 */ 4027 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 4028 m->md.pat_mode == VM_MEMATTR_WB_WA && 4029 (opa != pa || (opte2 & PTE2_NX))) 4030 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4031 4032 if (opte2 & PTE2_V) { 4033 /* Change mapping with break-before-make approach.
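 * The old entry is cleared and its TLB entry is flushed before the
 * new entry is stored, so the TLB never holds two different
 * translations for the same VA at the same time.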
*/ 4034 opte2 = pte2_load_clear(pte2p); 4035 pmap_tlb_flush(pmap, va); 4036 pte2_store(pte2p, npte2); 4037 if (om != NULL) { 4038 KASSERT((om->oflags & VPO_UNMANAGED) == 0, 4039 ("%s: om %p unmanaged", __func__, om)); 4040 if ((opte2 & PTE2_A) != 0) 4041 vm_page_aflag_set(om, PGA_REFERENCED); 4042 if (pte2_is_dirty(opte2)) 4043 vm_page_dirty(om); 4044 if (TAILQ_EMPTY(&om->md.pv_list) && 4045 ((om->flags & PG_FICTITIOUS) != 0 || 4046 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4047 vm_page_aflag_clear(om, PGA_WRITEABLE); 4048 } 4049 } else 4050 pte2_store(pte2p, npte2); 4051 } 4052 #if 0 4053 else { 4054 /* 4055 * QQQ: In time when both access and not mofified bits are 4056 * emulated by software, this should not happen. Some 4057 * analysis is need, if this really happen. Missing 4058 * tlb flush somewhere could be the reason. 4059 */ 4060 panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, 4061 va, opte2, npte2); 4062 } 4063 #endif 4064 4065 #if VM_NRESERVLEVEL > 0 4066 /* 4067 * If both the L2 page table page and the reservation are fully 4068 * populated, then attempt promotion. 4069 */ 4070 if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && 4071 sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && 4072 vm_reserv_level_iffullpop(m) == 0) 4073 pmap_promote_pte1(pmap, pte1p, va); 4074 #endif 4075 4076 rv = KERN_SUCCESS; 4077 out: 4078 sched_unpin(); 4079 rw_wunlock(&pvh_global_lock); 4080 PMAP_UNLOCK(pmap); 4081 return (rv); 4082 } 4083 4084 /* 4085 * Do the things to unmap a page in a process. 4086 */ 4087 static int 4088 pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, 4089 struct spglist *free) 4090 { 4091 pt2_entry_t opte2; 4092 vm_page_t m; 4093 4094 rw_assert(&pvh_global_lock, RA_WLOCKED); 4095 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4096 4097 /* Clear and invalidate the mapping. */ 4098 opte2 = pte2_load_clear(pte2p); 4099 pmap_tlb_flush(pmap, va); 4100 4101 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", 4102 __func__, pmap, va, opte2)); 4103 4104 if (opte2 & PTE2_W) 4105 pmap->pm_stats.wired_count -= 1; 4106 pmap->pm_stats.resident_count -= 1; 4107 if (pte2_is_managed(opte2)) { 4108 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 4109 if (pte2_is_dirty(opte2)) 4110 vm_page_dirty(m); 4111 if (opte2 & PTE2_A) 4112 vm_page_aflag_set(m, PGA_REFERENCED); 4113 pmap_remove_entry(pmap, m, va); 4114 } 4115 return (pmap_unuse_pt2(pmap, va, free)); 4116 } 4117 4118 /* 4119 * Remove a single page from a process address space. 4120 */ 4121 static void 4122 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 4123 { 4124 pt2_entry_t *pte2p; 4125 4126 rw_assert(&pvh_global_lock, RA_WLOCKED); 4127 KASSERT(curthread->td_pinned > 0, 4128 ("%s: curthread not pinned", __func__)); 4129 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4130 if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || 4131 !pte2_is_valid(pte2_load(pte2p))) 4132 return; 4133 pmap_remove_pte2(pmap, pte2p, va, free); 4134 } 4135 4136 /* 4137 * Remove the given range of addresses from the specified map. 4138 * 4139 * It is assumed that the start and end are properly 4140 * rounded to the page size. 4141 */ 4142 void 4143 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4144 { 4145 vm_offset_t nextva; 4146 pt1_entry_t *pte1p, pte1; 4147 pt2_entry_t *pte2p, pte2; 4148 struct spglist free; 4149 4150 /* 4151 * Perform an unsynchronized read. This is, however, safe. 
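 * If resident_count reads as zero there are no valid mappings in this
 * pmap, so there is nothing to remove and taking the locks below can
 * be skipped.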
4152 */ 4153 if (pmap->pm_stats.resident_count == 0) 4154 return; 4155 4156 SLIST_INIT(&free); 4157 4158 rw_wlock(&pvh_global_lock); 4159 sched_pin(); 4160 PMAP_LOCK(pmap); 4161 4162 /* 4163 * Special handling of removing one page. A very common 4164 * operation and easy to short circuit some code. 4165 */ 4166 if (sva + PAGE_SIZE == eva) { 4167 pte1 = pte1_load(pmap_pte1(pmap, sva)); 4168 if (pte1_is_link(pte1)) { 4169 pmap_remove_page(pmap, sva, &free); 4170 goto out; 4171 } 4172 } 4173 4174 for (; sva < eva; sva = nextva) { 4175 /* 4176 * Calculate address for next L2 page table. 4177 */ 4178 nextva = pte1_trunc(sva + PTE1_SIZE); 4179 if (nextva < sva) 4180 nextva = eva; 4181 if (pmap->pm_stats.resident_count == 0) 4182 break; 4183 4184 pte1p = pmap_pte1(pmap, sva); 4185 pte1 = pte1_load(pte1p); 4186 4187 /* 4188 * Weed out invalid mappings. Note: we assume that the L1 page 4189 * table is always allocated, and in kernel virtual. 4190 */ 4191 if (pte1 == 0) 4192 continue; 4193 4194 if (pte1_is_section(pte1)) { 4195 /* 4196 * Are we removing the entire large page? If not, 4197 * demote the mapping and fall through. 4198 */ 4199 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4200 pmap_remove_pte1(pmap, pte1p, sva, &free); 4201 continue; 4202 } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4203 /* The large page mapping was destroyed. */ 4204 continue; 4205 } 4206 #ifdef INVARIANTS 4207 else { 4208 /* Update pte1 after demotion. */ 4209 pte1 = pte1_load(pte1p); 4210 } 4211 #endif 4212 } 4213 4214 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4215 " is not link", __func__, pmap, sva, pte1, pte1p)); 4216 4217 /* 4218 * Limit our scan to either the end of the va represented 4219 * by the current L2 page table page, or to the end of the 4220 * range being removed. 4221 */ 4222 if (nextva > eva) 4223 nextva = eva; 4224 4225 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; 4226 pte2p++, sva += PAGE_SIZE) { 4227 pte2 = pte2_load(pte2p); 4228 if (!pte2_is_valid(pte2)) 4229 continue; 4230 if (pmap_remove_pte2(pmap, pte2p, sva, &free)) 4231 break; 4232 } 4233 } 4234 out: 4235 sched_unpin(); 4236 rw_wunlock(&pvh_global_lock); 4237 PMAP_UNLOCK(pmap); 4238 vm_page_free_pages_toq(&free, false); 4239 } 4240 4241 /* 4242 * Routine: pmap_remove_all 4243 * Function: 4244 * Removes this physical page from 4245 * all physical maps in which it resides. 4246 * Reflects back modify bits to the pager. 4247 * 4248 * Notes: 4249 * Original versions of this routine were very 4250 * inefficient because they iteratively called 4251 * pmap_remove (slow...) 
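 *
 * The current version walks the page's pv lists directly: any 1MB
 * section mappings found on the pv-head list are demoted first, and
 * the resulting (and any pre-existing) 4KB mappings are then torn
 * down one pv entry at a time.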
4252 */ 4253 4254 void 4255 pmap_remove_all(vm_page_t m) 4256 { 4257 struct md_page *pvh; 4258 pv_entry_t pv; 4259 pmap_t pmap; 4260 pt2_entry_t *pte2p, opte2; 4261 pt1_entry_t *pte1p; 4262 vm_offset_t va; 4263 struct spglist free; 4264 4265 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4266 ("%s: page %p is not managed", __func__, m)); 4267 SLIST_INIT(&free); 4268 rw_wlock(&pvh_global_lock); 4269 sched_pin(); 4270 if ((m->flags & PG_FICTITIOUS) != 0) 4271 goto small_mappings; 4272 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4273 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4274 va = pv->pv_va; 4275 pmap = PV_PMAP(pv); 4276 PMAP_LOCK(pmap); 4277 pte1p = pmap_pte1(pmap, va); 4278 (void)pmap_demote_pte1(pmap, pte1p, va); 4279 PMAP_UNLOCK(pmap); 4280 } 4281 small_mappings: 4282 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4283 pmap = PV_PMAP(pv); 4284 PMAP_LOCK(pmap); 4285 pmap->pm_stats.resident_count--; 4286 pte1p = pmap_pte1(pmap, pv->pv_va); 4287 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " 4288 "a 1mpage in page %p's pv list", __func__, m)); 4289 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 4290 opte2 = pte2_load_clear(pte2p); 4291 pmap_tlb_flush(pmap, pv->pv_va); 4292 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", 4293 __func__, pmap, pv->pv_va)); 4294 if (pte2_is_wired(opte2)) 4295 pmap->pm_stats.wired_count--; 4296 if (opte2 & PTE2_A) 4297 vm_page_aflag_set(m, PGA_REFERENCED); 4298 4299 /* 4300 * Update the vm_page_t clean and reference bits. 4301 */ 4302 if (pte2_is_dirty(opte2)) 4303 vm_page_dirty(m); 4304 pmap_unuse_pt2(pmap, pv->pv_va, &free); 4305 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4306 free_pv_entry(pmap, pv); 4307 PMAP_UNLOCK(pmap); 4308 } 4309 vm_page_aflag_clear(m, PGA_WRITEABLE); 4310 sched_unpin(); 4311 rw_wunlock(&pvh_global_lock); 4312 vm_page_free_pages_toq(&free, false); 4313 } 4314 4315 /* 4316 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4317 * good coding style, a.k.a. 80 character line width limit hell. 4318 */ 4319 static __inline void 4320 pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, 4321 struct spglist *free) 4322 { 4323 vm_paddr_t pa; 4324 vm_page_t m, mt, mpt2pg; 4325 struct md_page *pvh; 4326 4327 pa = pte1_pa(pte1); 4328 m = PHYS_TO_VM_PAGE(pa); 4329 4330 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4331 __func__, m, m->phys_addr, pa)); 4332 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4333 m < &vm_page_array[vm_page_array_size], 4334 ("%s: bad pte1 %#x", __func__, pte1)); 4335 4336 if (pte1_is_dirty(pte1)) { 4337 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4338 vm_page_dirty(mt); 4339 } 4340 4341 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 4342 pvh = pa_to_pvh(pa); 4343 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4344 if (TAILQ_EMPTY(&pvh->pv_list)) { 4345 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4346 if (TAILQ_EMPTY(&mt->md.pv_list)) 4347 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4348 } 4349 mpt2pg = pmap_pt2_page(pmap, pv->pv_va); 4350 if (mpt2pg != NULL) 4351 pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); 4352 } 4353 4354 /* 4355 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4356 * good coding style, a.k.a. 80 character line width limit hell. 
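 *
 * Unlike pmap_remove_pte2(), no per-page TLB flush is done here;
 * pmap_remove_pages() invalidates all non-global TLB entries at once
 * with tlb_flush_all_ng_local() when it is finished.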
4357 */ 4358 static __inline void 4359 pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, 4360 struct spglist *free) 4361 { 4362 vm_paddr_t pa; 4363 vm_page_t m; 4364 struct md_page *pvh; 4365 4366 pa = pte2_pa(pte2); 4367 m = PHYS_TO_VM_PAGE(pa); 4368 4369 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4370 __func__, m, m->phys_addr, pa)); 4371 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4372 m < &vm_page_array[vm_page_array_size], 4373 ("%s: bad pte2 %#x", __func__, pte2)); 4374 4375 if (pte2_is_dirty(pte2)) 4376 vm_page_dirty(m); 4377 4378 pmap->pm_stats.resident_count--; 4379 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4380 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 4381 pvh = pa_to_pvh(pa); 4382 if (TAILQ_EMPTY(&pvh->pv_list)) 4383 vm_page_aflag_clear(m, PGA_WRITEABLE); 4384 } 4385 pmap_unuse_pt2(pmap, pv->pv_va, free); 4386 } 4387 4388 /* 4389 * Remove all pages from specified address space this aids process 4390 * exit speeds. Also, this code is special cased for current process 4391 * only, but can have the more generic (and slightly slower) mode enabled. 4392 * This is much faster than pmap_remove in the case of running down 4393 * an entire address space. 4394 */ 4395 void 4396 pmap_remove_pages(pmap_t pmap) 4397 { 4398 pt1_entry_t *pte1p, pte1; 4399 pt2_entry_t *pte2p, pte2; 4400 pv_entry_t pv; 4401 struct pv_chunk *pc, *npc; 4402 struct spglist free; 4403 int field, idx; 4404 int32_t bit; 4405 uint32_t inuse, bitmask; 4406 boolean_t allfree; 4407 4408 /* 4409 * Assert that the given pmap is only active on the current 4410 * CPU. Unfortunately, we cannot block another CPU from 4411 * activating the pmap while this function is executing. 4412 */ 4413 KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), 4414 ("%s: non-current pmap %p", __func__, pmap)); 4415 #if defined(SMP) && defined(INVARIANTS) 4416 { 4417 cpuset_t other_cpus; 4418 4419 sched_pin(); 4420 other_cpus = pmap->pm_active; 4421 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 4422 sched_unpin(); 4423 KASSERT(CPU_EMPTY(&other_cpus), 4424 ("%s: pmap %p active on other cpus", __func__, pmap)); 4425 } 4426 #endif 4427 SLIST_INIT(&free); 4428 rw_wlock(&pvh_global_lock); 4429 PMAP_LOCK(pmap); 4430 sched_pin(); 4431 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4432 KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", 4433 __func__, pmap, pc->pc_pmap)); 4434 allfree = TRUE; 4435 for (field = 0; field < _NPCM; field++) { 4436 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 4437 while (inuse != 0) { 4438 bit = ffs(inuse) - 1; 4439 bitmask = 1UL << bit; 4440 idx = field * 32 + bit; 4441 pv = &pc->pc_pventry[idx]; 4442 inuse &= ~bitmask; 4443 4444 /* 4445 * Note that we cannot remove wired pages 4446 * from a process' mapping at this time 4447 */ 4448 pte1p = pmap_pte1(pmap, pv->pv_va); 4449 pte1 = pte1_load(pte1p); 4450 if (pte1_is_section(pte1)) { 4451 if (pte1_is_wired(pte1)) { 4452 allfree = FALSE; 4453 continue; 4454 } 4455 pte1_clear(pte1p); 4456 pmap_remove_pte1_quick(pmap, pte1, pv, 4457 &free); 4458 } 4459 else if (pte1_is_link(pte1)) { 4460 pte2p = pt2map_entry(pv->pv_va); 4461 pte2 = pte2_load(pte2p); 4462 4463 if (!pte2_is_valid(pte2)) { 4464 printf("%s: pmap %p va %#x " 4465 "pte2 %#x\n", __func__, 4466 pmap, pv->pv_va, pte2); 4467 panic("bad pte2"); 4468 } 4469 4470 if (pte2_is_wired(pte2)) { 4471 allfree = FALSE; 4472 continue; 4473 } 4474 pte2_clear(pte2p); 4475 pmap_remove_pte2_quick(pmap, pte2, pv, 4476 &free); 4477 } else { 
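				/*
				 * Neither a section nor a link: the L1 entry
				 * is corrupted even though a pv entry still
				 * refers to this va, so there is nothing sane
				 * to do but panic.
				 */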
4478 printf("%s: pmap %p va %#x pte1 %#x\n", 4479 __func__, pmap, pv->pv_va, pte1); 4480 panic("bad pte1"); 4481 } 4482 4483 /* Mark free */ 4484 PV_STAT(pv_entry_frees++); 4485 PV_STAT(pv_entry_spare++); 4486 pv_entry_count--; 4487 pc->pc_map[field] |= bitmask; 4488 } 4489 } 4490 if (allfree) { 4491 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4492 free_pv_chunk(pc); 4493 } 4494 } 4495 tlb_flush_all_ng_local(); 4496 sched_unpin(); 4497 rw_wunlock(&pvh_global_lock); 4498 PMAP_UNLOCK(pmap); 4499 vm_page_free_pages_toq(&free, false); 4500 } 4501 4502 /* 4503 * This code makes some *MAJOR* assumptions: 4504 * 1. Current pmap & pmap exists. 4505 * 2. Not wired. 4506 * 3. Read access. 4507 * 4. No L2 page table pages. 4508 * but is *MUCH* faster than pmap_enter... 4509 */ 4510 static vm_page_t 4511 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4512 vm_prot_t prot, vm_page_t mpt2pg) 4513 { 4514 pt2_entry_t *pte2p, pte2; 4515 vm_paddr_t pa; 4516 struct spglist free; 4517 uint32_t l2prot; 4518 4519 KASSERT(!VA_IS_CLEANMAP(va) || 4520 (m->oflags & VPO_UNMANAGED) != 0, 4521 ("%s: managed mapping within the clean submap", __func__)); 4522 rw_assert(&pvh_global_lock, RA_WLOCKED); 4523 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4524 4525 /* 4526 * In the case that a L2 page table page is not 4527 * resident, we are creating it here. 4528 */ 4529 if (va < VM_MAXUSER_ADDRESS) { 4530 u_int pte1_idx; 4531 pt1_entry_t pte1, *pte1p; 4532 vm_paddr_t pt2_pa; 4533 4534 /* 4535 * Get L1 page table things. 4536 */ 4537 pte1_idx = pte1_index(va); 4538 pte1p = pmap_pte1(pmap, va); 4539 pte1 = pte1_load(pte1p); 4540 4541 if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { 4542 /* 4543 * Each of NPT2_IN_PG L2 page tables on the page can 4544 * come here. Make sure that associated L1 page table 4545 * link is established. 4546 * 4547 * QQQ: It comes that we don't establish all links to 4548 * L2 page tables for newly allocated L2 page 4549 * tables page. 4550 */ 4551 KASSERT(!pte1_is_section(pte1), 4552 ("%s: pte1 %#x is section", __func__, pte1)); 4553 if (!pte1_is_link(pte1)) { 4554 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), 4555 pte1_idx); 4556 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 4557 } 4558 pt2_wirecount_inc(mpt2pg, pte1_idx); 4559 } else { 4560 /* 4561 * If the L2 page table page is mapped, we just 4562 * increment the hold count, and activate it. 4563 */ 4564 if (pte1_is_section(pte1)) { 4565 return (NULL); 4566 } else if (pte1_is_link(pte1)) { 4567 mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 4568 pt2_wirecount_inc(mpt2pg, pte1_idx); 4569 } else { 4570 mpt2pg = _pmap_allocpte2(pmap, va, 4571 PMAP_ENTER_NOSLEEP); 4572 if (mpt2pg == NULL) 4573 return (NULL); 4574 } 4575 } 4576 } else { 4577 mpt2pg = NULL; 4578 } 4579 4580 /* 4581 * This call to pt2map_entry() makes the assumption that we are 4582 * entering the page into the current pmap. In order to support 4583 * quick entry into any pmap, one would likely use pmap_pte2_quick(). 4584 * But that isn't as quick as pt2map_entry(). 4585 */ 4586 pte2p = pt2map_entry(va); 4587 pte2 = pte2_load(pte2p); 4588 if (pte2_is_valid(pte2)) { 4589 if (mpt2pg != NULL) { 4590 /* 4591 * Remove extra pte2 reference 4592 */ 4593 pt2_wirecount_dec(mpt2pg, pte1_index(va)); 4594 mpt2pg = NULL; 4595 } 4596 return (NULL); 4597 } 4598 4599 /* 4600 * Enter on the PV list if part of our managed memory. 
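 * If a pv entry cannot be allocated without reclamation, give up:
 * drop the reference taken on the L2 page table page above and let
 * the caller simply skip this page.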
4601 */ 4602 if ((m->oflags & VPO_UNMANAGED) == 0 && 4603 !pmap_try_insert_pv_entry(pmap, va, m)) { 4604 if (mpt2pg != NULL) { 4605 SLIST_INIT(&free); 4606 if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { 4607 pmap_tlb_flush(pmap, va); 4608 vm_page_free_pages_toq(&free, false); 4609 } 4610 4611 mpt2pg = NULL; 4612 } 4613 return (NULL); 4614 } 4615 4616 /* 4617 * Increment counters 4618 */ 4619 pmap->pm_stats.resident_count++; 4620 4621 /* 4622 * Now validate mapping with RO protection 4623 */ 4624 pa = VM_PAGE_TO_PHYS(m); 4625 l2prot = PTE2_RO | PTE2_NM; 4626 if (va < VM_MAXUSER_ADDRESS) 4627 l2prot |= PTE2_U | PTE2_NG; 4628 if ((prot & VM_PROT_EXECUTE) == 0) 4629 l2prot |= PTE2_NX; 4630 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { 4631 /* 4632 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4633 * is set. QQQ: For more info, see comments in pmap_enter(). 4634 */ 4635 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4636 } 4637 pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); 4638 4639 return (mpt2pg); 4640 } 4641 4642 void 4643 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4644 { 4645 4646 rw_wlock(&pvh_global_lock); 4647 PMAP_LOCK(pmap); 4648 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 4649 rw_wunlock(&pvh_global_lock); 4650 PMAP_UNLOCK(pmap); 4651 } 4652 4653 /* 4654 * Tries to create a read- and/or execute-only 1 MB page mapping. Returns 4655 * true if successful. Returns false if (1) a mapping already exists at the 4656 * specified virtual address or (2) a PV entry cannot be allocated without 4657 * reclaiming another PV entry. 4658 */ 4659 static bool 4660 pmap_enter_1mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4661 { 4662 pt1_entry_t pte1; 4663 vm_paddr_t pa; 4664 4665 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4666 pa = VM_PAGE_TO_PHYS(m); 4667 pte1 = PTE1(pa, PTE1_NM | PTE1_RO, ATTR_TO_L1(vm_page_pte2_attr(m))); 4668 if ((prot & VM_PROT_EXECUTE) == 0) 4669 pte1 |= PTE1_NX; 4670 if (va < VM_MAXUSER_ADDRESS) 4671 pte1 |= PTE1_U; 4672 if (pmap != kernel_pmap) 4673 pte1 |= PTE1_NG; 4674 return (pmap_enter_pte1(pmap, va, pte1, PMAP_ENTER_NOSLEEP | 4675 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m) == KERN_SUCCESS); 4676 } 4677 4678 /* 4679 * Tries to create the specified 1 MB page mapping. Returns KERN_SUCCESS if 4680 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 4681 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 4682 * a mapping already exists at the specified virtual address. Returns 4683 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NORECLAIM was specified and PV entry 4684 * allocation failed. 4685 */ 4686 static int 4687 pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags, 4688 vm_page_t m) 4689 { 4690 struct spglist free; 4691 pt1_entry_t opte1, *pte1p; 4692 pt2_entry_t pte2, *pte2p; 4693 vm_offset_t cur, end; 4694 vm_page_t mt; 4695 4696 rw_assert(&pvh_global_lock, RA_WLOCKED); 4697 KASSERT((pte1 & (PTE1_NM | PTE1_RO)) == 0 || 4698 (pte1 & (PTE1_NM | PTE1_RO)) == (PTE1_NM | PTE1_RO), 4699 ("%s: pte1 has inconsistent NM and RO attributes", __func__)); 4700 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4701 pte1p = pmap_pte1(pmap, va); 4702 opte1 = pte1_load(pte1p); 4703 if (pte1_is_valid(opte1)) { 4704 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 4705 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4706 __func__, va, pmap); 4707 return (KERN_FAILURE); 4708 } 4709 /* Break the existing mapping(s). 
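 * Either the whole old section is removed, or the 4KB mappings under
 * the old link are swept one by one; any pages released in the
 * process are collected on "free" and returned once the sweep is
 * complete.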
*/ 4710 SLIST_INIT(&free); 4711 if (pte1_is_section(opte1)) { 4712 /* 4713 * If the section resulted from a promotion, then a 4714 * reserved PT page could be freed. 4715 */ 4716 pmap_remove_pte1(pmap, pte1p, va, &free); 4717 } else { 4718 sched_pin(); 4719 end = va + PTE1_SIZE; 4720 for (cur = va, pte2p = pmap_pte2_quick(pmap, va); 4721 cur != end; cur += PAGE_SIZE, pte2p++) { 4722 pte2 = pte2_load(pte2p); 4723 if (!pte2_is_valid(pte2)) 4724 continue; 4725 if (pmap_remove_pte2(pmap, pte2p, cur, &free)) 4726 break; 4727 } 4728 sched_unpin(); 4729 } 4730 vm_page_free_pages_toq(&free, false); 4731 } 4732 if ((m->oflags & VPO_UNMANAGED) == 0) { 4733 /* 4734 * Abort this mapping if its PV entry could not be created. 4735 */ 4736 if (!pmap_pv_insert_pte1(pmap, va, pte1, flags)) { 4737 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4738 __func__, va, pmap); 4739 return (KERN_RESOURCE_SHORTAGE); 4740 } 4741 if ((pte1 & PTE1_RO) == 0) { 4742 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4743 vm_page_aflag_set(mt, PGA_WRITEABLE); 4744 } 4745 } 4746 4747 /* 4748 * Increment counters. 4749 */ 4750 if (pte1_is_wired(pte1)) 4751 pmap->pm_stats.wired_count += PTE1_SIZE / PAGE_SIZE; 4752 pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; 4753 4754 /* 4755 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4756 * is set. QQQ: For more info, see comments in pmap_enter(). 4757 */ 4758 if ((pte1 & PTE1_NX) == 0 && m->md.pat_mode == VM_MEMATTR_WB_WA && 4759 pmap != kernel_pmap && (!pte1_is_section(opte1) || 4760 pte1_pa(opte1) != VM_PAGE_TO_PHYS(m) || (opte1 & PTE2_NX) != 0)) 4761 cache_icache_sync_fresh(va, VM_PAGE_TO_PHYS(m), PTE1_SIZE); 4762 4763 /* 4764 * Map the section. 4765 */ 4766 pte1_store(pte1p, pte1); 4767 4768 pmap_pte1_mappings++; 4769 CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, 4770 pmap); 4771 return (KERN_SUCCESS); 4772 } 4773 4774 /* 4775 * Maps a sequence of resident pages belonging to the same object. 4776 * The sequence begins with the given page m_start. This page is 4777 * mapped at the given virtual address start. Each subsequent page is 4778 * mapped at a virtual address that is offset from start by the same 4779 * amount as the page is offset from m_start within the object. The 4780 * last page in the sequence is the page with the largest offset from 4781 * m_start that can be mapped at a virtual address less than the given 4782 * virtual address end. Not every virtual page between start and end 4783 * is mapped; only those for which a resident page exists with the 4784 * corresponding offset from m_start are mapped. 
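 *
 * Runs that are 1MB aligned and 1MB long, and that are backed by a
 * fully populated superpage (m->psind == 1), are entered as sections
 * via pmap_enter_1mpage() when sp_enabled; everything else goes
 * through pmap_enter_quick_locked() one page at a time.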
4785 */ 4786 void 4787 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4788 vm_page_t m_start, vm_prot_t prot) 4789 { 4790 vm_offset_t va; 4791 vm_page_t m, mpt2pg; 4792 vm_pindex_t diff, psize; 4793 4794 PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", 4795 __func__, pmap, start, end, m_start, prot)); 4796 4797 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4798 psize = atop(end - start); 4799 mpt2pg = NULL; 4800 m = m_start; 4801 rw_wlock(&pvh_global_lock); 4802 PMAP_LOCK(pmap); 4803 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4804 va = start + ptoa(diff); 4805 if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && 4806 m->psind == 1 && sp_enabled && 4807 pmap_enter_1mpage(pmap, va, m, prot)) 4808 m = &m[PTE1_SIZE / PAGE_SIZE - 1]; 4809 else 4810 mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, 4811 mpt2pg); 4812 m = TAILQ_NEXT(m, listq); 4813 } 4814 rw_wunlock(&pvh_global_lock); 4815 PMAP_UNLOCK(pmap); 4816 } 4817 4818 /* 4819 * This code maps large physical mmap regions into the 4820 * processor address space. Note that some shortcuts 4821 * are taken, but the code works. 4822 */ 4823 void 4824 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4825 vm_pindex_t pindex, vm_size_t size) 4826 { 4827 pt1_entry_t *pte1p; 4828 vm_paddr_t pa, pte2_pa; 4829 vm_page_t p; 4830 vm_memattr_t pat_mode; 4831 u_int l1attr, l1prot; 4832 4833 VM_OBJECT_ASSERT_WLOCKED(object); 4834 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4835 ("%s: non-device object", __func__)); 4836 if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { 4837 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4838 return; 4839 p = vm_page_lookup(object, pindex); 4840 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4841 ("%s: invalid page %p", __func__, p)); 4842 pat_mode = p->md.pat_mode; 4843 4844 /* 4845 * Abort the mapping if the first page is not physically 4846 * aligned to a 1MB page boundary. 4847 */ 4848 pte2_pa = VM_PAGE_TO_PHYS(p); 4849 if (pte2_pa & PTE1_OFFSET) 4850 return; 4851 4852 /* 4853 * Skip the first page. Abort the mapping if the rest of 4854 * the pages are not physically contiguous or have differing 4855 * memory attributes. 4856 */ 4857 p = TAILQ_NEXT(p, listq); 4858 for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; 4859 pa += PAGE_SIZE) { 4860 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4861 ("%s: invalid page %p", __func__, p)); 4862 if (pa != VM_PAGE_TO_PHYS(p) || 4863 pat_mode != p->md.pat_mode) 4864 return; 4865 p = TAILQ_NEXT(p, listq); 4866 } 4867 4868 /* 4869 * Map using 1MB pages. 4870 * 4871 * QQQ: Well, we are mapping a section, so same condition must 4872 * be hold like during promotion. It looks that only RW mapping 4873 * is done here, so readonly mapping must be done elsewhere. 4874 */ 4875 l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; 4876 l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); 4877 PMAP_LOCK(pmap); 4878 for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { 4879 pte1p = pmap_pte1(pmap, addr); 4880 if (!pte1_is_valid(pte1_load(pte1p))) { 4881 pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); 4882 pmap->pm_stats.resident_count += PTE1_SIZE / 4883 PAGE_SIZE; 4884 pmap_pte1_mappings++; 4885 } 4886 /* Else continue on if the PTE1 is already valid. */ 4887 addr += PTE1_SIZE; 4888 } 4889 PMAP_UNLOCK(pmap); 4890 } 4891 } 4892 4893 /* 4894 * Do the things to protect a 1mpage in a process. 
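 *
 * Write permission is removed by setting PTE1_RO and PTE1_NM together
 * (after any accumulated dirty state has been transferred to the
 * underlying vm_page_t's), execute permission by setting PTE1_NX, and
 * a single TLB flush of the section address finishes the job.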
4895 */ 4896 static void 4897 pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 4898 vm_prot_t prot) 4899 { 4900 pt1_entry_t npte1, opte1; 4901 vm_offset_t eva, va; 4902 vm_page_t m; 4903 4904 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4905 KASSERT((sva & PTE1_OFFSET) == 0, 4906 ("%s: sva is not 1mpage aligned", __func__)); 4907 4908 opte1 = npte1 = pte1_load(pte1p); 4909 if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) { 4910 eva = sva + PTE1_SIZE; 4911 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 4912 va < eva; va += PAGE_SIZE, m++) 4913 vm_page_dirty(m); 4914 } 4915 if ((prot & VM_PROT_WRITE) == 0) 4916 npte1 |= PTE1_RO | PTE1_NM; 4917 if ((prot & VM_PROT_EXECUTE) == 0) 4918 npte1 |= PTE1_NX; 4919 4920 /* 4921 * QQQ: Herein, execute permission is never set. 4922 * It only can be cleared. So, no icache 4923 * syncing is needed. 4924 */ 4925 4926 if (npte1 != opte1) { 4927 pte1_store(pte1p, npte1); 4928 pmap_tlb_flush(pmap, sva); 4929 } 4930 } 4931 4932 /* 4933 * Set the physical protection on the 4934 * specified range of this map as requested. 4935 */ 4936 void 4937 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4938 { 4939 boolean_t pv_lists_locked; 4940 vm_offset_t nextva; 4941 pt1_entry_t *pte1p, pte1; 4942 pt2_entry_t *pte2p, opte2, npte2; 4943 4944 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4945 if (prot == VM_PROT_NONE) { 4946 pmap_remove(pmap, sva, eva); 4947 return; 4948 } 4949 4950 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 4951 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 4952 return; 4953 4954 if (pmap_is_current(pmap)) 4955 pv_lists_locked = FALSE; 4956 else { 4957 pv_lists_locked = TRUE; 4958 resume: 4959 rw_wlock(&pvh_global_lock); 4960 sched_pin(); 4961 } 4962 4963 PMAP_LOCK(pmap); 4964 for (; sva < eva; sva = nextva) { 4965 /* 4966 * Calculate address for next L2 page table. 4967 */ 4968 nextva = pte1_trunc(sva + PTE1_SIZE); 4969 if (nextva < sva) 4970 nextva = eva; 4971 4972 pte1p = pmap_pte1(pmap, sva); 4973 pte1 = pte1_load(pte1p); 4974 4975 /* 4976 * Weed out invalid mappings. Note: we assume that L1 page 4977 * page table is always allocated, and in kernel virtual. 4978 */ 4979 if (pte1 == 0) 4980 continue; 4981 4982 if (pte1_is_section(pte1)) { 4983 /* 4984 * Are we protecting the entire large page? If not, 4985 * demote the mapping and fall through. 4986 */ 4987 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4988 pmap_protect_pte1(pmap, pte1p, sva, prot); 4989 continue; 4990 } else { 4991 if (!pv_lists_locked) { 4992 pv_lists_locked = TRUE; 4993 if (!rw_try_wlock(&pvh_global_lock)) { 4994 PMAP_UNLOCK(pmap); 4995 goto resume; 4996 } 4997 sched_pin(); 4998 } 4999 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5000 /* 5001 * The large page mapping 5002 * was destroyed. 5003 */ 5004 continue; 5005 } 5006 #ifdef INVARIANTS 5007 else { 5008 /* Update pte1 after demotion */ 5009 pte1 = pte1_load(pte1p); 5010 } 5011 #endif 5012 } 5013 } 5014 5015 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 5016 " is not link", __func__, pmap, sva, pte1, pte1p)); 5017 5018 /* 5019 * Limit our scan to either the end of the va represented 5020 * by the current L2 page table page, or to the end of the 5021 * range being protected. 
5022 */ 5023 if (nextva > eva) 5024 nextva = eva; 5025 5026 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5027 sva += PAGE_SIZE) { 5028 vm_page_t m; 5029 5030 opte2 = npte2 = pte2_load(pte2p); 5031 if (!pte2_is_valid(opte2)) 5032 continue; 5033 5034 if ((prot & VM_PROT_WRITE) == 0) { 5035 if (pte2_is_managed(opte2) && 5036 pte2_is_dirty(opte2)) { 5037 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 5038 vm_page_dirty(m); 5039 } 5040 npte2 |= PTE2_RO | PTE2_NM; 5041 } 5042 5043 if ((prot & VM_PROT_EXECUTE) == 0) 5044 npte2 |= PTE2_NX; 5045 5046 /* 5047 * QQQ: Herein, execute permission is never set. 5048 * It only can be cleared. So, no icache 5049 * syncing is needed. 5050 */ 5051 5052 if (npte2 != opte2) { 5053 pte2_store(pte2p, npte2); 5054 pmap_tlb_flush(pmap, sva); 5055 } 5056 } 5057 } 5058 if (pv_lists_locked) { 5059 sched_unpin(); 5060 rw_wunlock(&pvh_global_lock); 5061 } 5062 PMAP_UNLOCK(pmap); 5063 } 5064 5065 /* 5066 * pmap_pvh_wired_mappings: 5067 * 5068 * Return the updated number "count" of managed mappings that are wired. 5069 */ 5070 static int 5071 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 5072 { 5073 pmap_t pmap; 5074 pt1_entry_t pte1; 5075 pt2_entry_t pte2; 5076 pv_entry_t pv; 5077 5078 rw_assert(&pvh_global_lock, RA_WLOCKED); 5079 sched_pin(); 5080 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5081 pmap = PV_PMAP(pv); 5082 PMAP_LOCK(pmap); 5083 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5084 if (pte1_is_section(pte1)) { 5085 if (pte1_is_wired(pte1)) 5086 count++; 5087 } else { 5088 KASSERT(pte1_is_link(pte1), 5089 ("%s: pte1 %#x is not link", __func__, pte1)); 5090 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5091 if (pte2_is_wired(pte2)) 5092 count++; 5093 } 5094 PMAP_UNLOCK(pmap); 5095 } 5096 sched_unpin(); 5097 return (count); 5098 } 5099 5100 /* 5101 * pmap_page_wired_mappings: 5102 * 5103 * Return the number of managed mappings to the given physical page 5104 * that are wired. 5105 */ 5106 int 5107 pmap_page_wired_mappings(vm_page_t m) 5108 { 5109 int count; 5110 5111 count = 0; 5112 if ((m->oflags & VPO_UNMANAGED) != 0) 5113 return (count); 5114 rw_wlock(&pvh_global_lock); 5115 count = pmap_pvh_wired_mappings(&m->md, count); 5116 if ((m->flags & PG_FICTITIOUS) == 0) { 5117 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 5118 count); 5119 } 5120 rw_wunlock(&pvh_global_lock); 5121 return (count); 5122 } 5123 5124 /* 5125 * Returns TRUE if any of the given mappings were used to modify 5126 * physical memory. Otherwise, returns FALSE. Both page and 1mpage 5127 * mappings are supported. 5128 */ 5129 static boolean_t 5130 pmap_is_modified_pvh(struct md_page *pvh) 5131 { 5132 pv_entry_t pv; 5133 pt1_entry_t pte1; 5134 pt2_entry_t pte2; 5135 pmap_t pmap; 5136 boolean_t rv; 5137 5138 rw_assert(&pvh_global_lock, RA_WLOCKED); 5139 rv = FALSE; 5140 sched_pin(); 5141 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5142 pmap = PV_PMAP(pv); 5143 PMAP_LOCK(pmap); 5144 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5145 if (pte1_is_section(pte1)) { 5146 rv = pte1_is_dirty(pte1); 5147 } else { 5148 KASSERT(pte1_is_link(pte1), 5149 ("%s: pte1 %#x is not link", __func__, pte1)); 5150 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5151 rv = pte2_is_dirty(pte2); 5152 } 5153 PMAP_UNLOCK(pmap); 5154 if (rv) 5155 break; 5156 } 5157 sched_unpin(); 5158 return (rv); 5159 } 5160 5161 /* 5162 * pmap_is_modified: 5163 * 5164 * Return whether or not the specified physical page was modified 5165 * in any physical maps. 
5166 */ 5167 boolean_t 5168 pmap_is_modified(vm_page_t m) 5169 { 5170 boolean_t rv; 5171 5172 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5173 ("%s: page %p is not managed", __func__, m)); 5174 5175 /* 5176 * If the page is not busied then this check is racy. 5177 */ 5178 if (!pmap_page_is_write_mapped(m)) 5179 return (FALSE); 5180 rw_wlock(&pvh_global_lock); 5181 rv = pmap_is_modified_pvh(&m->md) || 5182 ((m->flags & PG_FICTITIOUS) == 0 && 5183 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5184 rw_wunlock(&pvh_global_lock); 5185 return (rv); 5186 } 5187 5188 /* 5189 * pmap_is_prefaultable: 5190 * 5191 * Return whether or not the specified virtual address is eligible 5192 * for prefault. 5193 */ 5194 boolean_t 5195 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5196 { 5197 pt1_entry_t pte1; 5198 pt2_entry_t pte2; 5199 boolean_t rv; 5200 5201 rv = FALSE; 5202 PMAP_LOCK(pmap); 5203 pte1 = pte1_load(pmap_pte1(pmap, addr)); 5204 if (pte1_is_link(pte1)) { 5205 pte2 = pte2_load(pt2map_entry(addr)); 5206 rv = !pte2_is_valid(pte2) ; 5207 } 5208 PMAP_UNLOCK(pmap); 5209 return (rv); 5210 } 5211 5212 /* 5213 * Returns TRUE if any of the given mappings were referenced and FALSE 5214 * otherwise. Both page and 1mpage mappings are supported. 5215 */ 5216 static boolean_t 5217 pmap_is_referenced_pvh(struct md_page *pvh) 5218 { 5219 5220 pv_entry_t pv; 5221 pt1_entry_t pte1; 5222 pt2_entry_t pte2; 5223 pmap_t pmap; 5224 boolean_t rv; 5225 5226 rw_assert(&pvh_global_lock, RA_WLOCKED); 5227 rv = FALSE; 5228 sched_pin(); 5229 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5230 pmap = PV_PMAP(pv); 5231 PMAP_LOCK(pmap); 5232 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5233 if (pte1_is_section(pte1)) { 5234 rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); 5235 } else { 5236 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5237 rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); 5238 } 5239 PMAP_UNLOCK(pmap); 5240 if (rv) 5241 break; 5242 } 5243 sched_unpin(); 5244 return (rv); 5245 } 5246 5247 /* 5248 * pmap_is_referenced: 5249 * 5250 * Return whether or not the specified physical page was referenced 5251 * in any physical maps. 5252 */ 5253 boolean_t 5254 pmap_is_referenced(vm_page_t m) 5255 { 5256 boolean_t rv; 5257 5258 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5259 ("%s: page %p is not managed", __func__, m)); 5260 rw_wlock(&pvh_global_lock); 5261 rv = pmap_is_referenced_pvh(&m->md) || 5262 ((m->flags & PG_FICTITIOUS) == 0 && 5263 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5264 rw_wunlock(&pvh_global_lock); 5265 return (rv); 5266 } 5267 5268 /* 5269 * pmap_ts_referenced: 5270 * 5271 * Return a count of reference bits for a page, clearing those bits. 5272 * It is not necessary for every reference bit to be cleared, but it 5273 * is necessary that 0 only be returned when there are truly no 5274 * reference bits set. 5275 * 5276 * As an optimization, update the page's dirty field if a modified bit is 5277 * found while counting reference bits. This opportunistic update can be 5278 * performed at low cost and can eliminate the need for some future calls 5279 * to pmap_is_modified(). However, since this function stops after 5280 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5281 * dirty pages. Those dirty pages will only be detected by a future call 5282 * to pmap_is_modified(). 
5283 */ 5284 int 5285 pmap_ts_referenced(vm_page_t m) 5286 { 5287 struct md_page *pvh; 5288 pv_entry_t pv, pvf; 5289 pmap_t pmap; 5290 pt1_entry_t *pte1p, opte1; 5291 pt2_entry_t *pte2p, opte2; 5292 vm_paddr_t pa; 5293 int rtval = 0; 5294 5295 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5296 ("%s: page %p is not managed", __func__, m)); 5297 pa = VM_PAGE_TO_PHYS(m); 5298 pvh = pa_to_pvh(pa); 5299 rw_wlock(&pvh_global_lock); 5300 sched_pin(); 5301 if ((m->flags & PG_FICTITIOUS) != 0 || 5302 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5303 goto small_mappings; 5304 pv = pvf; 5305 do { 5306 pmap = PV_PMAP(pv); 5307 PMAP_LOCK(pmap); 5308 pte1p = pmap_pte1(pmap, pv->pv_va); 5309 opte1 = pte1_load(pte1p); 5310 if (pte1_is_dirty(opte1)) { 5311 /* 5312 * Although "opte1" is mapping a 1MB page, because 5313 * this function is called at a 4KB page granularity, 5314 * we only update the 4KB page under test. 5315 */ 5316 vm_page_dirty(m); 5317 } 5318 if ((opte1 & PTE1_A) != 0) { 5319 /* 5320 * Since this reference bit is shared by 256 4KB pages, 5321 * it should not be cleared every time it is tested. 5322 * Apply a simple "hash" function on the physical page 5323 * number, the virtual section number, and the pmap 5324 * address to select one 4KB page out of the 256 5325 * on which testing the reference bit will result 5326 * in clearing that bit. This function is designed 5327 * to avoid the selection of the same 4KB page 5328 * for every 1MB page mapping. 5329 * 5330 * On demotion, a mapping that hasn't been referenced 5331 * is simply destroyed. To avoid the possibility of a 5332 * subsequent page fault on a demoted wired mapping, 5333 * always leave its reference bit set. Moreover, 5334 * since the section is wired, the current state of 5335 * its reference bit won't affect page replacement. 5336 */ 5337 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ 5338 (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && 5339 !pte1_is_wired(opte1)) { 5340 pte1_clear_bit(pte1p, PTE1_A); 5341 pmap_tlb_flush(pmap, pv->pv_va); 5342 } 5343 rtval++; 5344 } 5345 PMAP_UNLOCK(pmap); 5346 /* Rotate the PV list if it has more than one entry. */ 5347 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5348 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5349 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5350 } 5351 if (rtval >= PMAP_TS_REFERENCED_MAX) 5352 goto out; 5353 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5354 small_mappings: 5355 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5356 goto out; 5357 pv = pvf; 5358 do { 5359 pmap = PV_PMAP(pv); 5360 PMAP_LOCK(pmap); 5361 pte1p = pmap_pte1(pmap, pv->pv_va); 5362 KASSERT(pte1_is_link(pte1_load(pte1p)), 5363 ("%s: not found a link in page %p's pv list", __func__, m)); 5364 5365 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5366 opte2 = pte2_load(pte2p); 5367 if (pte2_is_dirty(opte2)) 5368 vm_page_dirty(m); 5369 if ((opte2 & PTE2_A) != 0) { 5370 pte2_clear_bit(pte2p, PTE2_A); 5371 pmap_tlb_flush(pmap, pv->pv_va); 5372 rtval++; 5373 } 5374 PMAP_UNLOCK(pmap); 5375 /* Rotate the PV list if it has more than one entry. */ 5376 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5377 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5378 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5379 } 5380 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5381 PMAP_TS_REFERENCED_MAX); 5382 out: 5383 sched_unpin(); 5384 rw_wunlock(&pvh_global_lock); 5385 return (rtval); 5386 } 5387 5388 /* 5389 * Clear the wired attribute from the mappings for the specified range of 5390 * addresses in the given pmap. 
Every valid mapping within that range 5391 * must have the wired attribute set. In contrast, invalid mappings 5392 * cannot have the wired attribute set, so they are ignored. 5393 * 5394 * The wired attribute of the page table entry is not a hardware feature, 5395 * so there is no need to invalidate any TLB entries. 5396 */ 5397 void 5398 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5399 { 5400 vm_offset_t nextva; 5401 pt1_entry_t *pte1p, pte1; 5402 pt2_entry_t *pte2p, pte2; 5403 boolean_t pv_lists_locked; 5404 5405 if (pmap_is_current(pmap)) 5406 pv_lists_locked = FALSE; 5407 else { 5408 pv_lists_locked = TRUE; 5409 resume: 5410 rw_wlock(&pvh_global_lock); 5411 sched_pin(); 5412 } 5413 PMAP_LOCK(pmap); 5414 for (; sva < eva; sva = nextva) { 5415 nextva = pte1_trunc(sva + PTE1_SIZE); 5416 if (nextva < sva) 5417 nextva = eva; 5418 5419 pte1p = pmap_pte1(pmap, sva); 5420 pte1 = pte1_load(pte1p); 5421 5422 /* 5423 * Weed out invalid mappings. Note: we assume that L1 page 5424 * page table is always allocated, and in kernel virtual. 5425 */ 5426 if (pte1 == 0) 5427 continue; 5428 5429 if (pte1_is_section(pte1)) { 5430 if (!pte1_is_wired(pte1)) 5431 panic("%s: pte1 %#x not wired", __func__, pte1); 5432 5433 /* 5434 * Are we unwiring the entire large page? If not, 5435 * demote the mapping and fall through. 5436 */ 5437 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 5438 pte1_clear_bit(pte1p, PTE1_W); 5439 pmap->pm_stats.wired_count -= PTE1_SIZE / 5440 PAGE_SIZE; 5441 continue; 5442 } else { 5443 if (!pv_lists_locked) { 5444 pv_lists_locked = TRUE; 5445 if (!rw_try_wlock(&pvh_global_lock)) { 5446 PMAP_UNLOCK(pmap); 5447 /* Repeat sva. */ 5448 goto resume; 5449 } 5450 sched_pin(); 5451 } 5452 if (!pmap_demote_pte1(pmap, pte1p, sva)) 5453 panic("%s: demotion failed", __func__); 5454 #ifdef INVARIANTS 5455 else { 5456 /* Update pte1 after demotion */ 5457 pte1 = pte1_load(pte1p); 5458 } 5459 #endif 5460 } 5461 } 5462 5463 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 5464 " is not link", __func__, pmap, sva, pte1, pte1p)); 5465 5466 /* 5467 * Limit our scan to either the end of the va represented 5468 * by the current L2 page table page, or to the end of the 5469 * range being protected. 5470 */ 5471 if (nextva > eva) 5472 nextva = eva; 5473 5474 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5475 sva += PAGE_SIZE) { 5476 pte2 = pte2_load(pte2p); 5477 if (!pte2_is_valid(pte2)) 5478 continue; 5479 if (!pte2_is_wired(pte2)) 5480 panic("%s: pte2 %#x is missing PTE2_W", 5481 __func__, pte2); 5482 5483 /* 5484 * PTE2_W must be cleared atomically. Although the pmap 5485 * lock synchronizes access to PTE2_W, another processor 5486 * could be changing PTE2_NM and/or PTE2_A concurrently. 5487 */ 5488 pte2_clear_bit(pte2p, PTE2_W); 5489 pmap->pm_stats.wired_count--; 5490 } 5491 } 5492 if (pv_lists_locked) { 5493 sched_unpin(); 5494 rw_wunlock(&pvh_global_lock); 5495 } 5496 PMAP_UNLOCK(pmap); 5497 } 5498 5499 /* 5500 * Clear the write and modified bits in each of the given page's mappings. 
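 *
 * Section mappings of the page are demoted first; each remaining 4KB
 * mapping is then made read-only by setting PTE2_RO and PTE2_NM, any
 * dirty state is transferred to the page with vm_page_dirty(), and
 * PGA_WRITEABLE is cleared once no writeable mapping is left.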
5501 */ 5502 void 5503 pmap_remove_write(vm_page_t m) 5504 { 5505 struct md_page *pvh; 5506 pv_entry_t next_pv, pv; 5507 pmap_t pmap; 5508 pt1_entry_t *pte1p; 5509 pt2_entry_t *pte2p, opte2; 5510 vm_offset_t va; 5511 5512 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5513 ("%s: page %p is not managed", __func__, m)); 5514 vm_page_assert_busied(m); 5515 5516 if (!pmap_page_is_write_mapped(m)) 5517 return; 5518 rw_wlock(&pvh_global_lock); 5519 sched_pin(); 5520 if ((m->flags & PG_FICTITIOUS) != 0) 5521 goto small_mappings; 5522 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5523 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5524 va = pv->pv_va; 5525 pmap = PV_PMAP(pv); 5526 PMAP_LOCK(pmap); 5527 pte1p = pmap_pte1(pmap, va); 5528 if (!(pte1_load(pte1p) & PTE1_RO)) 5529 (void)pmap_demote_pte1(pmap, pte1p, va); 5530 PMAP_UNLOCK(pmap); 5531 } 5532 small_mappings: 5533 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5534 pmap = PV_PMAP(pv); 5535 PMAP_LOCK(pmap); 5536 pte1p = pmap_pte1(pmap, pv->pv_va); 5537 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5538 " a section in page %p's pv list", __func__, m)); 5539 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5540 opte2 = pte2_load(pte2p); 5541 if (!(opte2 & PTE2_RO)) { 5542 pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); 5543 if (pte2_is_dirty(opte2)) 5544 vm_page_dirty(m); 5545 pmap_tlb_flush(pmap, pv->pv_va); 5546 } 5547 PMAP_UNLOCK(pmap); 5548 } 5549 vm_page_aflag_clear(m, PGA_WRITEABLE); 5550 sched_unpin(); 5551 rw_wunlock(&pvh_global_lock); 5552 } 5553 5554 /* 5555 * Apply the given advice to the specified range of addresses within the 5556 * given pmap. Depending on the advice, clear the referenced and/or 5557 * modified flags in each mapping and set the mapped page's dirty field. 5558 */ 5559 void 5560 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5561 { 5562 pt1_entry_t *pte1p, opte1; 5563 pt2_entry_t *pte2p, pte2; 5564 vm_offset_t pdnxt; 5565 vm_page_t m; 5566 boolean_t pv_lists_locked; 5567 5568 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5569 return; 5570 if (pmap_is_current(pmap)) 5571 pv_lists_locked = FALSE; 5572 else { 5573 pv_lists_locked = TRUE; 5574 resume: 5575 rw_wlock(&pvh_global_lock); 5576 sched_pin(); 5577 } 5578 PMAP_LOCK(pmap); 5579 for (; sva < eva; sva = pdnxt) { 5580 pdnxt = pte1_trunc(sva + PTE1_SIZE); 5581 if (pdnxt < sva) 5582 pdnxt = eva; 5583 pte1p = pmap_pte1(pmap, sva); 5584 opte1 = pte1_load(pte1p); 5585 if (!pte1_is_valid(opte1)) /* XXX */ 5586 continue; 5587 else if (pte1_is_section(opte1)) { 5588 if (!pte1_is_managed(opte1)) 5589 continue; 5590 if (!pv_lists_locked) { 5591 pv_lists_locked = TRUE; 5592 if (!rw_try_wlock(&pvh_global_lock)) { 5593 PMAP_UNLOCK(pmap); 5594 goto resume; 5595 } 5596 sched_pin(); 5597 } 5598 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5599 /* 5600 * The large page mapping was destroyed. 5601 */ 5602 continue; 5603 } 5604 5605 /* 5606 * Unless the page mappings are wired, remove the 5607 * mapping to a single page so that a subsequent 5608 * access may repromote. Since the underlying L2 page 5609 * table is fully populated, this removal never 5610 * frees a L2 page table page. 
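 * (The demotion just performed filled the L2 page table completely,
 * so removing a single 4KB entry cannot drop its wire count to zero;
 * that is why a NULL free list can be passed to pmap_remove_pte2()
 * below.)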
5611 */ 5612 if (!pte1_is_wired(opte1)) { 5613 pte2p = pmap_pte2_quick(pmap, sva); 5614 KASSERT(pte2_is_valid(pte2_load(pte2p)), 5615 ("%s: invalid PTE2", __func__)); 5616 pmap_remove_pte2(pmap, pte2p, sva, NULL); 5617 } 5618 } 5619 if (pdnxt > eva) 5620 pdnxt = eva; 5621 for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, 5622 sva += PAGE_SIZE) { 5623 pte2 = pte2_load(pte2p); 5624 if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) 5625 continue; 5626 else if (pte2_is_dirty(pte2)) { 5627 if (advice == MADV_DONTNEED) { 5628 /* 5629 * Future calls to pmap_is_modified() 5630 * can be avoided by making the page 5631 * dirty now. 5632 */ 5633 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 5634 vm_page_dirty(m); 5635 } 5636 pte2_set_bit(pte2p, PTE2_NM); 5637 pte2_clear_bit(pte2p, PTE2_A); 5638 } else if ((pte2 & PTE2_A) != 0) 5639 pte2_clear_bit(pte2p, PTE2_A); 5640 else 5641 continue; 5642 pmap_tlb_flush(pmap, sva); 5643 } 5644 } 5645 if (pv_lists_locked) { 5646 sched_unpin(); 5647 rw_wunlock(&pvh_global_lock); 5648 } 5649 PMAP_UNLOCK(pmap); 5650 } 5651 5652 /* 5653 * Clear the modify bits on the specified physical page. 5654 */ 5655 void 5656 pmap_clear_modify(vm_page_t m) 5657 { 5658 struct md_page *pvh; 5659 pv_entry_t next_pv, pv; 5660 pmap_t pmap; 5661 pt1_entry_t *pte1p, opte1; 5662 pt2_entry_t *pte2p, opte2; 5663 vm_offset_t va; 5664 5665 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5666 ("%s: page %p is not managed", __func__, m)); 5667 vm_page_assert_busied(m); 5668 5669 if (!pmap_page_is_write_mapped(m)) 5670 return; 5671 rw_wlock(&pvh_global_lock); 5672 sched_pin(); 5673 if ((m->flags & PG_FICTITIOUS) != 0) 5674 goto small_mappings; 5675 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5676 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5677 va = pv->pv_va; 5678 pmap = PV_PMAP(pv); 5679 PMAP_LOCK(pmap); 5680 pte1p = pmap_pte1(pmap, va); 5681 opte1 = pte1_load(pte1p); 5682 if (!(opte1 & PTE1_RO)) { 5683 if (pmap_demote_pte1(pmap, pte1p, va) && 5684 !pte1_is_wired(opte1)) { 5685 /* 5686 * Write protect the mapping to a 5687 * single page so that a subsequent 5688 * write access may repromote. 5689 */ 5690 va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); 5691 pte2p = pmap_pte2_quick(pmap, va); 5692 opte2 = pte2_load(pte2p); 5693 if ((opte2 & PTE2_V)) { 5694 pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); 5695 vm_page_dirty(m); 5696 pmap_tlb_flush(pmap, va); 5697 } 5698 } 5699 } 5700 PMAP_UNLOCK(pmap); 5701 } 5702 small_mappings: 5703 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5704 pmap = PV_PMAP(pv); 5705 PMAP_LOCK(pmap); 5706 pte1p = pmap_pte1(pmap, pv->pv_va); 5707 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5708 " a section in page %p's pv list", __func__, m)); 5709 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5710 if (pte2_is_dirty(pte2_load(pte2p))) { 5711 pte2_set_bit(pte2p, PTE2_NM); 5712 pmap_tlb_flush(pmap, pv->pv_va); 5713 } 5714 PMAP_UNLOCK(pmap); 5715 } 5716 sched_unpin(); 5717 rw_wunlock(&pvh_global_lock); 5718 } 5719 5720 /* 5721 * Sets the memory attribute for the specified page. 5722 */ 5723 void 5724 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5725 { 5726 pt2_entry_t *cmap2_pte2p; 5727 vm_memattr_t oma; 5728 vm_paddr_t pa; 5729 struct pcpu *pc; 5730 5731 oma = m->md.pat_mode; 5732 m->md.pat_mode = ma; 5733 5734 CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, 5735 VM_PAGE_TO_PHYS(m), oma, ma); 5736 if ((m->flags & PG_FICTITIOUS) != 0) 5737 return; 5738 #if 0 5739 /* 5740 * If "m" is a normal page, flush it from the cache. 
5741 * 5742 * First, try to find an existing mapping of the page by sf 5743 * buffer. sf_buf_invalidate_cache() modifies mapping and 5744 * flushes the cache. 5745 */ 5746 if (sf_buf_invalidate_cache(m, oma)) 5747 return; 5748 #endif 5749 /* 5750 * If page is not mapped by sf buffer, map the page 5751 * transient and do invalidation. 5752 */ 5753 if (ma != oma) { 5754 pa = VM_PAGE_TO_PHYS(m); 5755 sched_pin(); 5756 pc = get_pcpu(); 5757 cmap2_pte2p = pc->pc_cmap2_pte2p; 5758 mtx_lock(&pc->pc_cmap_lock); 5759 if (pte2_load(cmap2_pte2p) != 0) 5760 panic("%s: CMAP2 busy", __func__); 5761 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 5762 vm_memattr_to_pte2(ma))); 5763 dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); 5764 pte2_clear(cmap2_pte2p); 5765 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5766 sched_unpin(); 5767 mtx_unlock(&pc->pc_cmap_lock); 5768 } 5769 } 5770 5771 /* 5772 * Miscellaneous support routines follow 5773 */ 5774 5775 /* 5776 * Returns TRUE if the given page is mapped individually or as part of 5777 * a 1mpage. Otherwise, returns FALSE. 5778 */ 5779 boolean_t 5780 pmap_page_is_mapped(vm_page_t m) 5781 { 5782 boolean_t rv; 5783 5784 if ((m->oflags & VPO_UNMANAGED) != 0) 5785 return (FALSE); 5786 rw_wlock(&pvh_global_lock); 5787 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5788 ((m->flags & PG_FICTITIOUS) == 0 && 5789 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5790 rw_wunlock(&pvh_global_lock); 5791 return (rv); 5792 } 5793 5794 /* 5795 * Returns true if the pmap's pv is one of the first 5796 * 16 pvs linked to from this page. This count may 5797 * be changed upwards or downwards in the future; it 5798 * is only necessary that true be returned for a small 5799 * subset of pmaps for proper page aging. 5800 */ 5801 boolean_t 5802 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5803 { 5804 struct md_page *pvh; 5805 pv_entry_t pv; 5806 int loops = 0; 5807 boolean_t rv; 5808 5809 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5810 ("%s: page %p is not managed", __func__, m)); 5811 rv = FALSE; 5812 rw_wlock(&pvh_global_lock); 5813 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5814 if (PV_PMAP(pv) == pmap) { 5815 rv = TRUE; 5816 break; 5817 } 5818 loops++; 5819 if (loops >= 16) 5820 break; 5821 } 5822 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5823 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5824 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5825 if (PV_PMAP(pv) == pmap) { 5826 rv = TRUE; 5827 break; 5828 } 5829 loops++; 5830 if (loops >= 16) 5831 break; 5832 } 5833 } 5834 rw_wunlock(&pvh_global_lock); 5835 return (rv); 5836 } 5837 5838 /* 5839 * pmap_zero_page zeros the specified hardware page by mapping 5840 * the page into KVM and using bzero to clear its contents. 5841 */ 5842 void 5843 pmap_zero_page(vm_page_t m) 5844 { 5845 pt2_entry_t *cmap2_pte2p; 5846 struct pcpu *pc; 5847 5848 sched_pin(); 5849 pc = get_pcpu(); 5850 cmap2_pte2p = pc->pc_cmap2_pte2p; 5851 mtx_lock(&pc->pc_cmap_lock); 5852 if (pte2_load(cmap2_pte2p) != 0) 5853 panic("%s: CMAP2 busy", __func__); 5854 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5855 vm_page_pte2_attr(m))); 5856 pagezero(pc->pc_cmap2_addr); 5857 pte2_clear(cmap2_pte2p); 5858 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5859 sched_unpin(); 5860 mtx_unlock(&pc->pc_cmap_lock); 5861 } 5862 5863 /* 5864 * pmap_zero_page_area zeros the specified hardware page by mapping 5865 * the page into KVM and using bzero to clear its contents. 
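 * Like pmap_zero_page(), it borrows the per-CPU CMAP2 slot, so the
 * calling thread stays pinned to its CPU (sched_pin()) while the
 * transient mapping exists; editorial note.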
5866 * 5867 * off and size may not cover an area beyond a single hardware page. 5868 */ 5869 void 5870 pmap_zero_page_area(vm_page_t m, int off, int size) 5871 { 5872 pt2_entry_t *cmap2_pte2p; 5873 struct pcpu *pc; 5874 5875 sched_pin(); 5876 pc = get_pcpu(); 5877 cmap2_pte2p = pc->pc_cmap2_pte2p; 5878 mtx_lock(&pc->pc_cmap_lock); 5879 if (pte2_load(cmap2_pte2p) != 0) 5880 panic("%s: CMAP2 busy", __func__); 5881 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5882 vm_page_pte2_attr(m))); 5883 if (off == 0 && size == PAGE_SIZE) 5884 pagezero(pc->pc_cmap2_addr); 5885 else 5886 bzero(pc->pc_cmap2_addr + off, size); 5887 pte2_clear(cmap2_pte2p); 5888 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5889 sched_unpin(); 5890 mtx_unlock(&pc->pc_cmap_lock); 5891 } 5892 5893 /* 5894 * pmap_copy_page copies the specified (machine independent) 5895 * page by mapping the page into virtual memory and using 5896 * bcopy to copy the page, one machine dependent page at a 5897 * time. 5898 */ 5899 void 5900 pmap_copy_page(vm_page_t src, vm_page_t dst) 5901 { 5902 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5903 struct pcpu *pc; 5904 5905 sched_pin(); 5906 pc = get_pcpu(); 5907 cmap1_pte2p = pc->pc_cmap1_pte2p; 5908 cmap2_pte2p = pc->pc_cmap2_pte2p; 5909 mtx_lock(&pc->pc_cmap_lock); 5910 if (pte2_load(cmap1_pte2p) != 0) 5911 panic("%s: CMAP1 busy", __func__); 5912 if (pte2_load(cmap2_pte2p) != 0) 5913 panic("%s: CMAP2 busy", __func__); 5914 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), 5915 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); 5916 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), 5917 PTE2_AP_KRW, vm_page_pte2_attr(dst))); 5918 bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); 5919 pte2_clear(cmap1_pte2p); 5920 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5921 pte2_clear(cmap2_pte2p); 5922 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5923 sched_unpin(); 5924 mtx_unlock(&pc->pc_cmap_lock); 5925 } 5926 5927 int unmapped_buf_allowed = 1; 5928 5929 void 5930 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5931 vm_offset_t b_offset, int xfersize) 5932 { 5933 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5934 vm_page_t a_pg, b_pg; 5935 char *a_cp, *b_cp; 5936 vm_offset_t a_pg_offset, b_pg_offset; 5937 struct pcpu *pc; 5938 int cnt; 5939 5940 sched_pin(); 5941 pc = get_pcpu(); 5942 cmap1_pte2p = pc->pc_cmap1_pte2p; 5943 cmap2_pte2p = pc->pc_cmap2_pte2p; 5944 mtx_lock(&pc->pc_cmap_lock); 5945 if (pte2_load(cmap1_pte2p) != 0) 5946 panic("pmap_copy_pages: CMAP1 busy"); 5947 if (pte2_load(cmap2_pte2p) != 0) 5948 panic("pmap_copy_pages: CMAP2 busy"); 5949 while (xfersize > 0) { 5950 a_pg = ma[a_offset >> PAGE_SHIFT]; 5951 a_pg_offset = a_offset & PAGE_MASK; 5952 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5953 b_pg = mb[b_offset >> PAGE_SHIFT]; 5954 b_pg_offset = b_offset & PAGE_MASK; 5955 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5956 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), 5957 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); 5958 tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); 5959 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), 5960 PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); 5961 tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); 5962 a_cp = pc->pc_cmap1_addr + a_pg_offset; 5963 b_cp = pc->pc_cmap2_addr + b_pg_offset; 5964 bcopy(a_cp, b_cp, cnt); 5965 a_offset += cnt; 5966 b_offset += cnt; 5967 xfersize -= cnt; 5968 } 5969 pte2_clear(cmap1_pte2p); 5970 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5971 pte2_clear(cmap2_pte2p); 
5972 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5973 sched_unpin(); 5974 mtx_unlock(&pc->pc_cmap_lock); 5975 } 5976 5977 vm_offset_t 5978 pmap_quick_enter_page(vm_page_t m) 5979 { 5980 struct pcpu *pc; 5981 pt2_entry_t *pte2p; 5982 5983 critical_enter(); 5984 pc = get_pcpu(); 5985 pte2p = pc->pc_qmap_pte2p; 5986 5987 KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); 5988 5989 pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5990 vm_page_pte2_attr(m))); 5991 return (pc->pc_qmap_addr); 5992 } 5993 5994 void 5995 pmap_quick_remove_page(vm_offset_t addr) 5996 { 5997 struct pcpu *pc; 5998 pt2_entry_t *pte2p; 5999 6000 pc = get_pcpu(); 6001 pte2p = pc->pc_qmap_pte2p; 6002 6003 KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); 6004 KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); 6005 6006 pte2_clear(pte2p); 6007 tlb_flush(pc->pc_qmap_addr); 6008 critical_exit(); 6009 } 6010 6011 /* 6012 * Copy the range specified by src_addr/len 6013 * from the source map to the range dst_addr/len 6014 * in the destination map. 6015 * 6016 * This routine is only advisory and need not do anything. 6017 */ 6018 void 6019 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 6020 vm_offset_t src_addr) 6021 { 6022 struct spglist free; 6023 vm_offset_t addr; 6024 vm_offset_t end_addr = src_addr + len; 6025 vm_offset_t nextva; 6026 6027 if (dst_addr != src_addr) 6028 return; 6029 6030 if (!pmap_is_current(src_pmap)) 6031 return; 6032 6033 rw_wlock(&pvh_global_lock); 6034 if (dst_pmap < src_pmap) { 6035 PMAP_LOCK(dst_pmap); 6036 PMAP_LOCK(src_pmap); 6037 } else { 6038 PMAP_LOCK(src_pmap); 6039 PMAP_LOCK(dst_pmap); 6040 } 6041 sched_pin(); 6042 for (addr = src_addr; addr < end_addr; addr = nextva) { 6043 pt2_entry_t *src_pte2p, *dst_pte2p; 6044 vm_page_t dst_mpt2pg, src_mpt2pg; 6045 pt1_entry_t src_pte1; 6046 u_int pte1_idx; 6047 6048 KASSERT(addr < VM_MAXUSER_ADDRESS, 6049 ("%s: invalid to pmap_copy page tables", __func__)); 6050 6051 nextva = pte1_trunc(addr + PTE1_SIZE); 6052 if (nextva < addr) 6053 nextva = end_addr; 6054 6055 pte1_idx = pte1_index(addr); 6056 src_pte1 = src_pmap->pm_pt1[pte1_idx]; 6057 if (pte1_is_section(src_pte1)) { 6058 if ((addr & PTE1_OFFSET) != 0 || 6059 (addr + PTE1_SIZE) > end_addr) 6060 continue; 6061 if (dst_pmap->pm_pt1[pte1_idx] == 0 && 6062 (!pte1_is_managed(src_pte1) || 6063 pmap_pv_insert_pte1(dst_pmap, addr, src_pte1, 6064 PMAP_ENTER_NORECLAIM))) { 6065 dst_pmap->pm_pt1[pte1_idx] = src_pte1 & 6066 ~PTE1_W; 6067 dst_pmap->pm_stats.resident_count += 6068 PTE1_SIZE / PAGE_SIZE; 6069 pmap_pte1_mappings++; 6070 } 6071 continue; 6072 } else if (!pte1_is_link(src_pte1)) 6073 continue; 6074 6075 src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); 6076 6077 /* 6078 * We leave PT2s to be linked from PT1 even if they are not 6079 * referenced until all PT2s in a page are without reference. 6080 * 6081 * QQQ: It could be changed ... 
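 * (Editorial note: in other words, an L2 page table stays linked from
 * the L1 table as long as any of the L2 page tables sharing its page
 * is still referenced.)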
6082 */ 6083 #if 0 /* single_pt2_link_is_cleared */ 6084 KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, 6085 ("%s: source page table page is unused", __func__)); 6086 #else 6087 if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) 6088 continue; 6089 #endif 6090 if (nextva > end_addr) 6091 nextva = end_addr; 6092 6093 src_pte2p = pt2map_entry(addr); 6094 while (addr < nextva) { 6095 pt2_entry_t temp_pte2; 6096 temp_pte2 = pte2_load(src_pte2p); 6097 /* 6098 * we only virtual copy managed pages 6099 */ 6100 if (pte2_is_managed(temp_pte2)) { 6101 dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, 6102 PMAP_ENTER_NOSLEEP); 6103 if (dst_mpt2pg == NULL) 6104 goto out; 6105 dst_pte2p = pmap_pte2_quick(dst_pmap, addr); 6106 if (!pte2_is_valid(pte2_load(dst_pte2p)) && 6107 pmap_try_insert_pv_entry(dst_pmap, addr, 6108 PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { 6109 /* 6110 * Clear the wired, modified, and 6111 * accessed (referenced) bits 6112 * during the copy. 6113 */ 6114 temp_pte2 &= ~(PTE2_W | PTE2_A); 6115 temp_pte2 |= PTE2_NM; 6116 pte2_store(dst_pte2p, temp_pte2); 6117 dst_pmap->pm_stats.resident_count++; 6118 } else { 6119 SLIST_INIT(&free); 6120 if (pmap_unwire_pt2(dst_pmap, addr, 6121 dst_mpt2pg, &free)) { 6122 pmap_tlb_flush(dst_pmap, addr); 6123 vm_page_free_pages_toq(&free, 6124 false); 6125 } 6126 goto out; 6127 } 6128 if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= 6129 pt2_wirecount_get(src_mpt2pg, pte1_idx)) 6130 break; 6131 } 6132 addr += PAGE_SIZE; 6133 src_pte2p++; 6134 } 6135 } 6136 out: 6137 sched_unpin(); 6138 rw_wunlock(&pvh_global_lock); 6139 PMAP_UNLOCK(src_pmap); 6140 PMAP_UNLOCK(dst_pmap); 6141 } 6142 6143 /* 6144 * Increase the starting virtual address of the given mapping if a 6145 * different alignment might result in more section mappings. 6146 */ 6147 void 6148 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6149 vm_offset_t *addr, vm_size_t size) 6150 { 6151 vm_offset_t pte1_offset; 6152 6153 if (size < PTE1_SIZE) 6154 return; 6155 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6156 offset += ptoa(object->pg_color); 6157 pte1_offset = offset & PTE1_OFFSET; 6158 if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || 6159 (*addr & PTE1_OFFSET) == pte1_offset) 6160 return; 6161 if ((*addr & PTE1_OFFSET) < pte1_offset) 6162 *addr = pte1_trunc(*addr) + pte1_offset; 6163 else 6164 *addr = pte1_roundup(*addr) + pte1_offset; 6165 } 6166 6167 void 6168 pmap_activate(struct thread *td) 6169 { 6170 pmap_t pmap, oldpmap; 6171 u_int cpuid, ttb; 6172 6173 PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); 6174 6175 critical_enter(); 6176 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6177 oldpmap = PCPU_GET(curpmap); 6178 cpuid = PCPU_GET(cpuid); 6179 6180 #if defined(SMP) 6181 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6182 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6183 #else 6184 CPU_CLR(cpuid, &oldpmap->pm_active); 6185 CPU_SET(cpuid, &pmap->pm_active); 6186 #endif 6187 6188 ttb = pmap_ttb_get(pmap); 6189 6190 /* 6191 * pmap_activate is for the current thread on the current cpu 6192 */ 6193 td->td_pcb->pcb_pagedir = ttb; 6194 cp15_ttbr_set(ttb); 6195 PCPU_SET(curpmap, pmap); 6196 critical_exit(); 6197 } 6198 6199 /* 6200 * Perform the pmap work for mincore(2). If the page is not both referenced and 6201 * modified by this pmap, returns its physical address so that the caller can 6202 * find other mappings. 
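 * For a 1 MB section mapping, the returned flags also include
 * MINCORE_PSIND(1) so that mincore(2) can report the superpage size.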
6203 */ 6204 int 6205 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 6206 { 6207 pt1_entry_t *pte1p, pte1; 6208 pt2_entry_t *pte2p, pte2; 6209 vm_paddr_t pa; 6210 bool managed; 6211 int val; 6212 6213 PMAP_LOCK(pmap); 6214 pte1p = pmap_pte1(pmap, addr); 6215 pte1 = pte1_load(pte1p); 6216 if (pte1_is_section(pte1)) { 6217 pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); 6218 managed = pte1_is_managed(pte1); 6219 val = MINCORE_PSIND(1) | MINCORE_INCORE; 6220 if (pte1_is_dirty(pte1)) 6221 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6222 if (pte1 & PTE1_A) 6223 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6224 } else if (pte1_is_link(pte1)) { 6225 pte2p = pmap_pte2(pmap, addr); 6226 pte2 = pte2_load(pte2p); 6227 pmap_pte2_release(pte2p); 6228 pa = pte2_pa(pte2); 6229 managed = pte2_is_managed(pte2); 6230 val = MINCORE_INCORE; 6231 if (pte2_is_dirty(pte2)) 6232 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6233 if (pte2 & PTE2_A) 6234 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6235 } else { 6236 managed = false; 6237 val = 0; 6238 } 6239 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6240 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 6241 *pap = pa; 6242 } 6243 PMAP_UNLOCK(pmap); 6244 return (val); 6245 } 6246 6247 void 6248 pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) 6249 { 6250 vm_offset_t sva; 6251 uint32_t l2attr; 6252 6253 KASSERT((size & PAGE_MASK) == 0, 6254 ("%s: device mapping not page-sized", __func__)); 6255 6256 sva = va; 6257 l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); 6258 while (size != 0) { 6259 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); 6260 va += PAGE_SIZE; 6261 pa += PAGE_SIZE; 6262 size -= PAGE_SIZE; 6263 } 6264 tlb_flush_range(sva, va - sva); 6265 } 6266 6267 void 6268 pmap_kremove_device(vm_offset_t va, vm_size_t size) 6269 { 6270 vm_offset_t sva; 6271 6272 KASSERT((size & PAGE_MASK) == 0, 6273 ("%s: device mapping not page-sized", __func__)); 6274 6275 sva = va; 6276 while (size != 0) { 6277 pmap_kremove(va); 6278 va += PAGE_SIZE; 6279 size -= PAGE_SIZE; 6280 } 6281 tlb_flush_range(sva, va - sva); 6282 } 6283 6284 void 6285 pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) 6286 { 6287 6288 pcb->pcb_pagedir = pmap_ttb_get(pmap); 6289 } 6290 6291 /* 6292 * Clean L1 data cache range by physical address. 6293 * The range must be within a single page. 6294 */ 6295 static void 6296 pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) 6297 { 6298 pt2_entry_t *cmap2_pte2p; 6299 struct pcpu *pc; 6300 6301 KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, 6302 ("%s: not on single page", __func__)); 6303 6304 sched_pin(); 6305 pc = get_pcpu(); 6306 cmap2_pte2p = pc->pc_cmap2_pte2p; 6307 mtx_lock(&pc->pc_cmap_lock); 6308 if (pte2_load(cmap2_pte2p) != 0) 6309 panic("%s: CMAP2 busy", __func__); 6310 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); 6311 dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size); 6312 pte2_clear(cmap2_pte2p); 6313 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6314 sched_unpin(); 6315 mtx_unlock(&pc->pc_cmap_lock); 6316 } 6317 6318 /* 6319 * Sync instruction cache range which is not mapped yet. 6320 */ 6321 void 6322 cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) 6323 { 6324 uint32_t len, offset; 6325 vm_page_t m; 6326 6327 /* Write back d-cache on given address range. 
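 * The write-back must reach the point of unification (PoU) so that the
 * i-cache invalidation which follows observes the newly written data.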
*/ 6328 offset = pa & PAGE_MASK; 6329 for ( ; size != 0; size -= len, pa += len, offset = 0) { 6330 len = min(PAGE_SIZE - offset, size); 6331 m = PHYS_TO_VM_PAGE(pa); 6332 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", 6333 __func__, pa)); 6334 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); 6335 } 6336 /* 6337 * I-cache is VIPT. Only way how to flush all virtual mappings 6338 * on given physical address is to invalidate all i-cache. 6339 */ 6340 icache_inv_all(); 6341 } 6342 6343 void 6344 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size) 6345 { 6346 6347 /* Write back d-cache on given address range. */ 6348 if (va >= VM_MIN_KERNEL_ADDRESS) { 6349 dcache_wb_pou(va, size); 6350 } else { 6351 uint32_t len, offset; 6352 vm_paddr_t pa; 6353 vm_page_t m; 6354 6355 offset = va & PAGE_MASK; 6356 for ( ; size != 0; size -= len, va += len, offset = 0) { 6357 pa = pmap_extract(pmap, va); /* offset is preserved */ 6358 len = min(PAGE_SIZE - offset, size); 6359 m = PHYS_TO_VM_PAGE(pa); 6360 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", 6361 __func__, pa)); 6362 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); 6363 } 6364 } 6365 /* 6366 * I-cache is VIPT. Only way how to flush all virtual mappings 6367 * on given physical address is to invalidate all i-cache. 6368 */ 6369 icache_inv_all(); 6370 } 6371 6372 /* 6373 * The implementation of pmap_fault() uses IN_RANGE2() macro which 6374 * depends on the fact that given range size is a power of 2. 6375 */ 6376 CTASSERT(powerof2(NB_IN_PT1)); 6377 CTASSERT(powerof2(PT2MAP_SIZE)); 6378 6379 #define IN_RANGE2(addr, start, size) \ 6380 ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) 6381 6382 /* 6383 * Handle access and R/W emulation faults. 6384 */ 6385 int 6386 pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode) 6387 { 6388 pt1_entry_t *pte1p, pte1; 6389 pt2_entry_t *pte2p, pte2; 6390 6391 if (pmap == NULL) 6392 pmap = kernel_pmap; 6393 6394 /* 6395 * In kernel, we should never get abort with FAR which is in range of 6396 * pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here 6397 * and print out a useful abort message and even get to the debugger 6398 * otherwise it likely ends with never ending loop of aborts. 6399 */ 6400 if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { 6401 /* 6402 * All L1 tables should always be mapped and present. 6403 * However, we check only current one herein. For user mode, 6404 * only permission abort from malicious user is not fatal. 6405 * And alignment abort as it may have higher priority. 6406 */ 6407 if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) { 6408 CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", 6409 __func__, pmap, pmap->pm_pt1, far); 6410 panic("%s: pm_pt1 abort", __func__); 6411 } 6412 return (KERN_INVALID_ADDRESS); 6413 } 6414 if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { 6415 /* 6416 * PT2MAP should be always mapped and present in current 6417 * L1 table. However, only existing L2 tables are mapped 6418 * in PT2MAP. For user mode, only L2 translation abort and 6419 * permission abort from malicious user is not fatal. 6420 * And alignment abort as it may have higher priority. 
6421 */ 6422 if (!usermode || (idx != FAULT_ALIGN && 6423 idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { 6424 CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", 6425 __func__, pmap, PT2MAP, far); 6426 panic("%s: PT2MAP abort", __func__); 6427 } 6428 return (KERN_INVALID_ADDRESS); 6429 } 6430 6431 /* 6432 * A pmap lock is used below for handling of access and R/W emulation 6433 * aborts. They were handled by atomic operations before, so some 6434 * analysis of the new situation is needed to answer the following question: 6435 * Is it safe to use the lock even for these aborts? 6436 * 6437 * In general, two cases can happen: 6438 * 6439 * (1) Aborts while the pmap lock is already held - this should not 6440 * happen as the pmap lock is not recursive. However, under the pmap lock only 6441 * internal kernel data should be accessed, and such data should be 6442 * mapped with the A bit set and the NM bit cleared. If a double abort happens, 6443 * then the mapping of the data which caused it must be fixed. Further, 6444 * all new mappings are always made with the A bit set, and that bit can be 6445 * cleared only on managed mappings. 6446 * 6447 * (2) Aborts while other lock(s) are held - this can already 6448 * happen. However, it makes no difference here whether it is an access or 6449 * R/W emulation abort, or some other abort. 6450 */ 6451 6452 PMAP_LOCK(pmap); 6453 #ifdef INVARIANTS 6454 pte1 = pte1_load(pmap_pte1(pmap, far)); 6455 if (pte1_is_link(pte1)) { 6456 /* 6457 * Check in advance that the associated L2 page table is mapped into 6458 * PT2MAP space. Note that a faulty access to an unmapped L2 page 6459 * table is caught by the more general check above, where "far" is 6460 * verified not to lie in PT2MAP space. Note also that the 6461 * L1 page table and PT2TAB always exist and are mapped. 6462 */ 6463 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); 6464 if (!pte2_is_valid(pte2)) 6465 panic("%s: missing L2 page table (%p, %#x)", 6466 __func__, pmap, far); 6467 } 6468 #endif 6469 #ifdef SMP 6470 /* 6471 * Special treatment is needed due to the break-before-make approach used when 6472 * pte1 is updated for a userland mapping during section promotion or 6473 * demotion. If not caught here, pmap_enter() can find a section 6474 * mapping on the faulting address. That is not allowed. 6475 */ 6476 if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) { 6477 PMAP_UNLOCK(pmap); 6478 return (KERN_SUCCESS); 6479 } 6480 #endif 6481 /* 6482 * Access bits for page and section. Note that the entry 6483 * is not in the TLB yet, so a TLB flush is not necessary. 6484 * 6485 * QQQ: This is hardware emulation, we do not call userret() 6486 * for aborts from user mode. 6487 */ 6488 if (idx == FAULT_ACCESS_L2) { 6489 pte1 = pte1_load(pmap_pte1(pmap, far)); 6490 if (pte1_is_link(pte1)) { 6491 /* L2 page table should exist and be mapped. */ 6492 pte2p = pt2map_entry(far); 6493 pte2 = pte2_load(pte2p); 6494 if (pte2_is_valid(pte2)) { 6495 pte2_store(pte2p, pte2 | PTE2_A); 6496 PMAP_UNLOCK(pmap); 6497 return (KERN_SUCCESS); 6498 } 6499 } else { 6500 /* 6501 * We got an L2 access fault but PTE1 is not a link. 6502 * Probably some race happened, do nothing.
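 * (Returning KERN_SUCCESS just restarts the faulting instruction.)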
6503 */ 6504 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L2 - pmap %#x far %#x", 6505 __func__, pmap, far); 6506 PMAP_UNLOCK(pmap); 6507 return (KERN_SUCCESS); 6508 } 6509 } 6510 if (idx == FAULT_ACCESS_L1) { 6511 pte1p = pmap_pte1(pmap, far); 6512 pte1 = pte1_load(pte1p); 6513 if (pte1_is_section(pte1)) { 6514 pte1_store(pte1p, pte1 | PTE1_A); 6515 PMAP_UNLOCK(pmap); 6516 return (KERN_SUCCESS); 6517 } else { 6518 /* 6519 * We got L1 access fault but PTE1 is not section 6520 * mapping. Probably some race happened, do nothing. 6521 */ 6522 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L1 - pmap %#x far %#x", 6523 __func__, pmap, far); 6524 PMAP_UNLOCK(pmap); 6525 return (KERN_SUCCESS); 6526 } 6527 } 6528 6529 /* 6530 * Handle modify bits for page and section. Note that the modify 6531 * bit is emulated by software. So PTEx_RO is software read only 6532 * bit and PTEx_NM flag is real hardware read only bit. 6533 * 6534 * QQQ: This is hardware emulation, we do not call userret() 6535 * for aborts from user mode. 6536 */ 6537 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { 6538 pte1 = pte1_load(pmap_pte1(pmap, far)); 6539 if (pte1_is_link(pte1)) { 6540 /* L2 page table should exist and be mapped. */ 6541 pte2p = pt2map_entry(far); 6542 pte2 = pte2_load(pte2p); 6543 if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && 6544 (pte2 & PTE2_NM)) { 6545 pte2_store(pte2p, pte2 & ~PTE2_NM); 6546 tlb_flush(trunc_page(far)); 6547 PMAP_UNLOCK(pmap); 6548 return (KERN_SUCCESS); 6549 } 6550 } else { 6551 /* 6552 * We got L2 permission fault but PTE1 is not a link. 6553 * Probably some race happened, do nothing. 6554 */ 6555 CTR3(KTR_PMAP, "%s: FAULT_PERM_L2 - pmap %#x far %#x", 6556 __func__, pmap, far); 6557 PMAP_UNLOCK(pmap); 6558 return (KERN_SUCCESS); 6559 } 6560 } 6561 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { 6562 pte1p = pmap_pte1(pmap, far); 6563 pte1 = pte1_load(pte1p); 6564 if (pte1_is_section(pte1)) { 6565 if (!(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) { 6566 pte1_store(pte1p, pte1 & ~PTE1_NM); 6567 tlb_flush(pte1_trunc(far)); 6568 PMAP_UNLOCK(pmap); 6569 return (KERN_SUCCESS); 6570 } 6571 } else { 6572 /* 6573 * We got L1 permission fault but PTE1 is not section 6574 * mapping. Probably some race happened, do nothing. 6575 */ 6576 CTR3(KTR_PMAP, "%s: FAULT_PERM_L1 - pmap %#x far %#x", 6577 __func__, pmap, far); 6578 PMAP_UNLOCK(pmap); 6579 return (KERN_SUCCESS); 6580 } 6581 } 6582 6583 /* 6584 * QQQ: The previous code, mainly fast handling of access and 6585 * modify bits aborts, could be moved to ASM. Now we are 6586 * starting to deal with not fast aborts. 6587 */ 6588 PMAP_UNLOCK(pmap); 6589 return (KERN_FAILURE); 6590 } 6591 6592 #if defined(PMAP_DEBUG) 6593 /* 6594 * Reusing of KVA used in pmap_zero_page function !!! 
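 * (i.e., the per-CPU CMAP2 mapping slot is borrowed here as well)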
6595 */ 6596 static void 6597 pmap_zero_page_check(vm_page_t m) 6598 { 6599 pt2_entry_t *cmap2_pte2p; 6600 uint32_t *p, *end; 6601 struct pcpu *pc; 6602 6603 sched_pin(); 6604 pc = get_pcpu(); 6605 cmap2_pte2p = pc->pc_cmap2_pte2p; 6606 mtx_lock(&pc->pc_cmap_lock); 6607 if (pte2_load(cmap2_pte2p) != 0) 6608 panic("%s: CMAP2 busy", __func__); 6609 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6610 vm_page_pte2_attr(m))); 6611 end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); 6612 for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) 6613 if (*p != 0) 6614 panic("%s: page %p not zero, va: %p", __func__, m, 6615 pc->pc_cmap2_addr); 6616 pte2_clear(cmap2_pte2p); 6617 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6618 sched_unpin(); 6619 mtx_unlock(&pc->pc_cmap_lock); 6620 } 6621 6622 int 6623 pmap_pid_dump(int pid) 6624 { 6625 pmap_t pmap; 6626 struct proc *p; 6627 int npte2 = 0; 6628 int i, j, index; 6629 6630 sx_slock(&allproc_lock); 6631 FOREACH_PROC_IN_SYSTEM(p) { 6632 if (p->p_pid != pid || p->p_vmspace == NULL) 6633 continue; 6634 index = 0; 6635 pmap = vmspace_pmap(p->p_vmspace); 6636 for (i = 0; i < NPTE1_IN_PT1; i++) { 6637 pt1_entry_t pte1; 6638 pt2_entry_t *pte2p, pte2; 6639 vm_offset_t base, va; 6640 vm_paddr_t pa; 6641 vm_page_t m; 6642 6643 base = i << PTE1_SHIFT; 6644 pte1 = pte1_load(&pmap->pm_pt1[i]); 6645 6646 if (pte1_is_section(pte1)) { 6647 /* 6648 * QQQ: Do something here! 6649 */ 6650 } else if (pte1_is_link(pte1)) { 6651 for (j = 0; j < NPTE2_IN_PT2; j++) { 6652 va = base + (j << PAGE_SHIFT); 6653 if (va >= VM_MIN_KERNEL_ADDRESS) { 6654 if (index) { 6655 index = 0; 6656 printf("\n"); 6657 } 6658 sx_sunlock(&allproc_lock); 6659 return (npte2); 6660 } 6661 pte2p = pmap_pte2(pmap, va); 6662 pte2 = pte2_load(pte2p); 6663 pmap_pte2_release(pte2p); 6664 if (!pte2_is_valid(pte2)) 6665 continue; 6666 6667 pa = pte2_pa(pte2); 6668 m = PHYS_TO_VM_PAGE(pa); 6669 printf("va: 0x%x, pa: 0x%x, w: %d, " 6670 "f: 0x%x", va, pa, 6671 m->ref_count, m->flags); 6672 npte2++; 6673 index++; 6674 if (index >= 2) { 6675 index = 0; 6676 printf("\n"); 6677 } else { 6678 printf(" "); 6679 } 6680 } 6681 } 6682 } 6683 } 6684 sx_sunlock(&allproc_lock); 6685 return (npte2); 6686 } 6687 6688 #endif 6689 6690 #ifdef DDB 6691 static pt2_entry_t * 6692 pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) 6693 { 6694 pt1_entry_t pte1; 6695 vm_paddr_t pt2pg_pa; 6696 6697 pte1 = pte1_load(pmap_pte1(pmap, va)); 6698 if (!pte1_is_link(pte1)) 6699 return (NULL); 6700 6701 if (pmap_is_current(pmap)) 6702 return (pt2map_entry(va)); 6703 6704 /* Note that L2 page table size is not equal to PAGE_SIZE. 
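 * An L2 page table is only 1 KB (256 four-byte entries), so four of them
 * share one 4 KB page; hence the trunc_page() below, which maps the whole
 * page containing the table.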
*/ 6705 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 6706 if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { 6707 pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); 6708 #ifdef SMP 6709 PMAP3cpu = PCPU_GET(cpuid); 6710 #endif 6711 tlb_flush_local((vm_offset_t)PADDR3); 6712 } 6713 #ifdef SMP 6714 else if (PMAP3cpu != PCPU_GET(cpuid)) { 6715 PMAP3cpu = PCPU_GET(cpuid); 6716 tlb_flush_local((vm_offset_t)PADDR3); 6717 } 6718 #endif 6719 return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 6720 } 6721 6722 static void 6723 dump_pmap(pmap_t pmap) 6724 { 6725 6726 printf("pmap %p\n", pmap); 6727 printf(" pm_pt1: %p\n", pmap->pm_pt1); 6728 printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); 6729 printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); 6730 } 6731 6732 DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) 6733 { 6734 6735 pmap_t pmap; 6736 LIST_FOREACH(pmap, &allpmaps, pm_list) { 6737 dump_pmap(pmap); 6738 } 6739 } 6740 6741 static int 6742 pte2_class(pt2_entry_t pte2) 6743 { 6744 int cls; 6745 6746 cls = (pte2 >> 2) & 0x03; 6747 cls |= (pte2 >> 4) & 0x04; 6748 return (cls); 6749 } 6750 6751 static void 6752 dump_section(pmap_t pmap, uint32_t pte1_idx) 6753 { 6754 } 6755 6756 static void 6757 dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) 6758 { 6759 uint32_t i; 6760 vm_offset_t va; 6761 pt2_entry_t *pte2p, pte2; 6762 vm_page_t m; 6763 6764 va = pte1_idx << PTE1_SHIFT; 6765 pte2p = pmap_pte2_ddb(pmap, va); 6766 for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { 6767 pte2 = pte2_load(pte2p); 6768 if (pte2 == 0) 6769 continue; 6770 if (!pte2_is_valid(pte2)) { 6771 printf(" 0x%08X: 0x%08X", va, pte2); 6772 if (!invalid_ok) 6773 printf(" - not valid !!!"); 6774 printf("\n"); 6775 continue; 6776 } 6777 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 6778 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, 6779 pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); 6780 if (m != NULL) { 6781 printf(" v:%d w:%d f:0x%04X\n", m->valid, 6782 m->ref_count, m->flags); 6783 } else { 6784 printf("\n"); 6785 } 6786 } 6787 } 6788 6789 static __inline boolean_t 6790 is_pv_chunk_space(vm_offset_t va) 6791 { 6792 6793 if ((((vm_offset_t)pv_chunkbase) <= va) && 6794 (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) 6795 return (TRUE); 6796 return (FALSE); 6797 } 6798 6799 DB_SHOW_COMMAND(pmap, pmap_pmap_print) 6800 { 6801 /* XXX convert args. */ 6802 pmap_t pmap = (pmap_t)addr; 6803 pt1_entry_t pte1; 6804 pt2_entry_t pte2; 6805 vm_offset_t va, eva; 6806 vm_page_t m; 6807 uint32_t i; 6808 boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; 6809 6810 if (have_addr) { 6811 pmap_t pm; 6812 6813 LIST_FOREACH(pm, &allpmaps, pm_list) 6814 if (pm == pmap) break; 6815 if (pm == NULL) { 6816 printf("given pmap %p is not in allpmaps list\n", pmap); 6817 return; 6818 } 6819 } else 6820 pmap = PCPU_GET(curpmap); 6821 6822 eva = (modif[0] == 'u') ? 
VM_MAXUSER_ADDRESS : 0xFFFFFFFF; 6823 dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ 6824 6825 printf("pmap: 0x%08X\n", (uint32_t)pmap); 6826 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6827 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6828 6829 for(i = 0; i < NPTE1_IN_PT1; i++) { 6830 pte1 = pte1_load(&pmap->pm_pt1[i]); 6831 if (pte1 == 0) 6832 continue; 6833 va = i << PTE1_SHIFT; 6834 if (va >= eva) 6835 break; 6836 6837 if (pte1_is_section(pte1)) { 6838 printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, 6839 !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); 6840 dump_section(pmap, i); 6841 } else if (pte1_is_link(pte1)) { 6842 dump_link_ok = TRUE; 6843 invalid_ok = FALSE; 6844 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6845 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 6846 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", 6847 va, pte1, pte2, m); 6848 if (is_pv_chunk_space(va)) { 6849 printf(" - pv_chunk space"); 6850 if (dump_pv_chunk) 6851 invalid_ok = TRUE; 6852 else 6853 dump_link_ok = FALSE; 6854 } 6855 else if (m != NULL) 6856 printf(" w:%d w2:%u", m->ref_count, 6857 pt2_wirecount_get(m, pte1_index(va))); 6858 if (pte2 == 0) 6859 printf(" !!! pt2tab entry is ZERO"); 6860 else if (pte2_pa(pte1) != pte2_pa(pte2)) 6861 printf(" !!! pt2tab entry is DIFFERENT - m: %p", 6862 PHYS_TO_VM_PAGE(pte2_pa(pte2))); 6863 printf("\n"); 6864 if (dump_link_ok) 6865 dump_link(pmap, i, invalid_ok); 6866 } else 6867 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6868 } 6869 } 6870 6871 static void 6872 dump_pt2tab(pmap_t pmap) 6873 { 6874 uint32_t i; 6875 pt2_entry_t pte2; 6876 vm_offset_t va; 6877 vm_paddr_t pa; 6878 vm_page_t m; 6879 6880 printf("PT2TAB:\n"); 6881 for (i = 0; i < PT2TAB_ENTRIES; i++) { 6882 pte2 = pte2_load(&pmap->pm_pt2tab[i]); 6883 if (!pte2_is_valid(pte2)) 6884 continue; 6885 va = i << PT2TAB_SHIFT; 6886 pa = pte2_pa(pte2); 6887 m = PHYS_TO_VM_PAGE(pa); 6888 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, 6889 pte2_class(pte2), !!(pte2 & PTE2_S), m); 6890 if (m != NULL) 6891 printf(" , w: %d, f: 0x%04X pidx: %lld", 6892 m->ref_count, m->flags, m->pindex); 6893 printf("\n"); 6894 } 6895 } 6896 6897 DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) 6898 { 6899 /* XXX convert args. */ 6900 pmap_t pmap = (pmap_t)addr; 6901 pt1_entry_t pte1; 6902 pt2_entry_t pte2; 6903 vm_offset_t va; 6904 uint32_t i, start; 6905 6906 if (have_addr) { 6907 printf("supported only on current pmap\n"); 6908 return; 6909 } 6910 6911 pmap = PCPU_GET(curpmap); 6912 printf("curpmap: 0x%08X\n", (uint32_t)pmap); 6913 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6914 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6915 6916 start = pte1_index((vm_offset_t)PT2MAP); 6917 for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { 6918 pte1 = pte1_load(&pmap->pm_pt1[i]); 6919 if (pte1 == 0) 6920 continue; 6921 va = i << PTE1_SHIFT; 6922 if (pte1_is_section(pte1)) { 6923 printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, 6924 !!(pte1 & PTE1_S)); 6925 dump_section(pmap, i); 6926 } else if (pte1_is_link(pte1)) { 6927 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6928 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, 6929 pte1, pte2); 6930 if (pte2 == 0) 6931 printf(" !!! pt2tab entry is ZERO\n"); 6932 } else 6933 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6934 } 6935 dump_pt2tab(pmap); 6936 } 6937 #endif 6938
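
/*
 * Illustrative sketch (editorial, not part of this pmap implementation):
 * pmap_quick_enter_page() and pmap_quick_remove_page() above provide a
 * per-CPU, non-sleeping window for touching a single physical page.  A
 * hypothetical helper built on that pair could look like the following;
 * the name example_copy_to_page() and its arguments are made up purely
 * for illustration and do not exist in the kernel.
 */
#if 0
static void
example_copy_to_page(vm_page_t m, const void *src, size_t len)
{
	vm_offset_t qva;

	KASSERT(len <= PAGE_SIZE, ("example_copy_to_page: len too large"));

	/* Enters a critical section and installs the per-CPU mapping. */
	qva = pmap_quick_enter_page(m);
	bcopy(src, (void *)qva, len);
	/* Clears the mapping, flushes its TLB entry, exits the section. */
	pmap_quick_remove_page(qva);
}
#endif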