/*-
 * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 2014-2016 Svatopluk Kraus <skra@FreeBSD.org>
 * Copyright (c) 2014-2016 Michal Meloun <mmel@FreeBSD.org>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Manages physical address maps.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_vm.h"
#include "opt_pmap.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/sched.h>
#include <sys/sysctl.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/vm.h>
#include <vm/uma.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
#include <vm/vm_extern.h>
#include <vm/vm_reserv.h>
#include <sys/mutex.h>

#include <machine/md_var.h>
#include <machine/pmap_var.h>
#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/sf_buf.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC	200
#endif

#ifndef DIAGNOSTIC
#define PMAP_INLINE	__inline
#else
#define PMAP_INLINE
#endif

#ifdef PMAP_DEBUG
static void pmap_zero_page_check(vm_page_t m);
void pmap_debug(int level);
int pmap_pid_dump(int pid);

#define PDEBUG(_lev_, _stat_)						\
	if (pmap_debug_level >= (_lev_))				\
		((_stat_))
#define dprintf printf
int pmap_debug_level = 1;
#else /* PMAP_DEBUG */
#define PDEBUG(_lev_, _stat_)	/* Nothing */
#define dprintf(x, arg...)
#endif /* PMAP_DEBUG */

/*
 * Level 2 page tables map definition ('max' is excluded).
 */

#define PT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define PT2V_MAX_ADDRESS	((vm_offset_t)PT2MAP + PT2MAP_SIZE)

#define UPT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define UPT2V_MAX_ADDRESS \
    ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT)))

/*
 * Promotion to a 1MB (PTE1) page mapping requires that the corresponding
 * 4KB (PTE2) page mappings have identical settings for the following fields:
 */
#define PTE2_PROMOTE	(PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG |	\
			 PTE2_NX | PTE2_RO | PTE2_U | PTE2_W |		\
			 PTE2_ATTR_MASK)

#define PTE1_PROMOTE	(PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG |	\
			 PTE1_NX | PTE1_RO | PTE1_U | PTE1_W |		\
			 PTE1_ATTR_MASK)

#define ATTR_TO_L1(l2_attr)	((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \
				(((l2_attr) & L2_C) ? L1_S_C : 0) | \
				(((l2_attr) & L2_B) ? L1_S_B : 0) | \
				(((l2_attr) & PTE2_A) ? PTE1_A : 0) | \
				(((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \
				(((l2_attr) & PTE2_S) ? PTE1_S : 0) | \
				(((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \
				(((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \
				(((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \
				(((l2_attr) & PTE2_U) ? PTE1_U : 0) | \
				(((l2_attr) & PTE2_W) ? PTE1_W : 0))

#define ATTR_TO_L2(l1_attr)	((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \
				(((l1_attr) & L1_S_C) ? L2_C : 0) | \
				(((l1_attr) & L1_S_B) ? L2_B : 0) | \
				(((l1_attr) & PTE1_A) ? PTE2_A : 0) | \
				(((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \
				(((l1_attr) & PTE1_S) ? PTE2_S : 0) | \
				(((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \
				(((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \
				(((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \
				(((l1_attr) & PTE1_U) ? PTE2_U : 0) | \
				(((l1_attr) & PTE1_W) ? PTE2_W : 0))

/*
 * PTE2 descriptors creation macros.
 */
#define PTE2_ATTR_DEFAULT	vm_memattr_to_pte2(VM_MEMATTR_DEFAULT)
#define PTE2_ATTR_PT		vm_memattr_to_pte2(pt_memattr)

#define PTE2_KPT(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT)
#define PTE2_KPT_NG(pa)	PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT)

#define PTE2_KRW(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT)
#define PTE2_KRO(pa)	PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT)
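
/*
 * Example (illustrative only; the same pattern appears later in
 * pmap_bootstrap_prepare()): an L1 section attribute is derived from the
 * default L2 small-page attribute and then used to build a kernel PTE1:
 *
 *	l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT);
 *	pte1_store(pte1p, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr));
 */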

#define PV_STATS
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

/*
 * The boot_pt1 is used temporarily in the very early boot stage as the L1
 * page table.  We can init many things with no memory allocation thanks to
 * its static allocation and this brings two main advantages:
 * (1) other cores can be started very simply,
 * (2) various boot loaders can be supported as their arguments can be
 *     processed in virtual address space and can be moved to a safe location
 *     before the first allocation happens.
 * The only disadvantage is that boot_pt1 is used only in this very early
 * boot stage.  However, the table is uninitialized and so lies in bss.
 * Therefore kernel image size is not influenced.
 *
 * QQQ: In the future, maybe, boot_pt1 can be used for the soft reset and
 *      CPU suspend/resume game.
 */
extern pt1_entry_t boot_pt1[];

vm_paddr_t base_pt1;
pt1_entry_t *kern_pt1;
pt2_entry_t *kern_pt2tab;
pt2_entry_t *PT2MAP;

static uint32_t ttb_flags;
static vm_memattr_t pt_memattr;
ttb_entry_t pmap_kern_ttb;

struct pmap kernel_pmap_store;
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps;
static struct mtx allpmaps_lock;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

static vm_offset_t kernel_vm_end_new;
vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE;
vm_offset_t vm_max_kernel_address;
vm_paddr_t kernel_l1pa;

static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock;

/*
 * Data for the pv entry allocation mechanism.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static struct md_page *pv_table; /* XXX: Is only the pv_list in md_page used? */
static int shpgperproc = PMAP_SHPGPERPROC;

struct pv_chunk *pv_chunkbase;	/* KVA block for pv_chunks */
int pv_maxchunks;		/* How many chunks we have KVA for */
vm_offset_t pv_vafree;		/* freelist stored in the PTE */

vm_paddr_t first_managed_pa;
#define pa_to_pvh(pa)	(&pv_table[pte1_index(pa - first_managed_pa)])

/*
 * All those kernel PT submaps that BSD is so fond of.
 */
caddr_t _tmppt = 0;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static pt2_entry_t *PMAP1 = NULL, *PMAP2;
static pt2_entry_t *PADDR1 = NULL, *PADDR2;
#ifdef DDB
static pt2_entry_t *PMAP3;
static pt2_entry_t *PADDR3;
static int PMAP3cpu __unused; /* for SMP only */
#endif
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
    &PMAP1changedcpu, 0,
    "Number of times pmap_pte2_quick changed CPU with same PMAP1");
#endif
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
    &PMAP1changed, 0,
    "Number of times pmap_pte2_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
    &PMAP1unchanged, 0,
    "Number of times pmap_pte2_quick didn't change PMAP1");
static struct mtx PMAP2mutex;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static __inline void pt2_wirecount_init(vm_page_t m);
static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p,
    vm_offset_t va);
static int pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1,
    u_int flags, vm_page_t m);
void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size);

/*
 * Function to set the debug level of the pmap code.
 */
#ifdef PMAP_DEBUG
void
pmap_debug(int level)
{

	pmap_debug_level = level;
	dprintf("pmap_debug: level=%d\n", pmap_debug_level);
}
#endif /* PMAP_DEBUG */

/*
 * This table must correspond to the memory attribute configuration in vm.h.
 * The first entry is used for normal system mapping.
 *
 * Device memory is always marked as shared.
 * Normal memory is shared only in the SMP case.
 * The not-outer-shareable (NOS) bits are not used yet.
 * Class 6 cannot be used on ARM11.
 */
#define TEXDEF_TYPE_SHIFT	0
#define TEXDEF_TYPE_MASK	0x3
#define TEXDEF_INNER_SHIFT	2
#define TEXDEF_INNER_MASK	0x3
#define TEXDEF_OUTER_SHIFT	4
#define TEXDEF_OUTER_MASK	0x3
#define TEXDEF_NOS_SHIFT	6
#define TEXDEF_NOS_MASK		0x1

#define TEX(t, i, o, s)				\
	(((t) << TEXDEF_TYPE_SHIFT) |		\
	((i) << TEXDEF_INNER_SHIFT) |		\
	((o) << TEXDEF_OUTER_SHIFT) |		\
	((s) << TEXDEF_NOS_SHIFT))

static uint32_t tex_class[8] = {
/*	    type      inner cache  outer cache */
	TEX(PRRR_MEM, NMRR_WB_WA,  NMRR_WB_WA, 0),  /* 0 - ATTR_WB_WA	*/
	TEX(PRRR_MEM, NMRR_NC,	   NMRR_NC,    0),  /* 1 - ATTR_NOCACHE	*/
	TEX(PRRR_DEV, NMRR_NC,	   NMRR_NC,    0),  /* 2 - ATTR_DEVICE	*/
	TEX(PRRR_SO,  NMRR_NC,	   NMRR_NC,    0),  /* 3 - ATTR_SO	*/
	TEX(PRRR_MEM, NMRR_WT,	   NMRR_WT,    0),  /* 4 - ATTR_WT	*/
	TEX(PRRR_MEM, NMRR_NC,	   NMRR_NC,    0),  /* 5 - NOT USED YET	*/
	TEX(PRRR_MEM, NMRR_NC,	   NMRR_NC,    0),  /* 6 - NOT USED YET	*/
	TEX(PRRR_MEM, NMRR_NC,	   NMRR_NC,    0),  /* 7 - NOT USED YET	*/
};
#undef TEX

static uint32_t pte2_attr_tab[8] = {
	PTE2_ATTR_WB_WA,	/* 0 - VM_MEMATTR_WB_WA */
	PTE2_ATTR_NOCACHE,	/* 1 - VM_MEMATTR_NOCACHE */
	PTE2_ATTR_DEVICE,	/* 2 - VM_MEMATTR_DEVICE */
	PTE2_ATTR_SO,		/* 3 - VM_MEMATTR_SO */
	PTE2_ATTR_WT,		/* 4 - VM_MEMATTR_WRITE_THROUGH */
	0,			/* 5 - NOT USED YET */
	0,			/* 6 - NOT USED YET */
	0			/* 7 - NOT USED YET */
};
CTASSERT(VM_MEMATTR_WB_WA == 0);
CTASSERT(VM_MEMATTR_NOCACHE == 1);
CTASSERT(VM_MEMATTR_DEVICE == 2);
CTASSERT(VM_MEMATTR_SO == 3);
CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4);
#define VM_MEMATTR_END	(VM_MEMATTR_WRITE_THROUGH + 1)

boolean_t
pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
{

	return (mode >= 0 && mode < VM_MEMATTR_END);
}

static inline uint32_t
vm_memattr_to_pte2(vm_memattr_t ma)
{

	KASSERT((u_int)ma < VM_MEMATTR_END,
	    ("%s: bad vm_memattr_t %d", __func__, ma));
	return (pte2_attr_tab[(u_int)ma]);
}

static inline uint32_t
vm_page_pte2_attr(vm_page_t m)
{

	return (vm_memattr_to_pte2(m->md.pat_mode));
}

/*
 * Convert TEX definition entry to TTB flags.
 */
static uint32_t
encode_ttb_flags(int idx)
{
	uint32_t inner, outer, nos, reg;

	inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) &
	    TEXDEF_INNER_MASK;
	outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) &
	    TEXDEF_OUTER_MASK;
	nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) &
	    TEXDEF_NOS_MASK;

	reg = nos << 5;
	reg |= outer << 3;
	if (cpuinfo.coherent_walk)
		reg |= (inner & 0x1) << 6;
	reg |= (inner & 0x2) >> 1;
#ifdef SMP
	ARM_SMP_UP(
		reg |= 1 << 1,
	);
#endif
	return (reg);
}
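
/*
 * Example (illustrative): encode_ttb_flags() produces the cacheability and
 * shareability bits that are OR-ed into a translation table base value;
 * pmap_bootstrap_prepare() later does exactly this for the kernel L1 table:
 *
 *	ttb_flags = encode_ttb_flags(0);
 *	pmap_kern_ttb = base_pt1 | ttb_flags;
 */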

/*
 * Set TEX remapping registers in current CPU.
 */
void
pmap_set_tex(void)
{
	uint32_t prrr, nmrr;
	uint32_t type, inner, outer, nos;
	int i;

#ifdef PMAP_PTE_NOCACHE
	/* XXX fixme */
	if (cpuinfo.coherent_walk) {
		pt_memattr = VM_MEMATTR_WB_WA;
		ttb_flags = encode_ttb_flags(0);
	} else {
		pt_memattr = VM_MEMATTR_NOCACHE;
		ttb_flags = encode_ttb_flags(1);
	}
#else
	pt_memattr = VM_MEMATTR_WB_WA;
	ttb_flags = encode_ttb_flags(0);
#endif

	prrr = 0;
	nmrr = 0;

	/* Build remapping registers from TEX classes. */
	for (i = 0; i < 8; i++) {
		type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) &
		    TEXDEF_TYPE_MASK;
		inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) &
		    TEXDEF_INNER_MASK;
		outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) &
		    TEXDEF_OUTER_MASK;
		nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) &
		    TEXDEF_NOS_MASK;

		prrr |= type << (i * 2);
		prrr |= nos << (i + 24);
		nmrr |= inner << (i * 2);
		nmrr |= outer << (i * 2 + 16);
	}
	/* Add shareable bits for device memory. */
	prrr |= PRRR_DS0 | PRRR_DS1;

	/* Add shareable bits for normal memory in SMP case. */
#ifdef SMP
	ARM_SMP_UP(
		prrr |= PRRR_NS1,
	);
#endif
	cp15_prrr_set(prrr);
	cp15_nmrr_set(nmrr);

	/* Caches are disabled, so a full TLB flush should be enough. */
	tlb_flush_all_local();
}

/*
 * Remap one vm_memattr class to another one.  This can be useful as
 * a workaround for SoC errata, e.g. if devices must be accessed using
 * the SO memory class.
 *
 * !!! Please note that this function is an absolute last resort thing.
 * It should not be used under normal circumstances. !!!
 *
 * Usage rules:
 * - it shall be called after pmap_bootstrap_prepare() and before
 *   cpu_mp_start() (thus only on the boot CPU).  In practice, it's expected
 *   to be called from platform_attach() or platform_late_init().
 *
 * - if remapping doesn't change the caching mode, or an uncached class
 *   is remapped to any kind of cached one, then no other restriction exists.
 *
 * - if pmap_remap_vm_attr() changes the caching mode, but both (original and
 *   remapped) classes remain cached, then the caller is responsible for
 *   calling dcache_wbinv_poc_all().
 *
 * - remapping of any kind of cached class to an uncached one is not
 *   permitted.
 */
void
pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr)
{
	int old_idx, new_idx;

	/* Map VM memattrs to indexes into the tex_class table. */
	old_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)old_attr]);
	new_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)new_attr]);

	/* Replace the TEX attribute and apply it. */
	tex_class[old_idx] = tex_class[new_idx];
	pmap_set_tex();
}
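
/*
 * Example (illustrative, following the errata scenario described above):
 * a platform whose devices must be strongly ordered could do, e.g. from
 * platform_attach():
 *
 *	pmap_remap_vm_attr(VM_MEMATTR_DEVICE, VM_MEMATTR_SO);
 */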

/*
 * KERNBASE must be a multiple of NPT2_IN_PG * PTE1_SIZE.  In other words,
 * KERNBASE is mapped by the first L2 page table in an L2 page table page.
 * PT2MAP meets the same constraint as it is placed just under KERNBASE.
 */
CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0);
CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE);

/*
 * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general.
 * For now, anyhow, the following check must be fulfilled.
 */
CTASSERT(PAGE_SIZE == PTE2_SIZE);
/*
 * We don't want to mess up MI code with all MMU and PMAP definitions,
 * so some things, which depend on other ones, are defined independently.
 * Now, it is time to check that we don't screw up something.
 */
CTASSERT(PDRSHIFT == PTE1_SHIFT);
/*
 * Check L1 and L2 page table entries definitions consistency.
 */
CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1));
CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2));
/*
 * Check L2 page tables page consistency.
 */
CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2));
CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG);
/*
 * Check PT2TAB consistency.
 * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG.
 * The division must have no remainder.
 */
CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG));

/*
 * A PT2MAP magic.
 *
 * All level 2 page tables (PT2s) are mapped contiguously and accordingly
 * into the PT2MAP address space.  As PT2 size is less than PAGE_SIZE, this
 * can be done only if PAGE_SIZE is a multiple of PT2 size.  All PT2s in one
 * page must be used together, but not necessarily at once.  The first PT2
 * in a page must map things at a correctly aligned address and the others
 * must follow in the right order.
 */
#define NB_IN_PT2TAB	(PT2TAB_ENTRIES * sizeof(pt2_entry_t))
#define NPT2_IN_PT2TAB	(NB_IN_PT2TAB / NB_IN_PT2)
#define NPG_IN_PT2TAB	(NB_IN_PT2TAB / PAGE_SIZE)

/*
 * Check PT2TAB consistency.
 * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2.
 * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE.
 * Both divisions must have no remainder.
 */
CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2));
CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE));
/*
 * The implementation was made general, however, with the assumption
 * below in mind.  In case of another value of NPG_IN_PT2TAB,
 * the code should be rechecked once more.
 */
CTASSERT(NPG_IN_PT2TAB == 1);

/*
 * Get offset of PT2 in a page
 * associated with given PT1 index.
 */
static __inline u_int
page_pt2off(u_int pt1_idx)
{

	return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2);
}

/*
 * Get physical address of PT2
 * associated with given PT2s page and PT1 index.
 */
static __inline vm_paddr_t
page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx)
{

	return (pgpa + page_pt2off(pt1_idx));
}

/*
 * Get first entry of PT2
 * associated with given PT2s page and PT1 index.
 */
static __inline pt2_entry_t *
page_pt2(vm_offset_t pgva, u_int pt1_idx)
{

	return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx)));
}
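
/*
 * Worked example (illustrative, assuming the usual ARM short-descriptor
 * sizes: PAGE_SIZE == 4096 and NB_IN_PT2 == 1024, i.e. NPT2_IN_PG == 4):
 * PT1 index 5 selects the second PT2 within its PT2s page, so
 *
 *	page_pt2off(5) == (5 & 3) * 1024 == 1024
 *
 * and page_pt2pa(pgpa, 5) == pgpa + 1024.
 */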

/*
 * Get virtual address of PT2s page (mapped in PT2MAP)
 * which holds the PT2 which holds the entry which maps the given
 * virtual address.
 */
static __inline vm_offset_t
pt2map_pt2pg(vm_offset_t va)
{

	va &= ~(NPT2_IN_PG * PTE1_SIZE - 1);
	return ((vm_offset_t)pt2map_entry(va));
}

/*****************************************************************************
 *
 * THREE pmap initialization milestones exist:
 *
 * locore.S
 *  -> fundamental init (including MMU) in ASM
 *
 * initarm()
 *  -> fundamental init continues in C
 *  -> first available physical address is known
 *
 * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins)
 *  -> basic (safe) interface for physical address allocation is made
 *  -> basic (safe) interface for virtual mapping is made
 *  -> limited not-SMP-coherent work is possible
 *
 * -> more fundamental init continues in C
 * -> locks and some more things are available
 * -> all fundamental allocations and mappings are done
 *
 * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins)
 *  -> phys_avail[] and virtual_avail are set
 *  -> control is passed to the vm subsystem
 *  -> physical and virtual address allocation are off limits
 *  -> low level mapping functions, some SMP coherent,
 *     are available, which cannot be used until the vm subsystem
 *     is initialized
 *
 * mi_startup()
 *  -> vm subsystem is being initialized
 *
 * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins)
 *  -> pmap is fully initialized
 *
 *****************************************************************************/

/*****************************************************************************
 *
 * PMAP first stage initialization and utility functions
 * for pre-bootstrap epoch.
 *
 * After pmap_bootstrap_prepare() is called, the following functions
 * can be used:
 *
 * (1) strictly for this stage only, functions for physical page allocations,
 *     virtual space allocations, and mappings:
 *
 * vm_paddr_t pmap_preboot_get_pages(u_int num);
 * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num);
 * vm_offset_t pmap_preboot_reserve_pages(u_int num);
 * vm_offset_t pmap_preboot_get_vpages(u_int num);
 * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size,
 *     vm_prot_t prot, vm_memattr_t attr);
 *
 * (2) for all stages:
 *
 * vm_paddr_t pmap_kextract(vm_offset_t va);
 *
 * NOTE: This stage is not SMP coherent.
 *
 *****************************************************************************/

#define KERNEL_P2V(pa) \
    ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR))
#define KERNEL_V2P(va) \
    ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr))

static vm_paddr_t last_paddr;

/*
 * Pre-bootstrap epoch page allocator.
 */
vm_paddr_t
pmap_preboot_get_pages(u_int num)
{
	vm_paddr_t ret;

	ret = last_paddr;
	last_paddr += num * PAGE_SIZE;

	return (ret);
}
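
/*
 * Example (illustrative): pmap_bootstrap_prepare() below uses this bump
 * allocator for the kernel L1 table; note that a zero-page request simply
 * returns the current watermark, which is how the preallocated region's
 * end is obtained for vm_phys_add_seg():
 *
 *	base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1);
 *	...
 *	vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0));
 */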

/*
 * The fundamental initialization of PMAP stuff.
 *
 * Some things already happened in locore.S and some things could happen
 * before pmap_bootstrap_prepare() is called, so let's recall what is done:
 * 1. Caches are disabled.
 * 2. We are running on virtual addresses already with 'boot_pt1'
 *    as the L1 page table.
 * 3. So far, all virtual addresses can be converted to physical ones and
 *    vice versa by the following macros:
 *      KERNEL_P2V(pa) .... physical to virtual ones,
 *      KERNEL_V2P(va) .... virtual to physical ones.
 *
 * What is done herein:
 * 1. The 'boot_pt1' is replaced by the real kernel L1 page table 'kern_pt1'.
 * 2. The PT2MAP magic is brought to life.
 * 3. Basic preboot functions for page allocations and mappings can be used.
 * 4. Everything is prepared for L1 cache enabling.
 *
 * Variations:
 * 1. To use the second TTB register, so kernel and user page tables will be
 *    separated.  This way process forking - pmap_pinit() - could be faster,
 *    it saves physical pages and KVA per process, and it's a simple change.
 *    However, due to hardware matters, it will lead to one of the following:
 *    (a) 2G space for kernel and 2G space for users.
 *    (b) 1G space for kernel in low addresses and 3G for users above it.
 *    A question is: Is case (b) really an option?  Note that case (b)
 *    saves neither physical memory nor KVA.
 */
void
pmap_bootstrap_prepare(vm_paddr_t last)
{
	vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size;
	vm_offset_t pt2pg_va;
	pt1_entry_t *pte1p;
	pt2_entry_t *pte2p;
	u_int i;
	uint32_t l1_attr;

	/*
	 * Now, we are going to make the real kernel mapping.  Note that we
	 * are already running on some mapping made in locore.S and we expect
	 * that it's large enough to ensure nofault access to physical memory
	 * allocated herein before the switch.
	 *
	 * As the kernel image and everything needed before are and will be
	 * mapped by section mappings, we align the last physical address to
	 * PTE1_SIZE.
	 */
	last_paddr = pte1_roundup(last);

	/*
	 * Allocate and zero page(s) for the kernel L1 page table.
	 *
	 * Note that it's the first allocation in space which was PTE1_SIZE
	 * aligned and as such base_pt1 is aligned to NB_IN_PT1 too.
	 */
	base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1);
	kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1);
	bzero((void*)kern_pt1, NB_IN_PT1);
	pte1_sync_range(kern_pt1, NB_IN_PT1);

	/* Allocate and zero page(s) for the kernel PT2TAB. */
	pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB);
	kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa);
	bzero(kern_pt2tab, NB_IN_PT2TAB);
	pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB);

	/* Allocate and zero page(s) for kernel L2 page tables. */
	pt2pg_pa = pmap_preboot_get_pages(NKPT2PG);
	pt2pg_va = KERNEL_P2V(pt2pg_pa);
	size = NKPT2PG * PAGE_SIZE;
	bzero((void*)pt2pg_va, size);
	pte2_sync_range((pt2_entry_t *)pt2pg_va, size);

	/*
	 * Add a physical memory segment (vm_phys_seg) corresponding to the
	 * preallocated pages for kernel L2 page tables so that vm_page
	 * structures representing these pages will be created.  The vm_page
	 * structures are required for promotion of the corresponding kernel
	 * virtual addresses to section mappings.
	 */
	vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0));

	/*
	 * Insert allocated L2 page table pages to PT2TAB and make
	 * links to all PT2s in the L1 page table.  See how kernel_vm_end
	 * is initialized.
	 *
	 * We play simple and safe.  So every KVA will have an underlying
	 * L2 page table, even the kernel image mapped by sections.
	 */
	pte2p = kern_pt2tab_entry(KERNBASE);
	for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE)
		pt2tab_store(pte2p++, PTE2_KPT(pa));

	pte1p = kern_pte1(KERNBASE);
	for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2)
		pte1_store(pte1p++, PTE1_LINK(pa));

	/* Make section mappings for the kernel. */
	l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT);
	pte1p = kern_pte1(KERNBASE);
	for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE)
		pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr));

	/*
	 * Get free and aligned space for PT2MAP and make L1 page table links
	 * to L2 page tables held in PT2TAB.
	 *
	 * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t
	 * descriptors and the PT2TAB page(s) itself is (are) used as PT2s.
	 * Thus each entry in PT2TAB maps all PT2s in a page.  This implies
	 * that the virtual address of PT2MAP must be aligned to
	 * NPT2_IN_PG * PTE1_SIZE.
	 */
	PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE);
	pte1p = kern_pte1((vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) {
		pte1_store(pte1p++, PTE1_LINK(pa));
	}

	/*
	 * Store PT2TAB in PT2TAB itself, i.e. self reference mapping.
	 * Each pmap will hold its own PT2TAB, so the mapping should not be
	 * global.
	 */
	pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP);
	for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) {
		pt2tab_store(pte2p++, PTE2_KPT_NG(pa));
	}

	/*
	 * Choose the correct L2 page table and make mappings for allocations
	 * made herein which replace the temporary locore.S mappings after a
	 * while.  Note that PT2MAP cannot be used until we switch to
	 * kern_pt1.
	 *
	 * Note that these allocations started aligned on a 1M section and
	 * the kernel PT1 was allocated first.  Making of mappings must
	 * follow the order of physical allocations as we've used the
	 * KERNEL_P2V() macro for virtual address resolution.
	 */
	pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1);
	pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p)));

	pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1));

	/* Make mapping for kernel L1 page table. */
	for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE)
		pte2_store(pte2p++, PTE2_KPT(pa));

	/* Make mapping for kernel PT2TAB. */
	for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE)
		pte2_store(pte2p++, PTE2_KPT(pa));

	/* Finally, switch from 'boot_pt1' to 'kern_pt1'. */
	pmap_kern_ttb = base_pt1 | ttb_flags;
	cpuinfo_reinit_mmu(pmap_kern_ttb);
	/*
	 * Initialize the first available KVA.  As the kernel image is mapped
	 * by sections, we are leaving some gap behind.
	 */
	virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE;
}

/*
 * Setup L2 page table page for given KVA.
 * Used in pre-bootstrap epoch.
 *
 * Note that we have allocated NKPT2PG pages for L2 page tables in advance
 * and used them for mapping KVA starting from KERNBASE.  However, this is
 * not enough.  Vectors and devices need L2 page tables too.  Note that
 * they are even above VM_MAX_KERNEL_ADDRESS.
 */
static __inline vm_paddr_t
pmap_preboot_pt2pg_setup(vm_offset_t va)
{
	pt2_entry_t *pte2p, pte2;
	vm_paddr_t pt2pg_pa;

	/* Get associated entry in PT2TAB. */
	pte2p = kern_pt2tab_entry(va);

	/* Just return, if PT2s page exists already. */
	pte2 = pt2tab_load(pte2p);
	if (pte2_is_valid(pte2))
		return (pte2_pa(pte2));

	KASSERT(va >= VM_MAX_KERNEL_ADDRESS,
	    ("%s: NKPT2PG too small", __func__));

	/*
	 * Allocate page for PT2s and insert it to PT2TAB.
	 * In other words, map it into PT2MAP space.
	 */
	pt2pg_pa = pmap_preboot_get_pages(1);
	pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa));

	/* Zero all PT2s in allocated page. */
	bzero((void*)pt2map_pt2pg(va), PAGE_SIZE);
	pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE);

	return (pt2pg_pa);
}

/*
 * Setup L2 page table for given KVA.
 * Used in pre-bootstrap epoch.
 */
static void
pmap_preboot_pt2_setup(vm_offset_t va)
{
	pt1_entry_t *pte1p;
	vm_paddr_t pt2pg_pa, pt2_pa;

	/* Setup PT2's page. */
	pt2pg_pa = pmap_preboot_pt2pg_setup(va);
	pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va));

	/* Insert PT2 to PT1. */
	pte1p = kern_pte1(va);
	pte1_store(pte1p, PTE1_LINK(pt2_pa));
}

/*
 * Get L2 page entry associated with given KVA.
 * Used in pre-bootstrap epoch.
 */
static __inline pt2_entry_t*
pmap_preboot_vtopte2(vm_offset_t va)
{
	pt1_entry_t *pte1p;

	/* Setup PT2 if needed. */
	pte1p = kern_pte1(va);
	if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */
		pmap_preboot_pt2_setup(va);

	return (pt2map_entry(va));
}

/*
 * Pre-bootstrap epoch page(s) mapping(s).
 */
void
pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num)
{
	u_int i;
	pt2_entry_t *pte2p;

	/* Map all the pages. */
	for (i = 0; i < num; i++) {
		pte2p = pmap_preboot_vtopte2(va);
		pte2_store(pte2p, PTE2_KRW(pa));
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}
}

/*
 * Pre-bootstrap epoch virtual space allocator.
 */
vm_offset_t
pmap_preboot_reserve_pages(u_int num)
{
	u_int i;
	vm_offset_t start, va;
	pt2_entry_t *pte2p;

	/* Allocate virtual space. */
	start = va = virtual_avail;
	virtual_avail += num * PAGE_SIZE;

	/* Zero the mapping. */
	for (i = 0; i < num; i++) {
		pte2p = pmap_preboot_vtopte2(va);
		pte2_store(pte2p, 0);
		va += PAGE_SIZE;
	}

	return (start);
}

/*
 * Pre-bootstrap epoch page(s) allocation and mapping(s).
 */
vm_offset_t
pmap_preboot_get_vpages(u_int num)
{
	vm_paddr_t pa;
	vm_offset_t va;

	/* Allocate physical page(s). */
	pa = pmap_preboot_get_pages(num);

	/* Allocate virtual space. */
	va = virtual_avail;
	virtual_avail += num * PAGE_SIZE;

	/* Map and zero all. */
	pmap_preboot_map_pages(pa, va, num);
	bzero((void *)va, num * PAGE_SIZE);

	return (va);
}
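
/*
 * Example (illustrative only): grabbing two zeroed, mapped pages during
 * the pre-bootstrap epoch, e.g. for an early stack, is a single call:
 *
 *	va = pmap_preboot_get_vpages(2);
 *
 * which combines pmap_preboot_get_pages(), virtual space allocation and
 * pmap_preboot_map_pages() as shown above.
 */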

/*
 * Pre-bootstrap epoch page mapping(s) with attributes.
 */
void
pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size,
    vm_prot_t prot, vm_memattr_t attr)
{
	u_int num;
	u_int l1_attr, l1_prot, l2_prot, l2_attr;
	pt1_entry_t *pte1p;
	pt2_entry_t *pte2p;

	l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR;
	l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX;
	l2_attr = vm_memattr_to_pte2(attr);
	l1_prot = ATTR_TO_L1(l2_prot);
	l1_attr = ATTR_TO_L1(l2_attr);

	/* Map all the pages. */
	num = round_page(size);
	while (num > 0) {
		if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) {
			pte1p = kern_pte1(va);
			pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr));
			va += PTE1_SIZE;
			pa += PTE1_SIZE;
			num -= PTE1_SIZE;
		} else {
			pte2p = pmap_preboot_vtopte2(va);
			pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr));
			va += PAGE_SIZE;
			pa += PAGE_SIZE;
			num -= PAGE_SIZE;
		}
	}
}

/*
 * Extract from the kernel page table the physical address
 * that is mapped by the given virtual address "va".
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	vm_paddr_t pa;
	pt1_entry_t pte1;
	pt2_entry_t pte2;

	pte1 = pte1_load(kern_pte1(va));
	if (pte1_is_section(pte1)) {
		pa = pte1_pa(pte1) | (va & PTE1_OFFSET);
	} else if (pte1_is_link(pte1)) {
		/*
		 * We should beware of concurrent promotion that changes
		 * pte1 at this point.  However, it's not a problem as the
		 * PT2 page is preserved by promotion in PT2TAB.  So even if
		 * it happens, using PT2MAP is still safe.
		 *
		 * QQQ: However, concurrent removing is a problem which
		 *      ends in an abort on PT2MAP space.  Locking must be
		 *      used to deal with this.
		 */
		pte2 = pte2_load(pt2map_entry(va));
		pa = pte2_pa(pte2) | (va & PTE2_OFFSET);
	} else {
		panic("%s: va %#x pte1 %#x", __func__, va, pte1);
	}
	return (pa);
}

/*
 * Extract from the kernel page table the physical address
 * that is mapped by the given virtual address "va".  Also
 * return the L2 page table entry which maps the address.
 *
 * This is only intended to be used for panic dumps.
 */
vm_paddr_t
pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p)
{
	vm_paddr_t pa;
	pt1_entry_t pte1;
	pt2_entry_t pte2;

	pte1 = pte1_load(kern_pte1(va));
	if (pte1_is_section(pte1)) {
		pa = pte1_pa(pte1) | (va & PTE1_OFFSET);
		pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V;
	} else if (pte1_is_link(pte1)) {
		pte2 = pte2_load(pt2map_entry(va));
		pa = pte2_pa(pte2);
	} else {
		pte2 = 0;
		pa = 0;
	}
	if (pte2p != NULL)
		*pte2p = pte2;
	return (pa);
}
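
/*
 * Example (illustrative): pmap_kextract() works in every pmap epoch and is
 * what the machine-dependent vtophys() macro resolves to on arm; e.g.
 * pmap_ttb_get() below effectively does:
 *
 *	pa = pmap_kextract((vm_offset_t)pmap->pm_pt1);
 */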

/*****************************************************************************
 *
 * PMAP second stage initialization and utility functions
 * for bootstrap epoch.
 *
 * After pmap_bootstrap() is called, the following functions for
 * mappings can be used:
 *
 * void pmap_kenter(vm_offset_t va, vm_paddr_t pa);
 * void pmap_kremove(vm_offset_t va);
 * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end,
 *     int prot);
 *
 * NOTE: This stage is not SMP coherent.  And physical page allocation is
 *       not allowed during this stage.
 *
 *****************************************************************************/

/*
 * Initialize kernel PMAP locks and lists, kernel_pmap itself, and
 * reserve various virtual spaces for temporary mappings.
 */
void
pmap_bootstrap(vm_offset_t firstaddr)
{
	pt2_entry_t *unused __unused;
	struct pcpu *pc;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_l1pa = (vm_paddr_t)kern_pt1;  /* for libkvm */
	kernel_pmap->pm_pt1 = kern_pt1;
	kernel_pmap->pm_pt2tab = kern_pt2tab;
	CPU_FILL(&kernel_pmap->pm_active);  /* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);

	/*
	 * Initialize the global pv list lock.
	 */
	rw_init(&pvh_global_lock, "pmap pv global");

	LIST_INIT(&allpmaps);

	/*
	 * Request a spin mutex so that changes to allpmaps cannot be
	 * preempted by smp_rendezvous_cpus().
	 */
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define SYSMAP(c, p, v, n)  do {		\
	v = (c)pmap_preboot_reserve_pages(n);	\
	p = pt2map_entry((vm_offset_t)v);	\
	} while (0)

	/*
	 * Local CMAP1/CMAP2 are used for zeroing and copying pages.
	 * Local CMAP2 is also used for data cache cleaning.
	 */
	pc = get_pcpu();
	mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
	SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1);
	SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1);
	SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1);

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS);

	/*
	 * _tmppt is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, unused, _tmppt, 1);

	/*
	 * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(),
	 * respectively.  PADDR3 is used by pmap_pte2_ddb().
	 */
	SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1);
	SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1);
#ifdef DDB
	SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1);
#endif
	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);

	/*
	 * Note that in a very short time in initarm(), we are going to
	 * initialize the phys_avail[] array and no further page allocation
	 * can happen after that until the vm subsystem is initialized.
	 */
	kernel_vm_end_new = kernel_vm_end;
	virtual_end = vm_max_kernel_address;
}
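
/*
 * Example (illustrative): each SYSMAP() above reserves KVA and remembers
 * the PT2MAP entry that covers it; e.g. the crashdump invocation expands
 * to roughly:
 *
 *	crashdumpmap = (caddr_t)pmap_preboot_reserve_pages(MAXDUMPPGS);
 *	unused = pt2map_entry((vm_offset_t)crashdumpmap);
 */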

static void
pmap_init_reserved_pages(void)
{
	struct pcpu *pc;
	vm_offset_t pages;
	int i;

	CPU_FOREACH(i) {
		pc = pcpu_find(i);
		/*
		 * Skip if the mapping has already been initialized,
		 * i.e. this is the BSP.
		 */
		if (pc->pc_cmap1_addr != 0)
			continue;
		mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
		pages = kva_alloc(PAGE_SIZE * 3);
		if (pages == 0)
			panic("%s: unable to allocate KVA", __func__);
		pc->pc_cmap1_pte2p = pt2map_entry(pages);
		pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE);
		pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2));
		pc->pc_cmap1_addr = (caddr_t)pages;
		pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE);
		pc->pc_qmap_addr = pages + (PAGE_SIZE * 2);
	}
}
SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);

/*
 * The function can already be used in the second initialization stage.
 * As such, the function DOES NOT call pmap_growkernel() where PT2
 * allocation can happen.  So if used, be sure that PT2 for the given
 * virtual address is allocated already!
 *
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 */
static __inline void
pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot,
    uint32_t attr)
{
	pt1_entry_t *pte1p;
	pt2_entry_t *pte2p;

	pte1p = kern_pte1(va);
	if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */
		/*
		 * This is a very low level function, so PT2 and particularly
		 * PT2PG associated with the given virtual address must be
		 * already allocated.  It's a pain mainly during pmap
		 * initialization stage.  However, a call after pmap
		 * initialization with a virtual address not under
		 * kernel_vm_end will lead to the same misery.
		 */
		if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va))))
			panic("%s: kernel PT2 not allocated!", __func__);
	}

	pte2p = pt2map_entry(va);
	pte2_store(pte2p, PTE2_KERN(pa, prot, attr));
}

PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{

	pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt1_entry_t *pte1p;
	pt2_entry_t *pte2p;

	pte1p = kern_pte1(va);
	if (pte1_is_section(pte1_load(pte1p))) {
		pte1_clear(pte1p);
	} else {
		pte2p = pt2map_entry(va);
		pte2_clear(pte2p);
	}
}
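
/*
 * Example (illustrative): neither pmap_kenter() nor pmap_kremove() does
 * any TLB maintenance, so a caller pairs them with a flush, as
 * pmap_kenter_temporary() and pmap_qremove() below do:
 *
 *	pmap_kenter(va, pa);
 *	tlb_flush_local(va);
 *	...
 *	pmap_kremove(va);
 *	tlb_flush(va);
 */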

/*
 * Share new kernel PT2PG with all pmaps.
 * The caller is responsible for maintaining TLB consistency.
 */
static void
pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2)
{
	pmap_t pmap;
	pt2_entry_t *pte2p;

	mtx_lock_spin(&allpmaps_lock);
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		pte2p = pmap_pt2tab_entry(pmap, va);
		pt2tab_store(pte2p, npte2);
	}
	mtx_unlock_spin(&allpmaps_lock);
}

/*
 * Share new kernel PTE1 with all pmaps.
 * The caller is responsible for maintaining TLB consistency.
 */
static void
pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1)
{
	pmap_t pmap;
	pt1_entry_t *pte1p;

	mtx_lock_spin(&allpmaps_lock);
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		pte1p = pmap_pte1(pmap, va);
		pte1_store(pte1p, npte1);
	}
	mtx_unlock_spin(&allpmaps_lock);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping.  Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged.  Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 *
 * NOTE: Read the comments above pmap_kenter_prot_attr() as
 *       the function is used herein!
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va, sva;
	vm_paddr_t pte1_offset;
	pt1_entry_t npte1;
	uint32_t l1prot, l2prot;
	uint32_t l1attr, l2attr;

	PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x),"
	    " prot = %d\n", __func__, *virt, start, end, end - start, prot));

	l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR;
	l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX;
	l1prot = ATTR_TO_L1(l2prot);

	l2attr = PTE2_ATTR_DEFAULT;
	l1attr = ATTR_TO_L1(l2attr);

	va = *virt;
	/*
	 * Does the physical address range's size and alignment permit at
	 * least one section mapping to be created?
	 */
	pte1_offset = start & PTE1_OFFSET;
	if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >=
	    PTE1_SIZE) {
		/*
		 * Increase the starting virtual address so that its alignment
		 * does not preclude the use of section mappings.
		 */
		if ((va & PTE1_OFFSET) < pte1_offset)
			va = pte1_trunc(va) + pte1_offset;
		else if ((va & PTE1_OFFSET) > pte1_offset)
			va = pte1_roundup(va) + pte1_offset;
	}
	sva = va;
	while (start < end) {
		if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) {
			KASSERT((va & PTE1_OFFSET) == 0,
			    ("%s: misaligned va %#x", __func__, va));
			npte1 = PTE1_KERN(start, l1prot, l1attr);
			pmap_kenter_pte1(va, npte1);
			va += PTE1_SIZE;
			start += PTE1_SIZE;
		} else {
			pmap_kenter_prot_attr(va, start, l2prot, l2attr);
			va += PAGE_SIZE;
			start += PAGE_SIZE;
		}
	}
	tlb_flush_range(sva, va - sva);
	*virt = va;
	return (sva);
}

/*
 * Make a temporary mapping for a physical address.
 * This is only intended to be used for panic dumps.
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, int i)
{
	vm_offset_t va;

	/* QQQ: 'i' should be less than or equal to MAXDUMPPGS. */

	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
	pmap_kenter(va, pa);
	tlb_flush_local(va);
	return ((void *)crashdumpmap);
}

/*************************************
 *
 *  TLB & cache maintenance routines.
 *
 *************************************/

/*
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_tlb_flush(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		tlb_flush(va);
}

PMAP_INLINE void
pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		tlb_flush_range(sva, size);
}

/*
 * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through.
 * Requirements:
 *  - Must deal with pages in order to ensure that none of the PTE2_* bits
 *    are ever set, PTE2_V in particular.
 *  - Assumes we can write to pte2s without pte2_store() atomic ops.
 *  - Assumes nothing will ever test these addresses for 0 to indicate
 *    no mapping instead of correctly checking PTE2_V.
 *  - Assumes a vm_offset_t will fit in a pte2 (true for arm).
 * Because PTE2_V is never set, there can be no mappings to invalidate.
 */
static vm_offset_t
pmap_pte2list_alloc(vm_offset_t *head)
{
	pt2_entry_t *pte2p;
	vm_offset_t va;

	va = *head;
	if (va == 0)
		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
	pte2p = pt2map_entry(va);
	*head = *pte2p;
	if (*head & PTE2_V)
		panic("%s: va with PTE2_V set!", __func__);
	*pte2p = 0;
	return (va);
}

static void
pmap_pte2list_free(vm_offset_t *head, vm_offset_t va)
{
	pt2_entry_t *pte2p;

	if (va & PTE2_V)
		panic("%s: freeing va with PTE2_V set!", __func__);
	pte2p = pt2map_entry(va);
	*pte2p = *head;		/* virtual! PTE2_V is 0 though */
	*head = va;
}

static void
pmap_pte2list_init(vm_offset_t *head, void *base, int npages)
{
	int i;
	vm_offset_t va;

	*head = 0;
	for (i = npages - 1; i >= 0; i--) {
		va = (vm_offset_t)base + i * PAGE_SIZE;
		pmap_pte2list_free(head, va);
	}
}
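
/*
 * Example (illustrative): pmap_init() below seeds this freelist with the
 * KVA reserved for pv chunks, and the pv entry code then cycles addresses
 * through it:
 *
 *	pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
 *	va = pmap_pte2list_alloc(&pv_vafree);	/- take one page of KVA -/
 *	...
 *	pmap_pte2list_free(&pv_vafree, va);	/- return it -/
 */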

/*****************************************************************************
 *
 * PMAP third and final stage initialization.
 *
 * After pmap_init() is called, the PMAP subsystem is fully initialized.
 *
 *****************************************************************************/

SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
    "Max number of PV entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
    "Page share factor per proc");

static u_long nkpt2pg = NKPT2PG;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD,
    &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s");

static int sp_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &sp_enabled, 0, "Are large page mappings enabled?");

bool
pmap_ps_enabled(pmap_t pmap __unused)
{

	return (sp_enabled != 0);
}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "1MB page mapping counters");

static u_long pmap_pte1_demotions;
SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pte1_demotions, 0, "1MB page demotions");

static u_long pmap_pte1_mappings;
SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pte1_mappings, 0, "1MB page mappings");

static u_long pmap_pte1_p_failures;
SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pte1_p_failures, 0, "1MB page promotion failures");

static u_long pmap_pte1_promotions;
SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pte1_promotions, 0, "1MB page promotions");

static u_long pmap_pte1_kern_demotions;
SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD,
    &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions");

static u_long pmap_pte1_kern_promotions;
SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD,
    &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions");

static __inline ttb_entry_t
pmap_ttb_get(pmap_t pmap)
{

	return (vtophys(pmap->pm_pt1) | ttb_flags);
}

/*
 * Initialize a vm_page's machine-dependent fields.
 *
 * Variations:
 * 1. Pages for L2 page tables are never managed.  So, pv_list and
 *    pt2_wirecount can share the same physical space.  However, proper
 *    initialization on a page alloc for page tables and reinitialization
 *    on the page free must be ensured.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	pt2_wirecount_init(m);
	m->md.pat_mode = VM_MEMATTR_DEFAULT;
}

/*
 * Virtualization for a faster way to zero a whole page.
 */
static __inline void
pagezero(void *page)
{

	bzero(page, PAGE_SIZE);
}

/*
 * Zero an L2 page table page.
 * Use the same KVA as in pmap_zero_page().
 */
static __inline vm_paddr_t
pmap_pt2pg_zero(vm_page_t m)
{
	pt2_entry_t *cmap2_pte2p;
	vm_paddr_t pa;
	struct pcpu *pc;

	pa = VM_PAGE_TO_PHYS(m);

	/*
	 * XXX: For now, we map the whole page even if it's already zero,
	 *      to sync it even if the sync is only DSB.
	 */
	sched_pin();
	pc = get_pcpu();
	cmap2_pte2p = pc->pc_cmap2_pte2p;
	mtx_lock(&pc->pc_cmap_lock);
	if (pte2_load(cmap2_pte2p) != 0)
		panic("%s: CMAP2 busy", __func__);
	pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW,
	    vm_page_pte2_attr(m)));
	/* Even VM_ALLOC_ZERO request is only advisory. */
	if ((m->flags & PG_ZERO) == 0)
		pagezero(pc->pc_cmap2_addr);
	pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE);
	pte2_clear(cmap2_pte2p);
	tlb_flush((vm_offset_t)pc->pc_cmap2_addr);

	/*
	 * Unpin the thread before releasing the lock.  Otherwise the thread
	 * could be rescheduled while still bound to the current CPU, only
	 * to unpin itself immediately upon resuming execution.
	 */
	sched_unpin();
	mtx_unlock(&pc->pc_cmap_lock);

	return (pa);
}

/*
 * Initialize a just-allocated page as an L2 page table(s) holder
 * and return its physical address.
 */
static __inline vm_paddr_t
pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	vm_paddr_t pa;
	pt2_entry_t *pte2p;

	/* Check page attributes. */
	if (m->md.pat_mode != pt_memattr)
		pmap_page_set_memattr(m, pt_memattr);

	/* Zero page and init wire counts. */
	pa = pmap_pt2pg_zero(m);
	pt2_wirecount_init(m);

	/*
	 * Map the page to PT2MAP address space for the given pmap.
	 * Note that PT2MAP space is shared with all pmaps.
	 */
	if (pmap == kernel_pmap)
		pmap_kenter_pt2tab(va, PTE2_KPT(pa));
	else {
		pte2p = pmap_pt2tab_entry(pmap, va);
		pt2tab_store(pte2p, PTE2_KPT_NG(pa));
	}

	return (pa);
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_size_t s;
	pt2_entry_t *pte2p, pte2;
	u_int i, pte1_idx, pv_npg;

	PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR));

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * L2 page table pages allocated in advance.
	 */
	pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE);
	pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE);
	for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) {
		vm_paddr_t pa;
		vm_page_t m;

		pte2 = pte2_load(pte2p);
		KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__));

		pa = pte2_pa(pte2);
		m = PHYS_TO_VM_PAGE(pa);
		KASSERT(m >= vm_page_array &&
		    m < &vm_page_array[vm_page_array_size],
		    ("%s: L2 page table page is out of range", __func__));

		m->pindex = pte1_idx;
		m->phys_addr = pa;
		pte1_idx += NPT2_IN_PG;
	}

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_max = roundup(pv_entry_max, _NPCPV);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled);
	if (sp_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("%s: can't assign to pagesizes[1]", __func__));
		pagesizes[1] = PTE1_SIZE;
	}

	/*
	 * Calculate the size of the pv head table for sections.
	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
	 * Note that the table is only for sections which could be promoted.
	 */
	first_managed_pa = pte1_trunc(vm_phys_segs[0].start);
	pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE)
	    - first_managed_pa) / PTE1_SIZE + 1;

	/*
	 * Allocate memory for the pv head table for sections.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);

	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
	if (pv_chunkbase == NULL)
		panic("%s: not enough kvm for pv chunks", __func__);
	pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
}

/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	u_int anychanged;
	pt2_entry_t *epte2p, *pte2p, pte2;
	vm_page_t m;
	vm_paddr_t pa;

	anychanged = 0;
	pte2p = pt2map_entry(sva);
	epte2p = pte2p + count;
	while (pte2p < epte2p) {
		m = *ma++;
		pa = VM_PAGE_TO_PHYS(m);
		pte2 = pte2_load(pte2p);
		if ((pte2_pa(pte2) != pa) ||
		    (pte2_attr(pte2) != vm_page_pte2_attr(m))) {
			anychanged++;
			pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW,
			    vm_page_pte2_attr(m)));
		}
		pte2p++;
	}
	if (__predict_false(anychanged))
		tlb_flush_range(sva, count * PAGE_SIZE);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
	tlb_flush_range(sva, va - sva);
}
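
/*
 * Example (illustrative): the usual pmap_qenter()/pmap_qremove() pairing
 * for a temporary, SMP-coherent mapping of one wired page:
 *
 *	pmap_qenter(va, &m, 1);
 *	... access the page through va ...
 *	pmap_qremove(va, 1);
 */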
Uses a ranged shootdown IPI. 1835 */ 1836 void 1837 pmap_qremove(vm_offset_t sva, int count) 1838 { 1839 vm_offset_t va; 1840 1841 va = sva; 1842 while (count-- > 0) { 1843 pmap_kremove(va); 1844 va += PAGE_SIZE; 1845 } 1846 tlb_flush_range(sva, va - sva); 1847 } 1848 1849 /* 1850 * Are we current address space or kernel? 1851 */ 1852 static __inline int 1853 pmap_is_current(pmap_t pmap) 1854 { 1855 1856 return (pmap == kernel_pmap || 1857 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); 1858 } 1859 1860 /* 1861 * If the given pmap is not the current or kernel pmap, the returned 1862 * pte2 must be released by passing it to pmap_pte2_release(). 1863 */ 1864 static pt2_entry_t * 1865 pmap_pte2(pmap_t pmap, vm_offset_t va) 1866 { 1867 pt1_entry_t pte1; 1868 vm_paddr_t pt2pg_pa; 1869 1870 pte1 = pte1_load(pmap_pte1(pmap, va)); 1871 if (pte1_is_section(pte1)) 1872 panic("%s: attempt to map PTE1", __func__); 1873 if (pte1_is_link(pte1)) { 1874 /* Are we current address space or kernel? */ 1875 if (pmap_is_current(pmap)) 1876 return (pt2map_entry(va)); 1877 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1878 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1879 mtx_lock(&PMAP2mutex); 1880 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 1881 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 1882 tlb_flush((vm_offset_t)PADDR2); 1883 } 1884 return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1885 } 1886 return (NULL); 1887 } 1888 1889 /* 1890 * Releases a pte2 that was obtained from pmap_pte2(). 1891 * Be prepared for the pte2p being NULL. 1892 */ 1893 static __inline void 1894 pmap_pte2_release(pt2_entry_t *pte2p) 1895 { 1896 1897 if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { 1898 mtx_unlock(&PMAP2mutex); 1899 } 1900 } 1901 1902 /* 1903 * Super fast pmap_pte2 routine best used when scanning 1904 * the pv lists. This eliminates many coarse-grained 1905 * invltlb calls. Note that many of the pv list 1906 * scans are across different pmaps. It is very wasteful 1907 * to do an entire tlb flush for checking a single mapping. 1908 * 1909 * If the given pmap is not the current pmap, pvh_global_lock 1910 * must be held and curthread pinned to a CPU. 1911 */ 1912 static pt2_entry_t * 1913 pmap_pte2_quick(pmap_t pmap, vm_offset_t va) 1914 { 1915 pt1_entry_t pte1; 1916 vm_paddr_t pt2pg_pa; 1917 1918 pte1 = pte1_load(pmap_pte1(pmap, va)); 1919 if (pte1_is_section(pte1)) 1920 panic("%s: attempt to map PTE1", __func__); 1921 if (pte1_is_link(pte1)) { 1922 /* Are we current address space or kernel? */ 1923 if (pmap_is_current(pmap)) 1924 return (pt2map_entry(va)); 1925 rw_assert(&pvh_global_lock, RA_WLOCKED); 1926 KASSERT(curthread->td_pinned > 0, 1927 ("%s: curthread not pinned", __func__)); 1928 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1929 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1930 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 1931 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 1932 #ifdef SMP 1933 PMAP1cpu = PCPU_GET(cpuid); 1934 #endif 1935 tlb_flush_local((vm_offset_t)PADDR1); 1936 PMAP1changed++; 1937 } else 1938 #ifdef SMP 1939 if (PMAP1cpu != PCPU_GET(cpuid)) { 1940 PMAP1cpu = PCPU_GET(cpuid); 1941 tlb_flush_local((vm_offset_t)PADDR1); 1942 PMAP1changedcpu++; 1943 } else 1944 #endif 1945 PMAP1unchanged++; 1946 return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1947 } 1948 return (NULL); 1949 } 1950 1951 /* 1952 * Routine: pmap_extract 1953 * Function: 1954 * Extract the physical page address associated 1955 * with the given map/virtual_address pair. 
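 *
 * A minimal caller sketch (hypothetical, for illustration only):
 *
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(kernel_pmap, va);
 *	if (pa == 0)
 *		;	/* no valid mapping at va */
 *
 * Note that zero doubles as the "no mapping" return value, so a
 * mapping of physical address zero cannot be distinguished from a
 * missing mapping through this interface.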
1956 */ 1957 vm_paddr_t 1958 pmap_extract(pmap_t pmap, vm_offset_t va) 1959 { 1960 vm_paddr_t pa; 1961 pt1_entry_t pte1; 1962 pt2_entry_t *pte2p; 1963 1964 PMAP_LOCK(pmap); 1965 pte1 = pte1_load(pmap_pte1(pmap, va)); 1966 if (pte1_is_section(pte1)) 1967 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1968 else if (pte1_is_link(pte1)) { 1969 pte2p = pmap_pte2(pmap, va); 1970 pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); 1971 pmap_pte2_release(pte2p); 1972 } else 1973 pa = 0; 1974 PMAP_UNLOCK(pmap); 1975 return (pa); 1976 } 1977 1978 /* 1979 * Routine: pmap_extract_and_hold 1980 * Function: 1981 * Atomically extract and hold the physical page 1982 * with the given pmap and virtual address pair 1983 * if that mapping permits the given protection. 1984 */ 1985 vm_page_t 1986 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1987 { 1988 vm_paddr_t pa; 1989 pt1_entry_t pte1; 1990 pt2_entry_t pte2, *pte2p; 1991 vm_page_t m; 1992 1993 m = NULL; 1994 PMAP_LOCK(pmap); 1995 pte1 = pte1_load(pmap_pte1(pmap, va)); 1996 if (pte1_is_section(pte1)) { 1997 if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { 1998 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1999 m = PHYS_TO_VM_PAGE(pa); 2000 if (!vm_page_wire_mapped(m)) 2001 m = NULL; 2002 } 2003 } else if (pte1_is_link(pte1)) { 2004 pte2p = pmap_pte2(pmap, va); 2005 pte2 = pte2_load(pte2p); 2006 pmap_pte2_release(pte2p); 2007 if (pte2_is_valid(pte2) && 2008 (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { 2009 pa = pte2_pa(pte2); 2010 m = PHYS_TO_VM_PAGE(pa); 2011 if (!vm_page_wire_mapped(m)) 2012 m = NULL; 2013 } 2014 } 2015 PMAP_UNLOCK(pmap); 2016 return (m); 2017 } 2018 2019 /* 2020 * Grow the number of kernel L2 page table entries, if needed. 2021 */ 2022 void 2023 pmap_growkernel(vm_offset_t addr) 2024 { 2025 vm_page_t m; 2026 vm_paddr_t pt2pg_pa, pt2_pa; 2027 pt1_entry_t pte1; 2028 pt2_entry_t pte2; 2029 2030 PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); 2031 /* 2032 * All the time kernel_vm_end is first KVA for which underlying 2033 * L2 page table is either not allocated or linked from L1 page table 2034 * (not considering sections). Except for two possible cases: 2035 * 2036 * (1) in the very beginning as long as pmap_growkernel() was 2037 * not called, it could be first unused KVA (which is not 2038 * rounded up to PTE1_SIZE), 2039 * 2040 * (2) when all KVA space is mapped and vm_map_max(kernel_map) 2041 * address is not rounded up to PTE1_SIZE. (For example, 2042 * it could be 0xFFFFFFFF.) 2043 */ 2044 kernel_vm_end = pte1_roundup(kernel_vm_end); 2045 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2046 addr = roundup2(addr, PTE1_SIZE); 2047 if (addr - 1 >= vm_map_max(kernel_map)) 2048 addr = vm_map_max(kernel_map); 2049 while (kernel_vm_end < addr) { 2050 pte1 = pte1_load(kern_pte1(kernel_vm_end)); 2051 if (pte1_is_valid(pte1)) { 2052 kernel_vm_end += PTE1_SIZE; 2053 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2054 kernel_vm_end = vm_map_max(kernel_map); 2055 break; 2056 } 2057 continue; 2058 } 2059 2060 /* 2061 * kernel_vm_end_new is used in pmap_pinit() when kernel 2062 * mappings are entered to new pmap all at once to avoid race 2063 * between pmap_kenter_pte1() and kernel_vm_end increase. 2064 * The same aplies to pmap_kenter_pt2tab(). 2065 */ 2066 kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; 2067 2068 pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); 2069 if (!pte2_is_valid(pte2)) { 2070 /* 2071 * Install new PT2s page into kernel PT2TAB. 
2072 */ 2073 m = vm_page_alloc(NULL, 2074 pte1_index(kernel_vm_end) & ~PT2PG_MASK, 2075 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2076 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2077 if (m == NULL) 2078 panic("%s: no memory to grow kernel", __func__); 2079 /* 2080 * QQQ: To link all new L2 page tables from L1 page 2081 * table now and so pmap_kenter_pte1() them 2082 * at once together with pmap_kenter_pt2tab() 2083 * could be nice speed up. However, 2084 * pmap_growkernel() does not happen so often... 2085 * QQQ: The other TTBR is another option. 2086 */ 2087 pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, 2088 m); 2089 } else 2090 pt2pg_pa = pte2_pa(pte2); 2091 2092 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); 2093 pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); 2094 2095 kernel_vm_end = kernel_vm_end_new; 2096 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2097 kernel_vm_end = vm_map_max(kernel_map); 2098 break; 2099 } 2100 } 2101 } 2102 2103 static int 2104 kvm_size(SYSCTL_HANDLER_ARGS) 2105 { 2106 unsigned long ksize = vm_max_kernel_address - KERNBASE; 2107 2108 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2109 } 2110 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, 2111 CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_size, "IU", 2112 "Size of KVM"); 2113 2114 static int 2115 kvm_free(SYSCTL_HANDLER_ARGS) 2116 { 2117 unsigned long kfree = vm_max_kernel_address - kernel_vm_end; 2118 2119 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2120 } 2121 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, 2122 CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, kvm_free, "IU", 2123 "Amount of KVM free"); 2124 2125 /*********************************************** 2126 * 2127 * Pmap allocation/deallocation routines. 2128 * 2129 ***********************************************/ 2130 2131 /* 2132 * Initialize the pmap for the swapper process. 2133 */ 2134 void 2135 pmap_pinit0(pmap_t pmap) 2136 { 2137 PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); 2138 2139 PMAP_LOCK_INIT(pmap); 2140 2141 /* 2142 * Kernel page table directory and pmap stuff around is already 2143 * initialized, we are using it right now and here. So, finish 2144 * only PMAP structures initialization for process0 ... 2145 * 2146 * Since the L1 page table and PT2TAB is shared with the kernel pmap, 2147 * which is already included in the list "allpmaps", this pmap does 2148 * not need to be inserted into that list. 2149 */ 2150 pmap->pm_pt1 = kern_pt1; 2151 pmap->pm_pt2tab = kern_pt2tab; 2152 CPU_ZERO(&pmap->pm_active); 2153 PCPU_SET(curpmap, pmap); 2154 TAILQ_INIT(&pmap->pm_pvchunk); 2155 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2156 CPU_SET(0, &pmap->pm_active); 2157 } 2158 2159 static __inline void 2160 pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, 2161 vm_offset_t eva) 2162 { 2163 u_int idx, count; 2164 2165 idx = pte1_index(sva); 2166 count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); 2167 bcopy(spte1p + idx, dpte1p + idx, count); 2168 } 2169 2170 static __inline void 2171 pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, 2172 vm_offset_t eva) 2173 { 2174 u_int idx, count; 2175 2176 idx = pt2tab_index(sva); 2177 count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); 2178 bcopy(spte2p + idx, dpte2p + idx, count); 2179 } 2180 2181 /* 2182 * Initialize a preallocated and zeroed pmap structure, 2183 * such as one in a vmspace structure. 
2184 */ 2185 int 2186 pmap_pinit(pmap_t pmap) 2187 { 2188 pt1_entry_t *pte1p; 2189 pt2_entry_t *pte2p; 2190 vm_paddr_t pa, pt2tab_pa; 2191 u_int i; 2192 2193 PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, 2194 pmap->pm_pt1)); 2195 2196 /* 2197 * No need to allocate L2 page table space yet but we do need 2198 * a valid L1 page table and PT2TAB table. 2199 * 2200 * Install shared kernel mappings to these tables. It's a little 2201 * tricky as some parts of KVA are reserved for vectors, devices, 2202 * and whatever else. These parts are supposed to be above 2203 * vm_max_kernel_address. Thus two regions should be installed: 2204 * 2205 * (1) <KERNBASE, kernel_vm_end), 2206 * (2) <vm_max_kernel_address, 0xFFFFFFFF>. 2207 * 2208 * QQQ: The second region should be stable enough to be installed 2209 * only once in time when the tables are allocated. 2210 * QQQ: Maybe copy of both regions at once could be faster ... 2211 * QQQ: Maybe the other TTBR is an option. 2212 * 2213 * Finally, install own PT2TAB table to these tables. 2214 */ 2215 2216 if (pmap->pm_pt1 == NULL) { 2217 pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(NB_IN_PT1, 2218 M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr); 2219 if (pmap->pm_pt1 == NULL) 2220 return (0); 2221 } 2222 if (pmap->pm_pt2tab == NULL) { 2223 /* 2224 * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page 2225 * only, what should be the only size for 32 bit systems, 2226 * then we could allocate it with vm_page_alloc() and all 2227 * the stuff needed as other L2 page table pages. 2228 * (2) Note that a process PT2TAB is special L2 page table 2229 * page. Its mapping in kernel_arena is permanent and can 2230 * be used no matter which process is current. Its mapping 2231 * in PT2MAP can be used only for current process. 2232 */ 2233 pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(NB_IN_PT2TAB, 2234 M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); 2235 if (pmap->pm_pt2tab == NULL) { 2236 /* 2237 * QQQ: As struct pmap is allocated from UMA with 2238 * UMA_ZONE_NOFREE flag, it's important to leave 2239 * no allocation in pmap if initialization failed. 2240 */ 2241 kmem_free((vm_offset_t)pmap->pm_pt1, NB_IN_PT1); 2242 pmap->pm_pt1 = NULL; 2243 return (0); 2244 } 2245 /* 2246 * QQQ: Each L2 page table page vm_page_t has pindex set to 2247 * pte1 index of virtual address mapped by this page. 2248 * It's not valid for non kernel PT2TABs themselves. 2249 * The pindex of these pages can not be altered because 2250 * of the way how they are allocated now. However, it 2251 * should not be a problem. 2252 */ 2253 } 2254 2255 mtx_lock_spin(&allpmaps_lock); 2256 /* 2257 * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), 2258 * kernel_vm_end_new is used here instead of kernel_vm_end. 2259 */ 2260 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, 2261 kernel_vm_end_new - 1); 2262 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 2263 0xFFFFFFFF); 2264 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, 2265 kernel_vm_end_new - 1); 2266 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 2267 0xFFFFFFFF); 2268 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 2269 mtx_unlock_spin(&allpmaps_lock); 2270 2271 /* 2272 * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. 2273 * I.e. self reference mapping. The PT2TAB is private, however mapped 2274 * into shared PT2MAP space, so the mapping should be not global. 
2275 */ 2276 pt2tab_pa = vtophys(pmap->pm_pt2tab); 2277 pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); 2278 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 2279 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 2280 } 2281 2282 /* Insert PT2MAP PT2s into pmap PT1. */ 2283 pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); 2284 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 2285 pte1_store(pte1p++, PTE1_LINK(pa)); 2286 } 2287 2288 /* 2289 * Now synchronize new mapping which was made above. 2290 */ 2291 pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); 2292 pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); 2293 2294 CPU_ZERO(&pmap->pm_active); 2295 TAILQ_INIT(&pmap->pm_pvchunk); 2296 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2297 2298 return (1); 2299 } 2300 2301 #ifdef INVARIANTS 2302 static boolean_t 2303 pt2tab_user_is_empty(pt2_entry_t *tab) 2304 { 2305 u_int i, end; 2306 2307 end = pt2tab_index(VM_MAXUSER_ADDRESS); 2308 for (i = 0; i < end; i++) 2309 if (tab[i] != 0) return (FALSE); 2310 return (TRUE); 2311 } 2312 #endif 2313 /* 2314 * Release any resources held by the given physical map. 2315 * Called when a pmap initialized by pmap_pinit is being released. 2316 * Should only be called if the map contains no valid mappings. 2317 */ 2318 void 2319 pmap_release(pmap_t pmap) 2320 { 2321 #ifdef INVARIANTS 2322 vm_offset_t start, end; 2323 #endif 2324 KASSERT(pmap->pm_stats.resident_count == 0, 2325 ("%s: pmap resident count %ld != 0", __func__, 2326 pmap->pm_stats.resident_count)); 2327 KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), 2328 ("%s: has allocated user PT2(s)", __func__)); 2329 KASSERT(CPU_EMPTY(&pmap->pm_active), 2330 ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); 2331 2332 mtx_lock_spin(&allpmaps_lock); 2333 LIST_REMOVE(pmap, pm_list); 2334 mtx_unlock_spin(&allpmaps_lock); 2335 2336 #ifdef INVARIANTS 2337 start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); 2338 end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); 2339 bzero((char *)pmap->pm_pt1 + start, end - start); 2340 2341 start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); 2342 end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); 2343 bzero((char *)pmap->pm_pt2tab + start, end - start); 2344 #endif 2345 /* 2346 * We are leaving PT1 and PT2TAB allocated on released pmap, 2347 * so hopefully UMA vmspace_zone will always be inited with 2348 * UMA_ZONE_NOFREE flag. 2349 */ 2350 } 2351 2352 /********************************************************* 2353 * 2354 * L2 table pages and their pages management routines. 2355 * 2356 *********************************************************/ 2357 2358 /* 2359 * Virtual interface for L2 page table wire counting. 2360 * 2361 * Each L2 page table in a page has own counter which counts a number of 2362 * valid mappings in a table. Global page counter counts mappings in all 2363 * tables in a page plus a single itself mapping in PT2TAB. 2364 * 2365 * During a promotion we leave the associated L2 page table counter 2366 * untouched, so the table (strictly speaking a page which holds it) 2367 * is never freed if promoted. 2368 * 2369 * If a page m->ref_count == 1 then no valid mappings exist in any L2 page 2370 * table in the page and the page itself is only mapped in PT2TAB. 2371 */ 2372 2373 static __inline void 2374 pt2_wirecount_init(vm_page_t m) 2375 { 2376 u_int i; 2377 2378 /* 2379 * Note: A page m is allocated with VM_ALLOC_WIRED flag and 2380 * m->ref_count should be already set correctly. 
	 * So, there is no need to set it again herein.
	 */
	for (i = 0; i < NPT2_IN_PG; i++)
		m->md.pt2_wirecount[i] = 0;
}

static __inline void
pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx)
{

	/*
	 * Note: A just modified pte2 (i.e., already allocated) acquires
	 * one extra reference, which must be explicitly cleared. This
	 * influences the KASSERTs herein. All L2 page tables in a page
	 * always belong to the same pmap, so we allow only one extra
	 * reference for the page.
	 */
	KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1),
	    ("%s: PT2 is overflowing ...", __func__));
	KASSERT(m->ref_count <= (NPTE2_IN_PG + 1),
	    ("%s: PT2PG is overflowing ...", __func__));

	m->ref_count++;
	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++;
}

static __inline void
pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx)
{

	KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0,
	    ("%s: PT2 is underflowing ...", __func__));
	KASSERT(m->ref_count > 1,
	    ("%s: PT2PG is underflowing ...", __func__));

	m->ref_count--;
	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--;
}

static __inline void
pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count)
{

	KASSERT(count <= NPTE2_IN_PT2,
	    ("%s: invalid count %u", __func__, count));
	KASSERT(m->ref_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK],
	    ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->ref_count,
	    m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]));

	m->ref_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK];
	m->ref_count += count;
	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count;

	KASSERT(m->ref_count <= (NPTE2_IN_PG + 1),
	    ("%s: PT2PG is overflowed (%u) ...", __func__, m->ref_count));
}

static __inline uint32_t
pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx)
{

	return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]);
}

static __inline boolean_t
pt2_is_empty(vm_page_t m, vm_offset_t va)
{

	return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0);
}

static __inline boolean_t
pt2_is_full(vm_page_t m, vm_offset_t va)
{

	return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] ==
	    NPTE2_IN_PT2);
}

static __inline boolean_t
pt2pg_is_empty(vm_page_t m)
{

	return (m->ref_count == 1);
}

/*
 * This routine is called if the L2 page table
 * is not mapped correctly.
 */
static vm_page_t
_pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags)
{
	uint32_t pte1_idx;
	pt1_entry_t *pte1p;
	pt2_entry_t pte2;
	vm_page_t m;
	vm_paddr_t pt2pg_pa, pt2_pa;

	pte1_idx = pte1_index(va);
	pte1p = pmap->pm_pt1 + pte1_idx;

	KASSERT(pte1_load(pte1p) == 0,
	    ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx,
	    pte1_load(pte1p)));

	pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va));
	if (!pte2_is_valid(pte2)) {
		/*
		 * Install a new PT2s page into the pmap PT2TAB.
2491 */ 2492 m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK, 2493 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2494 if (m == NULL) { 2495 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 2496 PMAP_UNLOCK(pmap); 2497 rw_wunlock(&pvh_global_lock); 2498 vm_wait(NULL); 2499 rw_wlock(&pvh_global_lock); 2500 PMAP_LOCK(pmap); 2501 } 2502 2503 /* 2504 * Indicate the need to retry. While waiting, 2505 * the L2 page table page may have been allocated. 2506 */ 2507 return (NULL); 2508 } 2509 pmap->pm_stats.resident_count++; 2510 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 2511 } else { 2512 pt2pg_pa = pte2_pa(pte2); 2513 m = PHYS_TO_VM_PAGE(pt2pg_pa); 2514 } 2515 2516 pt2_wirecount_inc(m, pte1_idx); 2517 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 2518 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 2519 2520 return (m); 2521 } 2522 2523 static vm_page_t 2524 pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2525 { 2526 u_int pte1_idx; 2527 pt1_entry_t *pte1p, pte1; 2528 vm_page_t m; 2529 2530 pte1_idx = pte1_index(va); 2531 retry: 2532 pte1p = pmap->pm_pt1 + pte1_idx; 2533 pte1 = pte1_load(pte1p); 2534 2535 /* 2536 * This supports switching from a 1MB page to a 2537 * normal 4K page. 2538 */ 2539 if (pte1_is_section(pte1)) { 2540 (void)pmap_demote_pte1(pmap, pte1p, va); 2541 /* 2542 * Reload pte1 after demotion. 2543 * 2544 * Note: Demotion can even fail as either PT2 is not find for 2545 * the virtual address or PT2PG can not be allocated. 2546 */ 2547 pte1 = pte1_load(pte1p); 2548 } 2549 2550 /* 2551 * If the L2 page table page is mapped, we just increment the 2552 * hold count, and activate it. 2553 */ 2554 if (pte1_is_link(pte1)) { 2555 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2556 pt2_wirecount_inc(m, pte1_idx); 2557 } else { 2558 /* 2559 * Here if the PT2 isn't mapped, or if it has 2560 * been deallocated. 2561 */ 2562 m = _pmap_allocpte2(pmap, va, flags); 2563 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2564 goto retry; 2565 } 2566 2567 return (m); 2568 } 2569 2570 /* 2571 * Schedule the specified unused L2 page table page to be freed. Specifically, 2572 * add the page to the specified list of pages that will be released to the 2573 * physical memory manager after the TLB has been updated. 2574 */ 2575 static __inline void 2576 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) 2577 { 2578 2579 /* 2580 * Put page on a list so that it is released after 2581 * *ALL* TLB shootdown is done 2582 */ 2583 #ifdef PMAP_DEBUG 2584 pmap_zero_page_check(m); 2585 #endif 2586 m->flags |= PG_ZERO; 2587 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2588 } 2589 2590 /* 2591 * Unwire L2 page tables page. 2592 */ 2593 static void 2594 pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) 2595 { 2596 pt1_entry_t *pte1p, opte1 __unused; 2597 pt2_entry_t *pte2p; 2598 uint32_t i; 2599 2600 KASSERT(pt2pg_is_empty(m), 2601 ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); 2602 2603 /* 2604 * Unmap all L2 page tables in the page from L1 page table. 2605 * 2606 * QQQ: Individual L2 page tables (except the last one) can be unmapped 2607 * earlier. However, we are doing that this way. 
2608 */ 2609 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 2610 ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); 2611 pte1p = pmap->pm_pt1 + m->pindex; 2612 for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { 2613 KASSERT(m->md.pt2_wirecount[i] == 0, 2614 ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); 2615 opte1 = pte1_load(pte1p); 2616 if (pte1_is_link(opte1)) { 2617 pte1_clear(pte1p); 2618 /* 2619 * Flush intermediate TLB cache. 2620 */ 2621 pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); 2622 } 2623 #ifdef INVARIANTS 2624 else 2625 KASSERT((opte1 == 0) || pte1_is_section(opte1), 2626 ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, 2627 pmap, va, opte1, i)); 2628 #endif 2629 } 2630 2631 /* 2632 * Unmap the page from PT2TAB. 2633 */ 2634 pte2p = pmap_pt2tab_entry(pmap, va); 2635 (void)pt2tab_load_clear(pte2p); 2636 pmap_tlb_flush(pmap, pt2map_pt2pg(va)); 2637 2638 m->ref_count = 0; 2639 pmap->pm_stats.resident_count--; 2640 2641 /* 2642 * This barrier is so that the ordinary store unmapping 2643 * the L2 page table page is globally performed before TLB shoot- 2644 * down is begun. 2645 */ 2646 wmb(); 2647 vm_wire_sub(1); 2648 } 2649 2650 /* 2651 * Decrements a L2 page table page's wire count, which is used to record the 2652 * number of valid page table entries within the page. If the wire count 2653 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2654 * page table page was unmapped and FALSE otherwise. 2655 */ 2656 static __inline boolean_t 2657 pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2658 { 2659 pt2_wirecount_dec(m, pte1_index(va)); 2660 if (pt2pg_is_empty(m)) { 2661 /* 2662 * QQQ: Wire count is zero, so whole page should be zero and 2663 * we can set PG_ZERO flag to it. 2664 * Note that when promotion is enabled, it takes some 2665 * more efforts. See pmap_unwire_pt2_all() below. 2666 */ 2667 pmap_unwire_pt2pg(pmap, va, m); 2668 pmap_add_delayed_free_list(m, free); 2669 return (TRUE); 2670 } else 2671 return (FALSE); 2672 } 2673 2674 /* 2675 * Drop a L2 page table page's wire count at once, which is used to record 2676 * the number of valid L2 page table entries within the page. If the wire 2677 * count drops to zero, then the L2 page table page is unmapped. 2678 */ 2679 static __inline void 2680 pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, 2681 struct spglist *free) 2682 { 2683 u_int pte1_idx = pte1_index(va); 2684 2685 KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), 2686 ("%s: PT2 page's pindex is wrong", __func__)); 2687 KASSERT(m->ref_count > pt2_wirecount_get(m, pte1_idx), 2688 ("%s: bad pt2 wire count %u > %u", __func__, m->ref_count, 2689 pt2_wirecount_get(m, pte1_idx))); 2690 2691 /* 2692 * It's possible that the L2 page table was never used. 2693 * It happened in case that a section was created without promotion. 2694 */ 2695 if (pt2_is_full(m, va)) { 2696 pt2_wirecount_set(m, pte1_idx, 0); 2697 2698 /* 2699 * QQQ: We clear L2 page table now, so when L2 page table page 2700 * is going to be freed, we can set it PG_ZERO flag ... 2701 * This function is called only on section mappings, so 2702 * hopefully it's not to big overload. 2703 * 2704 * XXX: If pmap is current, existing PT2MAP mapping could be 2705 * used for zeroing. 
2706 */ 2707 pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); 2708 } 2709 #ifdef INVARIANTS 2710 else 2711 KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", 2712 __func__, pt2_wirecount_get(m, pte1_idx))); 2713 #endif 2714 if (pt2pg_is_empty(m)) { 2715 pmap_unwire_pt2pg(pmap, va, m); 2716 pmap_add_delayed_free_list(m, free); 2717 } 2718 } 2719 2720 /* 2721 * After removing a L2 page table entry, this routine is used to 2722 * conditionally free the page, and manage the hold/wire counts. 2723 */ 2724 static boolean_t 2725 pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) 2726 { 2727 pt1_entry_t pte1; 2728 vm_page_t mpte; 2729 2730 if (va >= VM_MAXUSER_ADDRESS) 2731 return (FALSE); 2732 pte1 = pte1_load(pmap_pte1(pmap, va)); 2733 mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2734 return (pmap_unwire_pt2(pmap, va, mpte, free)); 2735 } 2736 2737 /************************************* 2738 * 2739 * Page management routines. 2740 * 2741 *************************************/ 2742 2743 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2744 CTASSERT(_NPCM == 11); 2745 CTASSERT(_NPCPV == 336); 2746 2747 static __inline struct pv_chunk * 2748 pv_to_chunk(pv_entry_t pv) 2749 { 2750 2751 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2752 } 2753 2754 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2755 2756 #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2757 #define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2758 2759 static const uint32_t pc_freemask[_NPCM] = { 2760 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2761 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2762 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2763 PC_FREE0_9, PC_FREE10 2764 }; 2765 2766 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2767 "Current number of pv entries"); 2768 2769 #ifdef PV_STATS 2770 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2771 2772 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2773 "Current number of pv entry chunks"); 2774 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2775 "Current number of pv entry chunks allocated"); 2776 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2777 "Current number of pv entry chunks frees"); 2778 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 2779 0, "Number of times tried to get a chunk page but failed."); 2780 2781 static long pv_entry_frees, pv_entry_allocs; 2782 static int pv_entry_spare; 2783 2784 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2785 "Current number of pv entry frees"); 2786 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 2787 0, "Current number of pv entry allocs"); 2788 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2789 "Current number of spare pv entries"); 2790 #endif 2791 2792 /* 2793 * Is given page managed? 
2794 */ 2795 static __inline bool 2796 is_managed(vm_paddr_t pa) 2797 { 2798 vm_page_t m; 2799 2800 m = PHYS_TO_VM_PAGE(pa); 2801 if (m == NULL) 2802 return (false); 2803 return ((m->oflags & VPO_UNMANAGED) == 0); 2804 } 2805 2806 static __inline bool 2807 pte1_is_managed(pt1_entry_t pte1) 2808 { 2809 2810 return (is_managed(pte1_pa(pte1))); 2811 } 2812 2813 static __inline bool 2814 pte2_is_managed(pt2_entry_t pte2) 2815 { 2816 2817 return (is_managed(pte2_pa(pte2))); 2818 } 2819 2820 /* 2821 * We are in a serious low memory condition. Resort to 2822 * drastic measures to free some pages so we can allocate 2823 * another pv entry chunk. 2824 */ 2825 static vm_page_t 2826 pmap_pv_reclaim(pmap_t locked_pmap) 2827 { 2828 struct pch newtail; 2829 struct pv_chunk *pc; 2830 struct md_page *pvh; 2831 pt1_entry_t *pte1p; 2832 pmap_t pmap; 2833 pt2_entry_t *pte2p, tpte2; 2834 pv_entry_t pv; 2835 vm_offset_t va; 2836 vm_page_t m, m_pc; 2837 struct spglist free; 2838 uint32_t inuse; 2839 int bit, field, freed; 2840 2841 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2842 pmap = NULL; 2843 m_pc = NULL; 2844 SLIST_INIT(&free); 2845 TAILQ_INIT(&newtail); 2846 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2847 SLIST_EMPTY(&free))) { 2848 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2849 if (pmap != pc->pc_pmap) { 2850 if (pmap != NULL) { 2851 if (pmap != locked_pmap) 2852 PMAP_UNLOCK(pmap); 2853 } 2854 pmap = pc->pc_pmap; 2855 /* Avoid deadlock and lock recursion. */ 2856 if (pmap > locked_pmap) 2857 PMAP_LOCK(pmap); 2858 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2859 pmap = NULL; 2860 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2861 continue; 2862 } 2863 } 2864 2865 /* 2866 * Destroy every non-wired, 4 KB page mapping in the chunk. 2867 */ 2868 freed = 0; 2869 for (field = 0; field < _NPCM; field++) { 2870 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2871 inuse != 0; inuse &= ~(1UL << bit)) { 2872 bit = ffs(inuse) - 1; 2873 pv = &pc->pc_pventry[field * 32 + bit]; 2874 va = pv->pv_va; 2875 pte1p = pmap_pte1(pmap, va); 2876 if (pte1_is_section(pte1_load(pte1p))) 2877 continue; 2878 pte2p = pmap_pte2(pmap, va); 2879 tpte2 = pte2_load(pte2p); 2880 if ((tpte2 & PTE2_W) == 0) 2881 tpte2 = pte2_load_clear(pte2p); 2882 pmap_pte2_release(pte2p); 2883 if ((tpte2 & PTE2_W) != 0) 2884 continue; 2885 KASSERT(tpte2 != 0, 2886 ("pmap_pv_reclaim: pmap %p va %#x zero pte", 2887 pmap, va)); 2888 pmap_tlb_flush(pmap, va); 2889 m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); 2890 if (pte2_is_dirty(tpte2)) 2891 vm_page_dirty(m); 2892 if ((tpte2 & PTE2_A) != 0) 2893 vm_page_aflag_set(m, PGA_REFERENCED); 2894 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2895 if (TAILQ_EMPTY(&m->md.pv_list) && 2896 (m->flags & PG_FICTITIOUS) == 0) { 2897 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2898 if (TAILQ_EMPTY(&pvh->pv_list)) { 2899 vm_page_aflag_clear(m, 2900 PGA_WRITEABLE); 2901 } 2902 } 2903 pc->pc_map[field] |= 1UL << bit; 2904 pmap_unuse_pt2(pmap, va, &free); 2905 freed++; 2906 } 2907 } 2908 if (freed == 0) { 2909 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2910 continue; 2911 } 2912 /* Every freed mapping is for a 4 KB page. 
*/ 2913 pmap->pm_stats.resident_count -= freed; 2914 PV_STAT(pv_entry_frees += freed); 2915 PV_STAT(pv_entry_spare += freed); 2916 pv_entry_count -= freed; 2917 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2918 for (field = 0; field < _NPCM; field++) 2919 if (pc->pc_map[field] != pc_freemask[field]) { 2920 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2921 pc_list); 2922 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2923 2924 /* 2925 * One freed pv entry in locked_pmap is 2926 * sufficient. 2927 */ 2928 if (pmap == locked_pmap) 2929 goto out; 2930 break; 2931 } 2932 if (field == _NPCM) { 2933 PV_STAT(pv_entry_spare -= _NPCPV); 2934 PV_STAT(pc_chunk_count--); 2935 PV_STAT(pc_chunk_frees++); 2936 /* Entire chunk is free; return it. */ 2937 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2938 pmap_qremove((vm_offset_t)pc, 1); 2939 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2940 break; 2941 } 2942 } 2943 out: 2944 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2945 if (pmap != NULL) { 2946 if (pmap != locked_pmap) 2947 PMAP_UNLOCK(pmap); 2948 } 2949 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2950 m_pc = SLIST_FIRST(&free); 2951 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2952 /* Recycle a freed page table page. */ 2953 m_pc->ref_count = 1; 2954 vm_wire_add(1); 2955 } 2956 vm_page_free_pages_toq(&free, false); 2957 return (m_pc); 2958 } 2959 2960 static void 2961 free_pv_chunk(struct pv_chunk *pc) 2962 { 2963 vm_page_t m; 2964 2965 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2966 PV_STAT(pv_entry_spare -= _NPCPV); 2967 PV_STAT(pc_chunk_count--); 2968 PV_STAT(pc_chunk_frees++); 2969 /* entire chunk is free, return it */ 2970 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2971 pmap_qremove((vm_offset_t)pc, 1); 2972 vm_page_unwire_noq(m); 2973 vm_page_free(m); 2974 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2975 } 2976 2977 /* 2978 * Free the pv_entry back to the free list. 2979 */ 2980 static void 2981 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2982 { 2983 struct pv_chunk *pc; 2984 int idx, field, bit; 2985 2986 rw_assert(&pvh_global_lock, RA_WLOCKED); 2987 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2988 PV_STAT(pv_entry_frees++); 2989 PV_STAT(pv_entry_spare++); 2990 pv_entry_count--; 2991 pc = pv_to_chunk(pv); 2992 idx = pv - &pc->pc_pventry[0]; 2993 field = idx / 32; 2994 bit = idx % 32; 2995 pc->pc_map[field] |= 1ul << bit; 2996 for (idx = 0; idx < _NPCM; idx++) 2997 if (pc->pc_map[idx] != pc_freemask[idx]) { 2998 /* 2999 * 98% of the time, pc is already at the head of the 3000 * list. If it isn't already, move it to the head. 3001 */ 3002 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 3003 pc)) { 3004 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3005 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 3006 pc_list); 3007 } 3008 return; 3009 } 3010 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3011 free_pv_chunk(pc); 3012 } 3013 3014 /* 3015 * Get a new pv_entry, allocating a block from the system 3016 * when needed. 
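 *
 * Each chunk tracks its free slots in the pc_map[] bitmap, one bit per
 * pv entry. With _NPCPV == 336 == 10 * 32 + 16, the first ten 32-bit
 * fields are fully used and the last one has only 16 valid bits, which
 * is where the PC_FREE0_9 and PC_FREE10 masks above come from. Mapping
 * an entry index to its bit, as done here and in free_pv_entry() above
 * (a sketch of the existing logic, not new code):
 *
 *	field = idx / 32;
 *	bit = idx % 32;
 *	is_free = (pc->pc_map[field] & (1ul << bit)) != 0;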
3017 */ 3018 static pv_entry_t 3019 get_pv_entry(pmap_t pmap, boolean_t try) 3020 { 3021 static const struct timeval printinterval = { 60, 0 }; 3022 static struct timeval lastprint; 3023 int bit, field; 3024 pv_entry_t pv; 3025 struct pv_chunk *pc; 3026 vm_page_t m; 3027 3028 rw_assert(&pvh_global_lock, RA_WLOCKED); 3029 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3030 PV_STAT(pv_entry_allocs++); 3031 pv_entry_count++; 3032 if (pv_entry_count > pv_entry_high_water) 3033 if (ratecheck(&lastprint, &printinterval)) 3034 printf("Approaching the limit on PV entries, consider " 3035 "increasing either the vm.pmap.shpgperproc or the " 3036 "vm.pmap.pv_entries tunable.\n"); 3037 retry: 3038 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3039 if (pc != NULL) { 3040 for (field = 0; field < _NPCM; field++) { 3041 if (pc->pc_map[field]) { 3042 bit = ffs(pc->pc_map[field]) - 1; 3043 break; 3044 } 3045 } 3046 if (field < _NPCM) { 3047 pv = &pc->pc_pventry[field * 32 + bit]; 3048 pc->pc_map[field] &= ~(1ul << bit); 3049 /* If this was the last item, move it to tail */ 3050 for (field = 0; field < _NPCM; field++) 3051 if (pc->pc_map[field] != 0) { 3052 PV_STAT(pv_entry_spare--); 3053 return (pv); /* not full, return */ 3054 } 3055 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3056 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3057 PV_STAT(pv_entry_spare--); 3058 return (pv); 3059 } 3060 } 3061 /* 3062 * Access to the pte2list "pv_vafree" is synchronized by the pvh 3063 * global lock. If "pv_vafree" is currently non-empty, it will 3064 * remain non-empty until pmap_pte2list_alloc() completes. 3065 */ 3066 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 3067 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3068 if (try) { 3069 pv_entry_count--; 3070 PV_STAT(pc_chunk_tryfail++); 3071 return (NULL); 3072 } 3073 m = pmap_pv_reclaim(pmap); 3074 if (m == NULL) 3075 goto retry; 3076 } 3077 PV_STAT(pc_chunk_count++); 3078 PV_STAT(pc_chunk_allocs++); 3079 pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); 3080 pmap_qenter((vm_offset_t)pc, &m, 1); 3081 pc->pc_pmap = pmap; 3082 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 3083 for (field = 1; field < _NPCM; field++) 3084 pc->pc_map[field] = pc_freemask[field]; 3085 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3086 pv = &pc->pc_pventry[0]; 3087 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3088 PV_STAT(pv_entry_spare += _NPCPV - 1); 3089 return (pv); 3090 } 3091 3092 /* 3093 * Create a pv entry for page at pa for 3094 * (pmap, va). 
3095 */ 3096 static void 3097 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3098 { 3099 pv_entry_t pv; 3100 3101 rw_assert(&pvh_global_lock, RA_WLOCKED); 3102 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3103 pv = get_pv_entry(pmap, FALSE); 3104 pv->pv_va = va; 3105 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3106 } 3107 3108 static __inline pv_entry_t 3109 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3110 { 3111 pv_entry_t pv; 3112 3113 rw_assert(&pvh_global_lock, RA_WLOCKED); 3114 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3115 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3116 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3117 break; 3118 } 3119 } 3120 return (pv); 3121 } 3122 3123 static void 3124 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3125 { 3126 pv_entry_t pv; 3127 3128 pv = pmap_pvh_remove(pvh, pmap, va); 3129 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3130 free_pv_entry(pmap, pv); 3131 } 3132 3133 static void 3134 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 3135 { 3136 struct md_page *pvh; 3137 3138 rw_assert(&pvh_global_lock, RA_WLOCKED); 3139 pmap_pvh_free(&m->md, pmap, va); 3140 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 3141 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3142 if (TAILQ_EMPTY(&pvh->pv_list)) 3143 vm_page_aflag_clear(m, PGA_WRITEABLE); 3144 } 3145 } 3146 3147 static void 3148 pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3149 { 3150 struct md_page *pvh; 3151 pv_entry_t pv; 3152 vm_offset_t va_last; 3153 vm_page_t m; 3154 3155 rw_assert(&pvh_global_lock, RA_WLOCKED); 3156 KASSERT((pa & PTE1_OFFSET) == 0, 3157 ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); 3158 3159 /* 3160 * Transfer the 1mpage's pv entry for this mapping to the first 3161 * page's pv list. 3162 */ 3163 pvh = pa_to_pvh(pa); 3164 va = pte1_trunc(va); 3165 pv = pmap_pvh_remove(pvh, pmap, va); 3166 KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); 3167 m = PHYS_TO_VM_PAGE(pa); 3168 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3169 /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ 3170 va_last = va + PTE1_SIZE - PAGE_SIZE; 3171 do { 3172 m++; 3173 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3174 ("pmap_pv_demote_pte1: page %p is not managed", m)); 3175 va += PAGE_SIZE; 3176 pmap_insert_entry(pmap, va, m); 3177 } while (va < va_last); 3178 } 3179 3180 #if VM_NRESERVLEVEL > 0 3181 static void 3182 pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3183 { 3184 struct md_page *pvh; 3185 pv_entry_t pv; 3186 vm_offset_t va_last; 3187 vm_page_t m; 3188 3189 rw_assert(&pvh_global_lock, RA_WLOCKED); 3190 KASSERT((pa & PTE1_OFFSET) == 0, 3191 ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); 3192 3193 /* 3194 * Transfer the first page's pv entry for this mapping to the 3195 * 1mpage's pv list. Aside from avoiding the cost of a call 3196 * to get_pv_entry(), a transfer avoids the possibility that 3197 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() 3198 * removes one of the mappings that is being promoted. 3199 */ 3200 m = PHYS_TO_VM_PAGE(pa); 3201 va = pte1_trunc(va); 3202 pv = pmap_pvh_remove(&m->md, pmap, va); 3203 KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); 3204 pvh = pa_to_pvh(pa); 3205 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3206 /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. 
*/ 3207 va_last = va + PTE1_SIZE - PAGE_SIZE; 3208 do { 3209 m++; 3210 va += PAGE_SIZE; 3211 pmap_pvh_free(&m->md, pmap, va); 3212 } while (va < va_last); 3213 } 3214 #endif 3215 3216 /* 3217 * Conditionally create a pv entry. 3218 */ 3219 static boolean_t 3220 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3221 { 3222 pv_entry_t pv; 3223 3224 rw_assert(&pvh_global_lock, RA_WLOCKED); 3225 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3226 if (pv_entry_count < pv_entry_high_water && 3227 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 3228 pv->pv_va = va; 3229 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3230 return (TRUE); 3231 } else 3232 return (FALSE); 3233 } 3234 3235 /* 3236 * Create the pv entries for each of the pages within a section. 3237 */ 3238 static bool 3239 pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags) 3240 { 3241 struct md_page *pvh; 3242 pv_entry_t pv; 3243 bool noreclaim; 3244 3245 rw_assert(&pvh_global_lock, RA_WLOCKED); 3246 noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; 3247 if ((noreclaim && pv_entry_count >= pv_entry_high_water) || 3248 (pv = get_pv_entry(pmap, noreclaim)) == NULL) 3249 return (false); 3250 pv->pv_va = va; 3251 pvh = pa_to_pvh(pte1_pa(pte1)); 3252 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3253 return (true); 3254 } 3255 3256 static inline void 3257 pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) 3258 { 3259 3260 /* Kill all the small mappings or the big one only. */ 3261 if (pte1_is_section(npte1)) 3262 pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); 3263 else 3264 pmap_tlb_flush(pmap, pte1_trunc(va)); 3265 } 3266 3267 /* 3268 * Update kernel pte1 on all pmaps. 3269 * 3270 * The following function is called only on one cpu with disabled interrupts. 3271 * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way 3272 * nobody can invoke explicit hardware table walk during the update of pte1. 3273 * Unsolicited hardware table walk can still happen, invoked by speculative 3274 * data or instruction prefetch or even by speculative hardware table walk. 3275 * 3276 * The break-before-make approach should be implemented here. However, it's 3277 * not so easy to do that for kernel mappings as it would be unhappy to unmap 3278 * itself unexpectedly but voluntarily. 3279 */ 3280 static void 3281 pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) 3282 { 3283 pmap_t pmap; 3284 pt1_entry_t *pte1p; 3285 3286 /* 3287 * Get current pmap. Interrupts should be disabled here 3288 * so PCPU_GET() is done atomically. 3289 */ 3290 pmap = PCPU_GET(curpmap); 3291 if (pmap == NULL) 3292 pmap = kernel_pmap; 3293 3294 /* 3295 * (1) Change pte1 on current pmap. 3296 * (2) Flush all obsolete TLB entries on current CPU. 3297 * (3) Change pte1 on all pmaps. 3298 * (4) Flush all obsolete TLB entries on all CPUs in SMP case. 3299 */ 3300 3301 pte1p = pmap_pte1(pmap, va); 3302 pte1_store(pte1p, npte1); 3303 3304 /* Kill all the small mappings or the big one only. */ 3305 if (pte1_is_section(npte1)) { 3306 pmap_pte1_kern_promotions++; 3307 tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); 3308 } else { 3309 pmap_pte1_kern_demotions++; 3310 tlb_flush_local(pte1_trunc(va)); 3311 } 3312 3313 /* 3314 * In SMP case, this function is called when all cpus are at smp 3315 * rendezvous, so there is no need to use 'allpmaps_lock' lock here. 3316 * In UP case, the function is called with this lock locked. 
3317 */ 3318 LIST_FOREACH(pmap, &allpmaps, pm_list) { 3319 pte1p = pmap_pte1(pmap, va); 3320 pte1_store(pte1p, npte1); 3321 } 3322 3323 #ifdef SMP 3324 /* Kill all the small mappings or the big one only. */ 3325 if (pte1_is_section(npte1)) 3326 tlb_flush_range(pte1_trunc(va), PTE1_SIZE); 3327 else 3328 tlb_flush(pte1_trunc(va)); 3329 #endif 3330 } 3331 3332 #ifdef SMP 3333 struct pte1_action { 3334 vm_offset_t va; 3335 pt1_entry_t npte1; 3336 u_int update; /* CPU that updates the PTE1 */ 3337 }; 3338 3339 static void 3340 pmap_update_pte1_action(void *arg) 3341 { 3342 struct pte1_action *act = arg; 3343 3344 if (act->update == PCPU_GET(cpuid)) 3345 pmap_update_pte1_kernel(act->va, act->npte1); 3346 } 3347 3348 /* 3349 * Change pte1 on current pmap. 3350 * Note that kernel pte1 must be changed on all pmaps. 3351 * 3352 * According to the architecture reference manual published by ARM, 3353 * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. 3354 * According to this manual, UNPREDICTABLE behaviours must never happen in 3355 * a viable system. In contrast, on x86 processors, it is not specified which 3356 * TLB entry mapping the virtual address will be used, but the MMU doesn't 3357 * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone 3358 * Black). 3359 * 3360 * It's a problem when either promotion or demotion is being done. The pte1 3361 * update and appropriate TLB flush must be done atomically in general. 3362 */ 3363 static void 3364 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3365 pt1_entry_t npte1) 3366 { 3367 3368 if (pmap == kernel_pmap) { 3369 struct pte1_action act; 3370 3371 sched_pin(); 3372 act.va = va; 3373 act.npte1 = npte1; 3374 act.update = PCPU_GET(cpuid); 3375 smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier, 3376 pmap_update_pte1_action, NULL, &act); 3377 sched_unpin(); 3378 } else { 3379 register_t cspr; 3380 3381 /* 3382 * Use break-before-make approach for changing userland 3383 * mappings. It can cause L1 translation aborts on other 3384 * cores in SMP case. So, special treatment is implemented 3385 * in pmap_fault(). To reduce the likelihood that another core 3386 * will be affected by the broken mapping, disable interrupts 3387 * until the mapping change is completed. 3388 */ 3389 cspr = disable_interrupts(PSR_I | PSR_F); 3390 pte1_clear(pte1p); 3391 pmap_tlb_flush_pte1(pmap, va, npte1); 3392 pte1_store(pte1p, npte1); 3393 restore_interrupts(cspr); 3394 } 3395 } 3396 #else 3397 static void 3398 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3399 pt1_entry_t npte1) 3400 { 3401 3402 if (pmap == kernel_pmap) { 3403 mtx_lock_spin(&allpmaps_lock); 3404 pmap_update_pte1_kernel(va, npte1); 3405 mtx_unlock_spin(&allpmaps_lock); 3406 } else { 3407 register_t cspr; 3408 3409 /* 3410 * Use break-before-make approach for changing userland 3411 * mappings. It's absolutely safe in UP case when interrupts 3412 * are disabled. 3413 */ 3414 cspr = disable_interrupts(PSR_I | PSR_F); 3415 pte1_clear(pte1p); 3416 pmap_tlb_flush_pte1(pmap, va, npte1); 3417 pte1_store(pte1p, npte1); 3418 restore_interrupts(cspr); 3419 } 3420 } 3421 #endif 3422 3423 #if VM_NRESERVLEVEL > 0 3424 /* 3425 * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are 3426 * within a single page table page (PT2) to a single 1MB page mapping. 
3427 * For promotion to occur, two conditions must be met: (1) the 4KB page 3428 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3429 * mappings must have identical characteristics. 3430 * 3431 * Managed (PG_MANAGED) mappings within the kernel address space are not 3432 * promoted. The reason is that kernel PTE1s are replicated in each pmap but 3433 * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only 3434 * read the PTE1 from the kernel pmap. 3435 */ 3436 static void 3437 pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3438 { 3439 pt1_entry_t npte1; 3440 pt2_entry_t *fpte2p, fpte2, fpte2_fav; 3441 pt2_entry_t *pte2p, pte2; 3442 vm_offset_t pteva __unused; 3443 vm_page_t m __unused; 3444 3445 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3446 pmap, va, pte1_load(pte1p), pte1p)); 3447 3448 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3449 3450 /* 3451 * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is 3452 * either invalid, unused, or does not map the first 4KB physical page 3453 * within a 1MB page. 3454 */ 3455 fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); 3456 fpte2 = pte2_load(fpte2p); 3457 if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != 3458 (PTE2_A | PTE2_V)) { 3459 pmap_pte1_p_failures++; 3460 CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", 3461 __func__, va, pmap); 3462 return; 3463 } 3464 if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { 3465 pmap_pte1_p_failures++; 3466 CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", 3467 __func__, va, pmap); 3468 return; 3469 } 3470 if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3471 /* 3472 * When page is not modified, PTE2_RO can be set without 3473 * a TLB invalidation. 3474 */ 3475 fpte2 |= PTE2_RO; 3476 pte2_store(fpte2p, fpte2); 3477 } 3478 3479 /* 3480 * Examine each of the other PTE2s in the specified PT2. Abort if this 3481 * PTE2 maps an unexpected 4KB physical page or does not have identical 3482 * characteristics to the first PTE2. 3483 */ 3484 fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); 3485 fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ 3486 for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { 3487 pte2 = pte2_load(pte2p); 3488 if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { 3489 pmap_pte1_p_failures++; 3490 CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", 3491 __func__, va, pmap); 3492 return; 3493 } 3494 if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3495 /* 3496 * When page is not modified, PTE2_RO can be set 3497 * without a TLB invalidation. See note above. 3498 */ 3499 pte2 |= PTE2_RO; 3500 pte2_store(pte2p, pte2); 3501 pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & 3502 PTE2_FRAME); 3503 CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", 3504 __func__, pteva, pmap); 3505 } 3506 if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { 3507 pmap_pte1_p_failures++; 3508 CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", 3509 __func__, va, pmap); 3510 return; 3511 } 3512 3513 fpte2_fav -= PTE2_SIZE; 3514 } 3515 /* 3516 * The page table page in its current state will stay in PT2TAB 3517 * until the PTE1 mapping the section is demoted by pmap_demote_pte1() 3518 * or destroyed by pmap_remove_pte1(). 3519 * 3520 * Note that L2 page table size is not equal to PAGE_SIZE. 
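	 *
	 * Keeping the page table page allocated makes a later demotion
	 * cheap: pmap_demote_pte1() can relink it instead of allocating
	 * a new page, and its per-table wire count was deliberately left
	 * untouched (see the pt2_wirecount comments above), so it is not
	 * freed while the promoted mapping exists.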
3521 */ 3522 m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); 3523 KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], 3524 ("%s: PT2 page is out of range", __func__)); 3525 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 3526 ("%s: PT2 page's pindex is wrong", __func__)); 3527 3528 /* 3529 * Get pte1 from pte2 format. 3530 */ 3531 npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; 3532 3533 /* 3534 * Promote the pv entries. 3535 */ 3536 if (pte2_is_managed(fpte2)) 3537 pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); 3538 3539 /* 3540 * Promote the mappings. 3541 */ 3542 pmap_change_pte1(pmap, pte1p, va, npte1); 3543 3544 pmap_pte1_promotions++; 3545 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3546 __func__, va, pmap); 3547 3548 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3549 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3550 } 3551 #endif /* VM_NRESERVLEVEL > 0 */ 3552 3553 /* 3554 * Zero L2 page table page. 3555 */ 3556 static __inline void 3557 pmap_clear_pt2(pt2_entry_t *fpte2p) 3558 { 3559 pt2_entry_t *pte2p; 3560 3561 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) 3562 pte2_clear(pte2p); 3563 3564 } 3565 3566 /* 3567 * Removes a 1MB page mapping from the kernel pmap. 3568 */ 3569 static void 3570 pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3571 { 3572 vm_page_t m; 3573 uint32_t pte1_idx; 3574 pt2_entry_t *fpte2p; 3575 vm_paddr_t pt2_pa; 3576 3577 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3578 m = pmap_pt2_page(pmap, va); 3579 if (m == NULL) 3580 /* 3581 * QQQ: Is this function called only on promoted pte1? 3582 * We certainly do section mappings directly 3583 * (without promotion) in kernel !!! 3584 */ 3585 panic("%s: missing pt2 page", __func__); 3586 3587 pte1_idx = pte1_index(va); 3588 3589 /* 3590 * Initialize the L2 page table. 3591 */ 3592 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3593 pmap_clear_pt2(fpte2p); 3594 3595 /* 3596 * Remove the mapping. 3597 */ 3598 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); 3599 pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); 3600 3601 /* 3602 * QQQ: We do not need to invalidate PT2MAP mapping 3603 * as we did not change it. I.e. the L2 page table page 3604 * was and still is mapped the same way. 3605 */ 3606 } 3607 3608 /* 3609 * Do the things to unmap a section in a process 3610 */ 3611 static void 3612 pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 3613 struct spglist *free) 3614 { 3615 pt1_entry_t opte1; 3616 struct md_page *pvh; 3617 vm_offset_t eva, va; 3618 vm_page_t m; 3619 3620 PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, 3621 pte1_load(pte1p), pte1p)); 3622 3623 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3624 KASSERT((sva & PTE1_OFFSET) == 0, 3625 ("%s: sva is not 1mpage aligned", __func__)); 3626 3627 /* 3628 * Clear and invalidate the mapping. It should occupy one and only TLB 3629 * entry. So, pmap_tlb_flush() called with aligned address should be 3630 * sufficient. 
3631 */ 3632 opte1 = pte1_load_clear(pte1p); 3633 pmap_tlb_flush(pmap, sva); 3634 3635 if (pte1_is_wired(opte1)) 3636 pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; 3637 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 3638 if (pte1_is_managed(opte1)) { 3639 pvh = pa_to_pvh(pte1_pa(opte1)); 3640 pmap_pvh_free(pvh, pmap, sva); 3641 eva = sva + PTE1_SIZE; 3642 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 3643 va < eva; va += PAGE_SIZE, m++) { 3644 if (pte1_is_dirty(opte1)) 3645 vm_page_dirty(m); 3646 if (opte1 & PTE1_A) 3647 vm_page_aflag_set(m, PGA_REFERENCED); 3648 if (TAILQ_EMPTY(&m->md.pv_list) && 3649 TAILQ_EMPTY(&pvh->pv_list)) 3650 vm_page_aflag_clear(m, PGA_WRITEABLE); 3651 } 3652 } 3653 if (pmap == kernel_pmap) { 3654 /* 3655 * L2 page table(s) can't be removed from kernel map as 3656 * kernel counts on it (stuff around pmap_growkernel()). 3657 */ 3658 pmap_remove_kernel_pte1(pmap, pte1p, sva); 3659 } else { 3660 /* 3661 * Get associated L2 page table page. 3662 * It's possible that the page was never allocated. 3663 */ 3664 m = pmap_pt2_page(pmap, sva); 3665 if (m != NULL) 3666 pmap_unwire_pt2_all(pmap, sva, m, free); 3667 } 3668 } 3669 3670 /* 3671 * Fills L2 page table page with mappings to consecutive physical pages. 3672 */ 3673 static __inline void 3674 pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) 3675 { 3676 pt2_entry_t *pte2p; 3677 3678 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { 3679 pte2_store(pte2p, npte2); 3680 npte2 += PTE2_SIZE; 3681 } 3682 } 3683 3684 /* 3685 * Tries to demote a 1MB page mapping. If demotion fails, the 3686 * 1MB page mapping is invalidated. 3687 */ 3688 static boolean_t 3689 pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3690 { 3691 pt1_entry_t opte1, npte1; 3692 pt2_entry_t *fpte2p, npte2; 3693 vm_paddr_t pt2pg_pa, pt2_pa; 3694 vm_page_t m; 3695 struct spglist free; 3696 uint32_t pte1_idx, isnew = 0; 3697 3698 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3699 pmap, va, pte1_load(pte1p), pte1p)); 3700 3701 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3702 3703 opte1 = pte1_load(pte1p); 3704 KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); 3705 3706 if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { 3707 KASSERT(!pte1_is_wired(opte1), 3708 ("%s: PT2 page for a wired mapping is missing", __func__)); 3709 3710 /* 3711 * Invalidate the 1MB page mapping and return 3712 * "failure" if the mapping was never accessed or the 3713 * allocation of the new page table page fails. 3714 */ 3715 if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, 3716 pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | 3717 VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { 3718 SLIST_INIT(&free); 3719 pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); 3720 vm_page_free_pages_toq(&free, false); 3721 CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", 3722 __func__, va, pmap); 3723 return (FALSE); 3724 } 3725 if (va < VM_MAXUSER_ADDRESS) 3726 pmap->pm_stats.resident_count++; 3727 3728 isnew = 1; 3729 3730 /* 3731 * We init all L2 page tables in the page even if 3732 * we are going to change everything for one L2 page 3733 * table in a while. 3734 */ 3735 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 3736 } else { 3737 if (va < VM_MAXUSER_ADDRESS) { 3738 if (pt2_is_empty(m, va)) 3739 isnew = 1; /* Demoting section w/o promotion. 
*/ 3740 #ifdef INVARIANTS 3741 else 3742 KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" 3743 " count %u", __func__, 3744 pt2_wirecount_get(m, pte1_index(va)))); 3745 #endif 3746 } 3747 } 3748 3749 pt2pg_pa = VM_PAGE_TO_PHYS(m); 3750 pte1_idx = pte1_index(va); 3751 /* 3752 * If the pmap is current, then the PT2MAP can provide access to 3753 * the page table page (promoted L2 page tables are not unmapped). 3754 * Otherwise, temporarily map the L2 page table page (m) into 3755 * the kernel's address space at either PADDR1 or PADDR2. 3756 * 3757 * Note that L2 page table size is not equal to PAGE_SIZE. 3758 */ 3759 if (pmap_is_current(pmap)) 3760 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3761 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 3762 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 3763 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 3764 #ifdef SMP 3765 PMAP1cpu = PCPU_GET(cpuid); 3766 #endif 3767 tlb_flush_local((vm_offset_t)PADDR1); 3768 PMAP1changed++; 3769 } else 3770 #ifdef SMP 3771 if (PMAP1cpu != PCPU_GET(cpuid)) { 3772 PMAP1cpu = PCPU_GET(cpuid); 3773 tlb_flush_local((vm_offset_t)PADDR1); 3774 PMAP1changedcpu++; 3775 } else 3776 #endif 3777 PMAP1unchanged++; 3778 fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); 3779 } else { 3780 mtx_lock(&PMAP2mutex); 3781 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 3782 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 3783 tlb_flush((vm_offset_t)PADDR2); 3784 } 3785 fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); 3786 } 3787 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 3788 npte1 = PTE1_LINK(pt2_pa); 3789 3790 KASSERT((opte1 & PTE1_A) != 0, 3791 ("%s: opte1 is missing PTE1_A", __func__)); 3792 KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, 3793 ("%s: opte1 has PTE1_NM", __func__)); 3794 3795 /* 3796 * Get pte2 from pte1 format. 3797 */ 3798 npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; 3799 3800 /* 3801 * If the L2 page table page is new, initialize it. If the mapping 3802 * has changed attributes, update the page table entries. 3803 */ 3804 if (isnew != 0) { 3805 pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); 3806 pmap_fill_pt2(fpte2p, npte2); 3807 } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != 3808 (npte2 & PTE2_PROMOTE)) 3809 pmap_fill_pt2(fpte2p, npte2); 3810 3811 KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), 3812 ("%s: fpte2p and npte2 map different physical addresses", 3813 __func__)); 3814 3815 if (fpte2p == PADDR2) 3816 mtx_unlock(&PMAP2mutex); 3817 3818 /* 3819 * Demote the mapping. This pmap is locked. The old PTE1 has 3820 * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also 3821 * has not PTE1_NM set. Thus, there is no danger of a race with 3822 * another processor changing the setting of PTE1_A and/or PTE1_NM 3823 * between the read above and the store below. 3824 */ 3825 pmap_change_pte1(pmap, pte1p, va, npte1); 3826 3827 /* 3828 * Demote the pv entry. This depends on the earlier demotion 3829 * of the mapping. Specifically, the (re)creation of a per- 3830 * page pv entry might trigger the execution of pmap_pv_reclaim(), 3831 * which might reclaim a newly (re)created per-page pv entry 3832 * and destroy the associated mapping. In order to destroy 3833 * the mapping, the PTE1 must have already changed from mapping 3834 * the 1mpage to referencing the page table page. 
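	 * In other words, the ordering here is load-bearing: the
	 * pmap_change_pte1() call above must be complete before the
	 * pmap_pv_demote_pte1() call below runs.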
3835 */ 3836 if (pte1_is_managed(opte1)) 3837 pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); 3838 3839 pmap_pte1_demotions++; 3840 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3841 __func__, va, pmap); 3842 3843 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3844 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3845 return (TRUE); 3846 } 3847 3848 /* 3849 * Insert the given physical page (p) at 3850 * the specified virtual address (v) in the 3851 * target physical map with the protection requested. 3852 * 3853 * If specified, the page will be wired down, meaning 3854 * that the related pte can not be reclaimed. 3855 * 3856 * NB: This is the only routine which MAY NOT lazy-evaluate 3857 * or lose information. That is, this routine must actually 3858 * insert this page into the given map NOW. 3859 */ 3860 int 3861 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3862 u_int flags, int8_t psind) 3863 { 3864 pt1_entry_t *pte1p; 3865 pt2_entry_t *pte2p; 3866 pt2_entry_t npte2, opte2; 3867 pv_entry_t pv; 3868 vm_paddr_t opa, pa; 3869 vm_page_t mpte2, om; 3870 int rv; 3871 3872 va = trunc_page(va); 3873 KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); 3874 KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, 3875 ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, 3876 va)); 3877 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 3878 va >= kmi.clean_eva, 3879 ("%s: managed mapping within the clean submap", __func__)); 3880 if ((m->oflags & VPO_UNMANAGED) == 0) 3881 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3882 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 3883 ("%s: flags %u has reserved bits set", __func__, flags)); 3884 pa = VM_PAGE_TO_PHYS(m); 3885 npte2 = PTE2(pa, PTE2_A, vm_page_pte2_attr(m)); 3886 if ((flags & VM_PROT_WRITE) == 0) 3887 npte2 |= PTE2_NM; 3888 if ((prot & VM_PROT_WRITE) == 0) 3889 npte2 |= PTE2_RO; 3890 KASSERT((npte2 & (PTE2_NM | PTE2_RO)) != PTE2_RO, 3891 ("%s: flags includes VM_PROT_WRITE but prot doesn't", __func__)); 3892 if ((prot & VM_PROT_EXECUTE) == 0) 3893 npte2 |= PTE2_NX; 3894 if ((flags & PMAP_ENTER_WIRED) != 0) 3895 npte2 |= PTE2_W; 3896 if (va < VM_MAXUSER_ADDRESS) 3897 npte2 |= PTE2_U; 3898 if (pmap != kernel_pmap) 3899 npte2 |= PTE2_NG; 3900 3901 rw_wlock(&pvh_global_lock); 3902 PMAP_LOCK(pmap); 3903 sched_pin(); 3904 if (psind == 1) { 3905 /* Assert the required virtual and physical alignment. */ 3906 KASSERT((va & PTE1_OFFSET) == 0, 3907 ("%s: va unaligned", __func__)); 3908 KASSERT(m->psind > 0, ("%s: m->psind < psind", __func__)); 3909 rv = pmap_enter_pte1(pmap, va, PTE1_PA(pa) | ATTR_TO_L1(npte2) | 3910 PTE1_V, flags, m); 3911 goto out; 3912 } 3913 3914 /* 3915 * In the case that a page table page is not 3916 * resident, we are creating it here. 3917 */ 3918 if (va < VM_MAXUSER_ADDRESS) { 3919 mpte2 = pmap_allocpte2(pmap, va, flags); 3920 if (mpte2 == NULL) { 3921 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3922 ("pmap_allocpte2 failed with sleep allowed")); 3923 rv = KERN_RESOURCE_SHORTAGE; 3924 goto out; 3925 } 3926 } else 3927 mpte2 = NULL; 3928 pte1p = pmap_pte1(pmap, va); 3929 if (pte1_is_section(pte1_load(pte1p))) 3930 panic("%s: attempted on 1MB page", __func__); 3931 pte2p = pmap_pte2_quick(pmap, va); 3932 if (pte2p == NULL) 3933 panic("%s: invalid L1 page table entry va=%#x", __func__, va); 3934 3935 om = NULL; 3936 opte2 = pte2_load(pte2p); 3937 opa = pte2_pa(opte2); 3938 /* 3939 * Mapping has not changed, must be protection or wiring change. 
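	 * (That is, the old PTE2 is valid and maps the same physical
	 * address; only the permission or wired bits may differ.)
	 *
	 * For reference, a sketch of how npte2 was composed above for a
	 * typical managed, writeable user mapping; this assumes, as the
	 * code above suggests, that PTE2() merely merges the frame
	 * address, the access bit, and the memory attributes:
	 *
	 *	npte2 = PTE2(pa, PTE2_A, vm_page_pte2_attr(m));
	 *	npte2 |= PTE2_U | PTE2_NG;
	 *
	 * With VM_PROT_WRITE set in both flags and prot, neither PTE2_NM
	 * nor PTE2_RO is added, so the new mapping is immediately
	 * writeable.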
3940	 */
3941	if (pte2_is_valid(opte2) && (opa == pa)) {
3942		/*
3943		 * Wiring change, just update stats. We don't worry about
3944		 * wiring PT2 pages as they remain resident as long as there
3945		 * are valid mappings in them. Hence, if a user page is wired,
3946		 * the PT2 page will be also.
3947		 */
3948		if (pte2_is_wired(npte2) && !pte2_is_wired(opte2))
3949			pmap->pm_stats.wired_count++;
3950		else if (!pte2_is_wired(npte2) && pte2_is_wired(opte2))
3951			pmap->pm_stats.wired_count--;
3952
3953		/*
3954		 * Remove the extra pte2 reference.
3955		 */
3956		if (mpte2)
3957			pt2_wirecount_dec(mpte2, pte1_index(va));
3958		if ((m->oflags & VPO_UNMANAGED) == 0)
3959			om = m;
3960		goto validate;
3961	}
3962
3963	/*
3964	 * QQQ: We think that changing the physical address of a writeable
3965	 *      mapping is not safe. Well, maybe in kernel address space with
3966	 *      correct locking it could make sense. However, we have no idea
3967	 *      why anyone would do that in user address space. Are we wrong?
3968	 */
3969	KASSERT((opa == 0) || (opa == pa) ||
3970	    !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0),
3971	    ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!",
3972	    __func__, pmap, va, opte2, opa, pa, flags, prot));
3973
3974	pv = NULL;
3975
3976	/*
3977	 * Mapping has changed, invalidate old range and fall through to
3978	 * handle validating new mapping.
3979	 */
3980	if (opa) {
3981		if (pte2_is_wired(opte2))
3982			pmap->pm_stats.wired_count--;
3983		om = PHYS_TO_VM_PAGE(opa);
3984		if (om != NULL && (om->oflags & VPO_UNMANAGED) != 0)
3985			om = NULL;
3986		if (om != NULL)
3987			pv = pmap_pvh_remove(&om->md, pmap, va);
3988
3989		/*
3990		 * Remove the extra pte2 reference.
3991		 */
3992		if (mpte2 != NULL)
3993			pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT);
3994	} else
3995		pmap->pm_stats.resident_count++;
3996
3997	/*
3998	 * Enter on the PV list if part of our managed memory.
3999	 */
4000	if ((m->oflags & VPO_UNMANAGED) == 0) {
4001		if (pv == NULL) {
4002			pv = get_pv_entry(pmap, FALSE);
4003			pv->pv_va = va;
4004		}
4005		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4006	} else if (pv != NULL)
4007		free_pv_entry(pmap, pv);
4008
4009	/*
4010	 * Increment counters.
4011	 */
4012	if (pte2_is_wired(npte2))
4013		pmap->pm_stats.wired_count++;
4014
4015 validate:
4016	/*
4017	 * Now validate mapping with desired protection/wiring.
4018	 */
4019	if (prot & VM_PROT_WRITE) {
4020		if ((m->oflags & VPO_UNMANAGED) == 0)
4021			vm_page_aflag_set(m, PGA_WRITEABLE);
4022	}
4023
4024	/*
4025	 * If the mapping or permission bits are different, we need
4026	 * to update the pte2.
4027	 *
4028	 * QQQ: Think again and again what to do
4029	 *      if the mapping is going to be changed!
4030	 */
4031	if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) {
4032		/*
4033		 * Sync the icache if the mapping has exec permission and the
4034		 * attribute VM_MEMATTR_WB_WA is set. Do it now, before the
4035		 * mapping is stored and made valid for hardware table walks.
4036		 * If done later, there is a race for other threads of the
4037		 * current process in the lazy-loading case. Don't do it for
4038		 * kernel memory, which is mapped with exec permission even if
4039		 * the memory isn't going to hold executable code. The only time
4040		 * an icache sync is needed is after a kernel module is loaded
4041		 * and its relocation info is processed, in elf_cpu_load_file().
4042		 *
4043		 * QQQ: (1) Is there any better way or place to sync
4044		 *          the icache?
4045		 *      (2) For now, we do it on a page basis.
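		 *
		 * Note that the update below uses a break-before-make
		 * sequence when the old PTE2 is valid: the entry is cleared,
		 * the TLB entry is flushed, and only then is the new PTE2
		 * stored. This is the usual way on ARM to change an existing
		 * valid mapping without leaving a window for stale or
		 * conflicting TLB entries.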
4046		 */
4047		if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
4048		    m->md.pat_mode == VM_MEMATTR_WB_WA &&
4049		    (opa != pa || (opte2 & PTE2_NX)))
4050			cache_icache_sync_fresh(va, pa, PAGE_SIZE);
4051
4052		if (opte2 & PTE2_V) {
4053			/* Change mapping with break-before-make approach. */
4054			opte2 = pte2_load_clear(pte2p);
4055			pmap_tlb_flush(pmap, va);
4056			pte2_store(pte2p, npte2);
4057			if (om != NULL) {
4058				KASSERT((om->oflags & VPO_UNMANAGED) == 0,
4059				    ("%s: om %p unmanaged", __func__, om));
4060				if ((opte2 & PTE2_A) != 0)
4061					vm_page_aflag_set(om, PGA_REFERENCED);
4062				if (pte2_is_dirty(opte2))
4063					vm_page_dirty(om);
4064				if (TAILQ_EMPTY(&om->md.pv_list) &&
4065				    ((om->flags & PG_FICTITIOUS) != 0 ||
4066				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4067					vm_page_aflag_clear(om, PGA_WRITEABLE);
4068			}
4069		} else
4070			pte2_store(pte2p, npte2);
4071	}
4072 #if 0
4073	else {
4074		/*
4075		 * QQQ: In a time when both the access and not-modified bits
4076		 *      are emulated by software, this should not happen. Some
4077		 *      analysis is needed if this really happens. A missing
4078		 *      TLB flush somewhere could be the reason.
4079		 */
4080		panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap,
4081		    va, opte2, npte2);
4082	}
4083 #endif
4084
4085 #if VM_NRESERVLEVEL > 0
4086	/*
4087	 * If both the L2 page table page and the reservation are fully
4088	 * populated, then attempt promotion.
4089	 */
4090	if ((mpte2 == NULL || pt2_is_full(mpte2, va)) &&
4091	    sp_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
4092	    vm_reserv_level_iffullpop(m) == 0)
4093		pmap_promote_pte1(pmap, pte1p, va);
4094 #endif
4095
4096	rv = KERN_SUCCESS;
4097 out:
4098	sched_unpin();
4099	rw_wunlock(&pvh_global_lock);
4100	PMAP_UNLOCK(pmap);
4101	return (rv);
4102 }
4103
4104 /*
4105  * Do the things to unmap a page in a process.
4106  */
4107 static int
4108 pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va,
4109     struct spglist *free)
4110 {
4111	pt2_entry_t opte2;
4112	vm_page_t m;
4113
4114	rw_assert(&pvh_global_lock, RA_WLOCKED);
4115	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4116
4117	/* Clear and invalidate the mapping. */
4118	opte2 = pte2_load_clear(pte2p);
4119	pmap_tlb_flush(pmap, va);
4120
4121	KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x",
4122	    __func__, pmap, va, opte2));
4123
4124	if (opte2 & PTE2_W)
4125		pmap->pm_stats.wired_count -= 1;
4126	pmap->pm_stats.resident_count -= 1;
4127	if (pte2_is_managed(opte2)) {
4128		m = PHYS_TO_VM_PAGE(pte2_pa(opte2));
4129		if (pte2_is_dirty(opte2))
4130			vm_page_dirty(m);
4131		if (opte2 & PTE2_A)
4132			vm_page_aflag_set(m, PGA_REFERENCED);
4133		pmap_remove_entry(pmap, m, va);
4134	}
4135	return (pmap_unuse_pt2(pmap, va, free));
4136 }
4137
4138 /*
4139  * Remove a single page from a process address space.
4140  */
4141 static void
4142 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
4143 {
4144	pt2_entry_t *pte2p;
4145
4146	rw_assert(&pvh_global_lock, RA_WLOCKED);
4147	KASSERT(curthread->td_pinned > 0,
4148	    ("%s: curthread not pinned", __func__));
4149	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4150	if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL ||
4151	    !pte2_is_valid(pte2_load(pte2p)))
4152		return;
4153	pmap_remove_pte2(pmap, pte2p, va, free);
4154 }
4155
4156 /*
4157  * Remove the given range of addresses from the specified map.
4158  *
4159  * It is assumed that the start and end are properly
4160  * rounded to the page size.
4161 */ 4162 void 4163 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4164 { 4165 vm_offset_t nextva; 4166 pt1_entry_t *pte1p, pte1; 4167 pt2_entry_t *pte2p, pte2; 4168 struct spglist free; 4169 4170 /* 4171 * Perform an unsynchronized read. This is, however, safe. 4172 */ 4173 if (pmap->pm_stats.resident_count == 0) 4174 return; 4175 4176 SLIST_INIT(&free); 4177 4178 rw_wlock(&pvh_global_lock); 4179 sched_pin(); 4180 PMAP_LOCK(pmap); 4181 4182 /* 4183 * Special handling of removing one page. A very common 4184 * operation and easy to short circuit some code. 4185 */ 4186 if (sva + PAGE_SIZE == eva) { 4187 pte1 = pte1_load(pmap_pte1(pmap, sva)); 4188 if (pte1_is_link(pte1)) { 4189 pmap_remove_page(pmap, sva, &free); 4190 goto out; 4191 } 4192 } 4193 4194 for (; sva < eva; sva = nextva) { 4195 /* 4196 * Calculate address for next L2 page table. 4197 */ 4198 nextva = pte1_trunc(sva + PTE1_SIZE); 4199 if (nextva < sva) 4200 nextva = eva; 4201 if (pmap->pm_stats.resident_count == 0) 4202 break; 4203 4204 pte1p = pmap_pte1(pmap, sva); 4205 pte1 = pte1_load(pte1p); 4206 4207 /* 4208 * Weed out invalid mappings. Note: we assume that the L1 page 4209 * table is always allocated, and in kernel virtual. 4210 */ 4211 if (pte1 == 0) 4212 continue; 4213 4214 if (pte1_is_section(pte1)) { 4215 /* 4216 * Are we removing the entire large page? If not, 4217 * demote the mapping and fall through. 4218 */ 4219 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4220 pmap_remove_pte1(pmap, pte1p, sva, &free); 4221 continue; 4222 } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4223 /* The large page mapping was destroyed. */ 4224 continue; 4225 } 4226 #ifdef INVARIANTS 4227 else { 4228 /* Update pte1 after demotion. */ 4229 pte1 = pte1_load(pte1p); 4230 } 4231 #endif 4232 } 4233 4234 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4235 " is not link", __func__, pmap, sva, pte1, pte1p)); 4236 4237 /* 4238 * Limit our scan to either the end of the va represented 4239 * by the current L2 page table page, or to the end of the 4240 * range being removed. 4241 */ 4242 if (nextva > eva) 4243 nextva = eva; 4244 4245 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; 4246 pte2p++, sva += PAGE_SIZE) { 4247 pte2 = pte2_load(pte2p); 4248 if (!pte2_is_valid(pte2)) 4249 continue; 4250 if (pmap_remove_pte2(pmap, pte2p, sva, &free)) 4251 break; 4252 } 4253 } 4254 out: 4255 sched_unpin(); 4256 rw_wunlock(&pvh_global_lock); 4257 PMAP_UNLOCK(pmap); 4258 vm_page_free_pages_toq(&free, false); 4259 } 4260 4261 /* 4262 * Routine: pmap_remove_all 4263 * Function: 4264 * Removes this physical page from 4265 * all physical maps in which it resides. 4266 * Reflects back modify bits to the pager. 4267 * 4268 * Notes: 4269 * Original versions of this routine were very 4270 * inefficient because they iteratively called 4271 * pmap_remove (slow...) 
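 *
 *	The current version instead walks the page's pv lists
 *	directly: any 1MB mappings found there are first demoted,
 *	and the remaining 4KB mappings are then removed one pv
 *	entry at a time.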
4272 */ 4273 4274 void 4275 pmap_remove_all(vm_page_t m) 4276 { 4277 struct md_page *pvh; 4278 pv_entry_t pv; 4279 pmap_t pmap; 4280 pt2_entry_t *pte2p, opte2; 4281 pt1_entry_t *pte1p; 4282 vm_offset_t va; 4283 struct spglist free; 4284 4285 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4286 ("%s: page %p is not managed", __func__, m)); 4287 SLIST_INIT(&free); 4288 rw_wlock(&pvh_global_lock); 4289 sched_pin(); 4290 if ((m->flags & PG_FICTITIOUS) != 0) 4291 goto small_mappings; 4292 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4293 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4294 va = pv->pv_va; 4295 pmap = PV_PMAP(pv); 4296 PMAP_LOCK(pmap); 4297 pte1p = pmap_pte1(pmap, va); 4298 (void)pmap_demote_pte1(pmap, pte1p, va); 4299 PMAP_UNLOCK(pmap); 4300 } 4301 small_mappings: 4302 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4303 pmap = PV_PMAP(pv); 4304 PMAP_LOCK(pmap); 4305 pmap->pm_stats.resident_count--; 4306 pte1p = pmap_pte1(pmap, pv->pv_va); 4307 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " 4308 "a 1mpage in page %p's pv list", __func__, m)); 4309 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 4310 opte2 = pte2_load_clear(pte2p); 4311 pmap_tlb_flush(pmap, pv->pv_va); 4312 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", 4313 __func__, pmap, pv->pv_va)); 4314 if (pte2_is_wired(opte2)) 4315 pmap->pm_stats.wired_count--; 4316 if (opte2 & PTE2_A) 4317 vm_page_aflag_set(m, PGA_REFERENCED); 4318 4319 /* 4320 * Update the vm_page_t clean and reference bits. 4321 */ 4322 if (pte2_is_dirty(opte2)) 4323 vm_page_dirty(m); 4324 pmap_unuse_pt2(pmap, pv->pv_va, &free); 4325 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4326 free_pv_entry(pmap, pv); 4327 PMAP_UNLOCK(pmap); 4328 } 4329 vm_page_aflag_clear(m, PGA_WRITEABLE); 4330 sched_unpin(); 4331 rw_wunlock(&pvh_global_lock); 4332 vm_page_free_pages_toq(&free, false); 4333 } 4334 4335 /* 4336 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4337 * good coding style, a.k.a. 80 character line width limit hell. 4338 */ 4339 static __inline void 4340 pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, 4341 struct spglist *free) 4342 { 4343 vm_paddr_t pa; 4344 vm_page_t m, mt, mpt2pg; 4345 struct md_page *pvh; 4346 4347 pa = pte1_pa(pte1); 4348 m = PHYS_TO_VM_PAGE(pa); 4349 4350 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", 4351 __func__, m, m->phys_addr, pa)); 4352 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4353 m < &vm_page_array[vm_page_array_size], 4354 ("%s: bad pte1 %#x", __func__, pte1)); 4355 4356 if (pte1_is_dirty(pte1)) { 4357 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4358 vm_page_dirty(mt); 4359 } 4360 4361 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 4362 pvh = pa_to_pvh(pa); 4363 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4364 if (TAILQ_EMPTY(&pvh->pv_list)) { 4365 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4366 if (TAILQ_EMPTY(&mt->md.pv_list)) 4367 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4368 } 4369 mpt2pg = pmap_pt2_page(pmap, pv->pv_va); 4370 if (mpt2pg != NULL) 4371 pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); 4372 } 4373 4374 /* 4375 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4376 * good coding style, a.k.a. 80 character line width limit hell. 
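 *
 * This helper handles a single 4KB (PTE2) mapping; its sibling above
 * handles a whole 1MB (PTE1) section.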
4377  */
4378 static __inline void
4379 pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv,
4380     struct spglist *free)
4381 {
4382	vm_paddr_t pa;
4383	vm_page_t m;
4384	struct md_page *pvh;
4385
4386	pa = pte2_pa(pte2);
4387	m = PHYS_TO_VM_PAGE(pa);
4388
4389	KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x",
4390	    __func__, m, m->phys_addr, pa));
4391	KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4392	    m < &vm_page_array[vm_page_array_size],
4393	    ("%s: bad pte2 %#x", __func__, pte2));
4394
4395	if (pte2_is_dirty(pte2))
4396		vm_page_dirty(m);
4397
4398	pmap->pm_stats.resident_count--;
4399	TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4400	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
4401		pvh = pa_to_pvh(pa);
4402		if (TAILQ_EMPTY(&pvh->pv_list))
4403			vm_page_aflag_clear(m, PGA_WRITEABLE);
4404	}
4405	pmap_unuse_pt2(pmap, pv->pv_va, free);
4406 }
4407
4408 /*
4409  * Remove all pages from the specified address space; this aids process
4410  * exit speeds. Also, this code is special-cased for the current process
4411  * only, but can have the more generic (and slightly slower) mode enabled.
4412  * This is much faster than pmap_remove() in the case of running down
4413  * an entire address space.
4414  */
4415 void
4416 pmap_remove_pages(pmap_t pmap)
4417 {
4418	pt1_entry_t *pte1p, pte1;
4419	pt2_entry_t *pte2p, pte2;
4420	pv_entry_t pv;
4421	struct pv_chunk *pc, *npc;
4422	struct spglist free;
4423	int field, idx;
4424	int32_t bit;
4425	uint32_t inuse, bitmask;
4426	boolean_t allfree;
4427
4428	/*
4429	 * Assert that the given pmap is only active on the current
4430	 * CPU. Unfortunately, we cannot block another CPU from
4431	 * activating the pmap while this function is executing.
4432	 */
4433	KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace),
4434	    ("%s: non-current pmap %p", __func__, pmap));
4435 #if defined(SMP) && defined(INVARIANTS)
4436	{
4437		cpuset_t other_cpus;
4438
4439		sched_pin();
4440		other_cpus = pmap->pm_active;
4441		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
4442		sched_unpin();
4443		KASSERT(CPU_EMPTY(&other_cpus),
4444		    ("%s: pmap %p active on other cpus", __func__, pmap));
4445	}
4446 #endif
4447	SLIST_INIT(&free);
4448	rw_wlock(&pvh_global_lock);
4449	PMAP_LOCK(pmap);
4450	sched_pin();
4451	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4452		KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p",
4453		    __func__, pmap, pc->pc_pmap));
4454		allfree = TRUE;
4455		for (field = 0; field < _NPCM; field++) {
4456			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
4457			while (inuse != 0) {
4458				bit = ffs(inuse) - 1;
4459				bitmask = 1UL << bit;
4460				idx = field * 32 + bit;
4461				pv = &pc->pc_pventry[idx];
4462				inuse &= ~bitmask;
4463
4464				/*
4465				 * Note that we cannot remove wired pages
4466				 * from a process's mapping at this time.
4467				 */
4468				pte1p = pmap_pte1(pmap, pv->pv_va);
4469				pte1 = pte1_load(pte1p);
4470				if (pte1_is_section(pte1)) {
4471					if (pte1_is_wired(pte1)) {
4472						allfree = FALSE;
4473						continue;
4474					}
4475					pte1_clear(pte1p);
4476					pmap_remove_pte1_quick(pmap, pte1, pv,
4477					    &free);
4478				}
4479				else if (pte1_is_link(pte1)) {
4480					pte2p = pt2map_entry(pv->pv_va);
4481					pte2 = pte2_load(pte2p);
4482
4483					if (!pte2_is_valid(pte2)) {
4484						printf("%s: pmap %p va %#x "
4485						    "pte2 %#x\n", __func__,
4486						    pmap, pv->pv_va, pte2);
4487						panic("bad pte2");
4488					}
4489
4490					if (pte2_is_wired(pte2)) {
4491						allfree = FALSE;
4492						continue;
4493					}
4494					pte2_clear(pte2p);
4495					pmap_remove_pte2_quick(pmap, pte2, pv,
4496					    &free);
4497				} else {
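					/*
					 * Neither a section nor a link:
					 * the L1 entry is corrupted.
					 */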
4498					printf("%s: pmap %p va %#x pte1 %#x\n",
4499					    __func__, pmap, pv->pv_va, pte1);
4500					panic("bad pte1");
4501				}
4502
4503				/* Mark free */
4504				PV_STAT(pv_entry_frees++);
4505				PV_STAT(pv_entry_spare++);
4506				pv_entry_count--;
4507				pc->pc_map[field] |= bitmask;
4508			}
4509		}
4510		if (allfree) {
4511			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4512			free_pv_chunk(pc);
4513		}
4514	}
4515	tlb_flush_all_ng_local();
4516	sched_unpin();
4517	rw_wunlock(&pvh_global_lock);
4518	PMAP_UNLOCK(pmap);
4519	vm_page_free_pages_toq(&free, false);
4520 }
4521
4522 /*
4523  * This code makes some *MAJOR* assumptions:
4524  * 1. Current pmap & pmap exist.
4525  * 2. Not wired.
4526  * 3. Read access.
4527  * 4. No L2 page table pages.
4528  * but it is *MUCH* faster than pmap_enter()...
4529  */
4530 static vm_page_t
4531 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4532     vm_prot_t prot, vm_page_t mpt2pg)
4533 {
4534	pt2_entry_t *pte2p, pte2;
4535	vm_paddr_t pa;
4536	struct spglist free;
4537	uint32_t l2prot;
4538
4539	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4540	    (m->oflags & VPO_UNMANAGED) != 0,
4541	    ("%s: managed mapping within the clean submap", __func__));
4542	rw_assert(&pvh_global_lock, RA_WLOCKED);
4543	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4544
4545	/*
4546	 * In the case that an L2 page table page is not
4547	 * resident, we are creating it here.
4548	 */
4549	if (va < VM_MAXUSER_ADDRESS) {
4550		u_int pte1_idx;
4551		pt1_entry_t pte1, *pte1p;
4552		vm_paddr_t pt2_pa;
4553
4554		/*
4555		 * Get L1 page table things.
4556		 */
4557		pte1_idx = pte1_index(va);
4558		pte1p = pmap_pte1(pmap, va);
4559		pte1 = pte1_load(pte1p);
4560
4561		if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) {
4562			/*
4563			 * Each of the NPT2_IN_PG L2 page tables on the page
4564			 * can come here. Make sure that the associated L1
4565			 * page table link is established.
4566			 *
4567			 * QQQ: It turns out that we don't establish all the
4568			 *      links to L2 page tables for a newly allocated
4569			 *      L2 page table page.
4570			 */
4571			KASSERT(!pte1_is_section(pte1),
4572			    ("%s: pte1 %#x is section", __func__, pte1));
4573			if (!pte1_is_link(pte1)) {
4574				pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg),
4575				    pte1_idx);
4576				pte1_store(pte1p, PTE1_LINK(pt2_pa));
4577			}
4578			pt2_wirecount_inc(mpt2pg, pte1_idx);
4579		} else {
4580			/*
4581			 * If the L2 page table page is mapped, we just
4582			 * increment the hold count, and activate it.
4583			 */
4584			if (pte1_is_section(pte1)) {
4585				return (NULL);
4586			} else if (pte1_is_link(pte1)) {
4587				mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
4588				pt2_wirecount_inc(mpt2pg, pte1_idx);
4589			} else {
4590				mpt2pg = _pmap_allocpte2(pmap, va,
4591				    PMAP_ENTER_NOSLEEP);
4592				if (mpt2pg == NULL)
4593					return (NULL);
4594			}
4595		}
4596	} else {
4597		mpt2pg = NULL;
4598	}
4599
4600	/*
4601	 * This call to pt2map_entry() makes the assumption that we are
4602	 * entering the page into the current pmap. In order to support
4603	 * quick entry into any pmap, one would likely use pmap_pte2_quick().
4604	 * But that isn't as quick as pt2map_entry().
4605	 */
4606	pte2p = pt2map_entry(va);
4607	pte2 = pte2_load(pte2p);
4608	if (pte2_is_valid(pte2)) {
4609		if (mpt2pg != NULL) {
4610			/*
4611			 * Remove the extra pte2 reference.
4612			 */
4613			pt2_wirecount_dec(mpt2pg, pte1_index(va));
4614			mpt2pg = NULL;
4615		}
4616		return (NULL);
4617	}
4618
4619	/*
4620	 * Enter on the PV list if part of our managed memory.
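	 * If a pv entry cannot be allocated here, the mapping is simply not
	 * entered; pmap_enter_quick_locked() is an opportunistic fast path
	 * that is allowed to fail.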
4621 */ 4622 if ((m->oflags & VPO_UNMANAGED) == 0 && 4623 !pmap_try_insert_pv_entry(pmap, va, m)) { 4624 if (mpt2pg != NULL) { 4625 SLIST_INIT(&free); 4626 if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { 4627 pmap_tlb_flush(pmap, va); 4628 vm_page_free_pages_toq(&free, false); 4629 } 4630 4631 mpt2pg = NULL; 4632 } 4633 return (NULL); 4634 } 4635 4636 /* 4637 * Increment counters 4638 */ 4639 pmap->pm_stats.resident_count++; 4640 4641 /* 4642 * Now validate mapping with RO protection 4643 */ 4644 pa = VM_PAGE_TO_PHYS(m); 4645 l2prot = PTE2_RO | PTE2_NM; 4646 if (va < VM_MAXUSER_ADDRESS) 4647 l2prot |= PTE2_U | PTE2_NG; 4648 if ((prot & VM_PROT_EXECUTE) == 0) 4649 l2prot |= PTE2_NX; 4650 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { 4651 /* 4652 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4653 * is set. QQQ: For more info, see comments in pmap_enter(). 4654 */ 4655 cache_icache_sync_fresh(va, pa, PAGE_SIZE); 4656 } 4657 pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); 4658 4659 return (mpt2pg); 4660 } 4661 4662 void 4663 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4664 { 4665 4666 rw_wlock(&pvh_global_lock); 4667 PMAP_LOCK(pmap); 4668 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 4669 rw_wunlock(&pvh_global_lock); 4670 PMAP_UNLOCK(pmap); 4671 } 4672 4673 /* 4674 * Tries to create a read- and/or execute-only 1 MB page mapping. Returns 4675 * true if successful. Returns false if (1) a mapping already exists at the 4676 * specified virtual address or (2) a PV entry cannot be allocated without 4677 * reclaiming another PV entry. 4678 */ 4679 static bool 4680 pmap_enter_1mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4681 { 4682 pt1_entry_t pte1; 4683 vm_paddr_t pa; 4684 4685 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4686 pa = VM_PAGE_TO_PHYS(m); 4687 pte1 = PTE1(pa, PTE1_NM | PTE1_RO, ATTR_TO_L1(vm_page_pte2_attr(m))); 4688 if ((prot & VM_PROT_EXECUTE) == 0) 4689 pte1 |= PTE1_NX; 4690 if (va < VM_MAXUSER_ADDRESS) 4691 pte1 |= PTE1_U; 4692 if (pmap != kernel_pmap) 4693 pte1 |= PTE1_NG; 4694 return (pmap_enter_pte1(pmap, va, pte1, PMAP_ENTER_NOSLEEP | 4695 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m) == KERN_SUCCESS); 4696 } 4697 4698 /* 4699 * Tries to create the specified 1 MB page mapping. Returns KERN_SUCCESS if 4700 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 4701 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 4702 * a mapping already exists at the specified virtual address. Returns 4703 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NORECLAIM was specified and PV entry 4704 * allocation failed. 4705 */ 4706 static int 4707 pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags, 4708 vm_page_t m) 4709 { 4710 struct spglist free; 4711 pt1_entry_t opte1, *pte1p; 4712 pt2_entry_t pte2, *pte2p; 4713 vm_offset_t cur, end; 4714 vm_page_t mt; 4715 4716 rw_assert(&pvh_global_lock, RA_WLOCKED); 4717 KASSERT((pte1 & (PTE1_NM | PTE1_RO)) == 0 || 4718 (pte1 & (PTE1_NM | PTE1_RO)) == (PTE1_NM | PTE1_RO), 4719 ("%s: pte1 has inconsistent NM and RO attributes", __func__)); 4720 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4721 pte1p = pmap_pte1(pmap, va); 4722 opte1 = pte1_load(pte1p); 4723 if (pte1_is_valid(opte1)) { 4724 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 4725 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4726 __func__, va, pmap); 4727 return (KERN_FAILURE); 4728 } 4729 /* Break the existing mapping(s). 
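		 * A section mapping is torn down as a whole, while a linked
		 * L2 page table is scanned and its valid PTE2s are removed
		 * one by one.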
*/ 4730 SLIST_INIT(&free); 4731 if (pte1_is_section(opte1)) { 4732 /* 4733 * If the section resulted from a promotion, then a 4734 * reserved PT page could be freed. 4735 */ 4736 pmap_remove_pte1(pmap, pte1p, va, &free); 4737 } else { 4738 sched_pin(); 4739 end = va + PTE1_SIZE; 4740 for (cur = va, pte2p = pmap_pte2_quick(pmap, va); 4741 cur != end; cur += PAGE_SIZE, pte2p++) { 4742 pte2 = pte2_load(pte2p); 4743 if (!pte2_is_valid(pte2)) 4744 continue; 4745 if (pmap_remove_pte2(pmap, pte2p, cur, &free)) 4746 break; 4747 } 4748 sched_unpin(); 4749 } 4750 vm_page_free_pages_toq(&free, false); 4751 } 4752 if ((m->oflags & VPO_UNMANAGED) == 0) { 4753 /* 4754 * Abort this mapping if its PV entry could not be created. 4755 */ 4756 if (!pmap_pv_insert_pte1(pmap, va, pte1, flags)) { 4757 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4758 __func__, va, pmap); 4759 return (KERN_RESOURCE_SHORTAGE); 4760 } 4761 if ((pte1 & PTE1_RO) == 0) { 4762 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) 4763 vm_page_aflag_set(mt, PGA_WRITEABLE); 4764 } 4765 } 4766 4767 /* 4768 * Increment counters. 4769 */ 4770 if (pte1_is_wired(pte1)) 4771 pmap->pm_stats.wired_count += PTE1_SIZE / PAGE_SIZE; 4772 pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; 4773 4774 /* 4775 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4776 * is set. QQQ: For more info, see comments in pmap_enter(). 4777 */ 4778 if ((pte1 & PTE1_NX) == 0 && m->md.pat_mode == VM_MEMATTR_WB_WA && 4779 pmap != kernel_pmap && (!pte1_is_section(opte1) || 4780 pte1_pa(opte1) != VM_PAGE_TO_PHYS(m) || (opte1 & PTE2_NX) != 0)) 4781 cache_icache_sync_fresh(va, VM_PAGE_TO_PHYS(m), PTE1_SIZE); 4782 4783 /* 4784 * Map the section. 4785 */ 4786 pte1_store(pte1p, pte1); 4787 4788 pmap_pte1_mappings++; 4789 CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, 4790 pmap); 4791 return (KERN_SUCCESS); 4792 } 4793 4794 /* 4795 * Maps a sequence of resident pages belonging to the same object. 4796 * The sequence begins with the given page m_start. This page is 4797 * mapped at the given virtual address start. Each subsequent page is 4798 * mapped at a virtual address that is offset from start by the same 4799 * amount as the page is offset from m_start within the object. The 4800 * last page in the sequence is the page with the largest offset from 4801 * m_start that can be mapped at a virtual address less than the given 4802 * virtual address end. Not every virtual page between start and end 4803 * is mapped; only those for which a resident page exists with the 4804 * corresponding offset from m_start are mapped. 
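 *
 * Properly aligned chunks of the range are opportunistically mapped
 * with 1MB sections (see pmap_enter_1mpage()); everything else falls
 * back to 4KB pages via pmap_enter_quick_locked().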
4805  */
4806 void
4807 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4808     vm_page_t m_start, vm_prot_t prot)
4809 {
4810	vm_offset_t va;
4811	vm_page_t m, mpt2pg;
4812	vm_pindex_t diff, psize;
4813
4814	PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n",
4815	    __func__, pmap, start, end, m_start, prot));
4816
4817	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4818	psize = atop(end - start);
4819	mpt2pg = NULL;
4820	m = m_start;
4821	rw_wlock(&pvh_global_lock);
4822	PMAP_LOCK(pmap);
4823	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4824		va = start + ptoa(diff);
4825		if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end &&
4826		    m->psind == 1 && sp_enabled &&
4827		    pmap_enter_1mpage(pmap, va, m, prot))
4828			m = &m[PTE1_SIZE / PAGE_SIZE - 1];
4829		else
4830			mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot,
4831			    mpt2pg);
4832		m = TAILQ_NEXT(m, listq);
4833	}
4834	rw_wunlock(&pvh_global_lock);
4835	PMAP_UNLOCK(pmap);
4836 }
4837
4838 /*
4839  * This code maps large physical mmap regions into the
4840  * processor address space. Note that some shortcuts
4841  * are taken, but the code works.
4842  */
4843 void
4844 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4845     vm_pindex_t pindex, vm_size_t size)
4846 {
4847	pt1_entry_t *pte1p;
4848	vm_paddr_t pa, pte2_pa;
4849	vm_page_t p;
4850	vm_memattr_t pat_mode;
4851	u_int l1attr, l1prot;
4852
4853	VM_OBJECT_ASSERT_WLOCKED(object);
4854	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4855	    ("%s: non-device object", __func__));
4856	if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) {
4857		if (!vm_object_populate(object, pindex, pindex + atop(size)))
4858			return;
4859		p = vm_page_lookup(object, pindex);
4860		KASSERT(p->valid == VM_PAGE_BITS_ALL,
4861		    ("%s: invalid page %p", __func__, p));
4862		pat_mode = p->md.pat_mode;
4863
4864		/*
4865		 * Abort the mapping if the first page is not physically
4866		 * aligned to a 1MB page boundary.
4867		 */
4868		pte2_pa = VM_PAGE_TO_PHYS(p);
4869		if (pte2_pa & PTE1_OFFSET)
4870			return;
4871
4872		/*
4873		 * Skip the first page. Abort the mapping if the rest of
4874		 * the pages are not physically contiguous or have differing
4875		 * memory attributes.
4876		 */
4877		p = TAILQ_NEXT(p, listq);
4878		for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size;
4879		    pa += PAGE_SIZE) {
4880			KASSERT(p->valid == VM_PAGE_BITS_ALL,
4881			    ("%s: invalid page %p", __func__, p));
4882			if (pa != VM_PAGE_TO_PHYS(p) ||
4883			    pat_mode != p->md.pat_mode)
4884				return;
4885			p = TAILQ_NEXT(p, listq);
4886		}
4887
4888		/*
4889		 * Map using 1MB pages.
4890		 *
4891		 * QQQ: Well, we are mapping a section, so the same conditions
4892		 *      must hold as during promotion. It looks like only RW
4893		 *      mappings are done here, so read-only mappings must be
4894		 *      done elsewhere.
4895		 */
4896		l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A;
4897		l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode));
4898		PMAP_LOCK(pmap);
4899		for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) {
4900			pte1p = pmap_pte1(pmap, addr);
4901			if (!pte1_is_valid(pte1_load(pte1p))) {
4902				pte1_store(pte1p, PTE1(pa, l1prot, l1attr));
4903				pmap->pm_stats.resident_count += PTE1_SIZE /
4904				    PAGE_SIZE;
4905				pmap_pte1_mappings++;
4906			}
4907			/* Else continue on if the PTE1 is already valid. */
4908			addr += PTE1_SIZE;
4909		}
4910		PMAP_UNLOCK(pmap);
4911	}
4912 }
4913
4914 /*
4915  * Do the things to protect a 1mpage in a process.
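 * The section's PTE1 is rewritten in place; if the section is managed
 * and dirty, the dirty state is first transferred to each of the
 * underlying vm_pages.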
4916  */
4917 static void
4918 pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva,
4919     vm_prot_t prot)
4920 {
4921	pt1_entry_t npte1, opte1;
4922	vm_offset_t eva, va;
4923	vm_page_t m;
4924
4925	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4926	KASSERT((sva & PTE1_OFFSET) == 0,
4927	    ("%s: sva is not 1mpage aligned", __func__));
4928
4929	opte1 = npte1 = pte1_load(pte1p);
4930	if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) {
4931		eva = sva + PTE1_SIZE;
4932		for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1));
4933		    va < eva; va += PAGE_SIZE, m++)
4934			vm_page_dirty(m);
4935	}
4936	if ((prot & VM_PROT_WRITE) == 0)
4937		npte1 |= PTE1_RO | PTE1_NM;
4938	if ((prot & VM_PROT_EXECUTE) == 0)
4939		npte1 |= PTE1_NX;
4940
4941	/*
4942	 * QQQ: Herein, execute permission is never set.
4943	 *      It can only be cleared. So, no icache
4944	 *      syncing is needed.
4945	 */
4946
4947	if (npte1 != opte1) {
4948		pte1_store(pte1p, npte1);
4949		pmap_tlb_flush(pmap, sva);
4950	}
4951 }
4952
4953 /*
4954  * Set the physical protection on the
4955  * specified range of this map as requested.
4956  */
4957 void
4958 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4959 {
4960	boolean_t pv_lists_locked;
4961	vm_offset_t nextva;
4962	pt1_entry_t *pte1p, pte1;
4963	pt2_entry_t *pte2p, opte2, npte2;
4964
4965	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4966	if (prot == VM_PROT_NONE) {
4967		pmap_remove(pmap, sva, eva);
4968		return;
4969	}
4970
4971	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
4972	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
4973		return;
4974
4975	if (pmap_is_current(pmap))
4976		pv_lists_locked = FALSE;
4977	else {
4978		pv_lists_locked = TRUE;
4979 resume:
4980		rw_wlock(&pvh_global_lock);
4981		sched_pin();
4982	}
4983
4984	PMAP_LOCK(pmap);
4985	for (; sva < eva; sva = nextva) {
4986		/*
4987		 * Calculate address for next L2 page table.
4988		 */
4989		nextva = pte1_trunc(sva + PTE1_SIZE);
4990		if (nextva < sva)
4991			nextva = eva;
4992
4993		pte1p = pmap_pte1(pmap, sva);
4994		pte1 = pte1_load(pte1p);
4995
4996		/*
4997		 * Weed out invalid mappings. Note: we assume that the L1
4998		 * page table is always allocated, and in kernel virtual.
4999		 */
5000		if (pte1 == 0)
5001			continue;
5002
5003		if (pte1_is_section(pte1)) {
5004			/*
5005			 * Are we protecting the entire large page? If not,
5006			 * demote the mapping and fall through.
5007			 */
5008			if (sva + PTE1_SIZE == nextva && eva >= nextva) {
5009				pmap_protect_pte1(pmap, pte1p, sva, prot);
5010				continue;
5011			} else {
5012				if (!pv_lists_locked) {
5013					pv_lists_locked = TRUE;
5014					if (!rw_try_wlock(&pvh_global_lock)) {
5015						PMAP_UNLOCK(pmap);
5016						goto resume;
5017					}
5018					sched_pin();
5019				}
5020				if (!pmap_demote_pte1(pmap, pte1p, sva)) {
5021					/*
5022					 * The large page mapping
5023					 * was destroyed.
5024					 */
5025					continue;
5026				}
5027 #ifdef INVARIANTS
5028				else {
5029					/* Update pte1 after demotion */
5030					pte1 = pte1_load(pte1p);
5031				}
5032 #endif
5033			}
5034		}
5035
5036		KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p"
5037		    " is not link", __func__, pmap, sva, pte1, pte1p));
5038
5039		/*
5040		 * Limit our scan to either the end of the va represented
5041		 * by the current L2 page table page, or to the end of the
5042		 * range being protected.
5042 */ 5043 if (nextva > eva) 5044 nextva = eva; 5045 5046 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5047 sva += PAGE_SIZE) { 5048 vm_page_t m; 5049 5050 opte2 = npte2 = pte2_load(pte2p); 5051 if (!pte2_is_valid(opte2)) 5052 continue; 5053 5054 if ((prot & VM_PROT_WRITE) == 0) { 5055 if (pte2_is_managed(opte2) && 5056 pte2_is_dirty(opte2)) { 5057 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 5058 vm_page_dirty(m); 5059 } 5060 npte2 |= PTE2_RO | PTE2_NM; 5061 } 5062 5063 if ((prot & VM_PROT_EXECUTE) == 0) 5064 npte2 |= PTE2_NX; 5065 5066 /* 5067 * QQQ: Herein, execute permission is never set. 5068 * It only can be cleared. So, no icache 5069 * syncing is needed. 5070 */ 5071 5072 if (npte2 != opte2) { 5073 pte2_store(pte2p, npte2); 5074 pmap_tlb_flush(pmap, sva); 5075 } 5076 } 5077 } 5078 if (pv_lists_locked) { 5079 sched_unpin(); 5080 rw_wunlock(&pvh_global_lock); 5081 } 5082 PMAP_UNLOCK(pmap); 5083 } 5084 5085 /* 5086 * pmap_pvh_wired_mappings: 5087 * 5088 * Return the updated number "count" of managed mappings that are wired. 5089 */ 5090 static int 5091 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 5092 { 5093 pmap_t pmap; 5094 pt1_entry_t pte1; 5095 pt2_entry_t pte2; 5096 pv_entry_t pv; 5097 5098 rw_assert(&pvh_global_lock, RA_WLOCKED); 5099 sched_pin(); 5100 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5101 pmap = PV_PMAP(pv); 5102 PMAP_LOCK(pmap); 5103 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5104 if (pte1_is_section(pte1)) { 5105 if (pte1_is_wired(pte1)) 5106 count++; 5107 } else { 5108 KASSERT(pte1_is_link(pte1), 5109 ("%s: pte1 %#x is not link", __func__, pte1)); 5110 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5111 if (pte2_is_wired(pte2)) 5112 count++; 5113 } 5114 PMAP_UNLOCK(pmap); 5115 } 5116 sched_unpin(); 5117 return (count); 5118 } 5119 5120 /* 5121 * pmap_page_wired_mappings: 5122 * 5123 * Return the number of managed mappings to the given physical page 5124 * that are wired. 5125 */ 5126 int 5127 pmap_page_wired_mappings(vm_page_t m) 5128 { 5129 int count; 5130 5131 count = 0; 5132 if ((m->oflags & VPO_UNMANAGED) != 0) 5133 return (count); 5134 rw_wlock(&pvh_global_lock); 5135 count = pmap_pvh_wired_mappings(&m->md, count); 5136 if ((m->flags & PG_FICTITIOUS) == 0) { 5137 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 5138 count); 5139 } 5140 rw_wunlock(&pvh_global_lock); 5141 return (count); 5142 } 5143 5144 /* 5145 * Returns TRUE if any of the given mappings were used to modify 5146 * physical memory. Otherwise, returns FALSE. Both page and 1mpage 5147 * mappings are supported. 5148 */ 5149 static boolean_t 5150 pmap_is_modified_pvh(struct md_page *pvh) 5151 { 5152 pv_entry_t pv; 5153 pt1_entry_t pte1; 5154 pt2_entry_t pte2; 5155 pmap_t pmap; 5156 boolean_t rv; 5157 5158 rw_assert(&pvh_global_lock, RA_WLOCKED); 5159 rv = FALSE; 5160 sched_pin(); 5161 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5162 pmap = PV_PMAP(pv); 5163 PMAP_LOCK(pmap); 5164 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5165 if (pte1_is_section(pte1)) { 5166 rv = pte1_is_dirty(pte1); 5167 } else { 5168 KASSERT(pte1_is_link(pte1), 5169 ("%s: pte1 %#x is not link", __func__, pte1)); 5170 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5171 rv = pte2_is_dirty(pte2); 5172 } 5173 PMAP_UNLOCK(pmap); 5174 if (rv) 5175 break; 5176 } 5177 sched_unpin(); 5178 return (rv); 5179 } 5180 5181 /* 5182 * pmap_is_modified: 5183 * 5184 * Return whether or not the specified physical page was modified 5185 * in any physical maps. 
5186 */ 5187 boolean_t 5188 pmap_is_modified(vm_page_t m) 5189 { 5190 boolean_t rv; 5191 5192 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5193 ("%s: page %p is not managed", __func__, m)); 5194 5195 /* 5196 * If the page is not busied then this check is racy. 5197 */ 5198 if (!pmap_page_is_write_mapped(m)) 5199 return (FALSE); 5200 rw_wlock(&pvh_global_lock); 5201 rv = pmap_is_modified_pvh(&m->md) || 5202 ((m->flags & PG_FICTITIOUS) == 0 && 5203 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5204 rw_wunlock(&pvh_global_lock); 5205 return (rv); 5206 } 5207 5208 /* 5209 * pmap_is_prefaultable: 5210 * 5211 * Return whether or not the specified virtual address is eligible 5212 * for prefault. 5213 */ 5214 boolean_t 5215 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5216 { 5217 pt1_entry_t pte1; 5218 pt2_entry_t pte2; 5219 boolean_t rv; 5220 5221 rv = FALSE; 5222 PMAP_LOCK(pmap); 5223 pte1 = pte1_load(pmap_pte1(pmap, addr)); 5224 if (pte1_is_link(pte1)) { 5225 pte2 = pte2_load(pt2map_entry(addr)); 5226 rv = !pte2_is_valid(pte2) ; 5227 } 5228 PMAP_UNLOCK(pmap); 5229 return (rv); 5230 } 5231 5232 /* 5233 * Returns TRUE if any of the given mappings were referenced and FALSE 5234 * otherwise. Both page and 1mpage mappings are supported. 5235 */ 5236 static boolean_t 5237 pmap_is_referenced_pvh(struct md_page *pvh) 5238 { 5239 5240 pv_entry_t pv; 5241 pt1_entry_t pte1; 5242 pt2_entry_t pte2; 5243 pmap_t pmap; 5244 boolean_t rv; 5245 5246 rw_assert(&pvh_global_lock, RA_WLOCKED); 5247 rv = FALSE; 5248 sched_pin(); 5249 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5250 pmap = PV_PMAP(pv); 5251 PMAP_LOCK(pmap); 5252 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5253 if (pte1_is_section(pte1)) { 5254 rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); 5255 } else { 5256 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5257 rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); 5258 } 5259 PMAP_UNLOCK(pmap); 5260 if (rv) 5261 break; 5262 } 5263 sched_unpin(); 5264 return (rv); 5265 } 5266 5267 /* 5268 * pmap_is_referenced: 5269 * 5270 * Return whether or not the specified physical page was referenced 5271 * in any physical maps. 5272 */ 5273 boolean_t 5274 pmap_is_referenced(vm_page_t m) 5275 { 5276 boolean_t rv; 5277 5278 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5279 ("%s: page %p is not managed", __func__, m)); 5280 rw_wlock(&pvh_global_lock); 5281 rv = pmap_is_referenced_pvh(&m->md) || 5282 ((m->flags & PG_FICTITIOUS) == 0 && 5283 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5284 rw_wunlock(&pvh_global_lock); 5285 return (rv); 5286 } 5287 5288 /* 5289 * pmap_ts_referenced: 5290 * 5291 * Return a count of reference bits for a page, clearing those bits. 5292 * It is not necessary for every reference bit to be cleared, but it 5293 * is necessary that 0 only be returned when there are truly no 5294 * reference bits set. 5295 * 5296 * As an optimization, update the page's dirty field if a modified bit is 5297 * found while counting reference bits. This opportunistic update can be 5298 * performed at low cost and can eliminate the need for some future calls 5299 * to pmap_is_modified(). However, since this function stops after 5300 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5301 * dirty pages. Those dirty pages will only be detected by a future call 5302 * to pmap_is_modified(). 
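 *
 * 1MB mappings are processed before 4KB mappings, and each pv list is
 * rotated as it is scanned, so that successive calls spread the
 * clearing work over all mappings of the page.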
5303 */ 5304 int 5305 pmap_ts_referenced(vm_page_t m) 5306 { 5307 struct md_page *pvh; 5308 pv_entry_t pv, pvf; 5309 pmap_t pmap; 5310 pt1_entry_t *pte1p, opte1; 5311 pt2_entry_t *pte2p, opte2; 5312 vm_paddr_t pa; 5313 int rtval = 0; 5314 5315 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5316 ("%s: page %p is not managed", __func__, m)); 5317 pa = VM_PAGE_TO_PHYS(m); 5318 pvh = pa_to_pvh(pa); 5319 rw_wlock(&pvh_global_lock); 5320 sched_pin(); 5321 if ((m->flags & PG_FICTITIOUS) != 0 || 5322 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5323 goto small_mappings; 5324 pv = pvf; 5325 do { 5326 pmap = PV_PMAP(pv); 5327 PMAP_LOCK(pmap); 5328 pte1p = pmap_pte1(pmap, pv->pv_va); 5329 opte1 = pte1_load(pte1p); 5330 if (pte1_is_dirty(opte1)) { 5331 /* 5332 * Although "opte1" is mapping a 1MB page, because 5333 * this function is called at a 4KB page granularity, 5334 * we only update the 4KB page under test. 5335 */ 5336 vm_page_dirty(m); 5337 } 5338 if ((opte1 & PTE1_A) != 0) { 5339 /* 5340 * Since this reference bit is shared by 256 4KB pages, 5341 * it should not be cleared every time it is tested. 5342 * Apply a simple "hash" function on the physical page 5343 * number, the virtual section number, and the pmap 5344 * address to select one 4KB page out of the 256 5345 * on which testing the reference bit will result 5346 * in clearing that bit. This function is designed 5347 * to avoid the selection of the same 4KB page 5348 * for every 1MB page mapping. 5349 * 5350 * On demotion, a mapping that hasn't been referenced 5351 * is simply destroyed. To avoid the possibility of a 5352 * subsequent page fault on a demoted wired mapping, 5353 * always leave its reference bit set. Moreover, 5354 * since the section is wired, the current state of 5355 * its reference bit won't affect page replacement. 5356 */ 5357 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ 5358 (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && 5359 !pte1_is_wired(opte1)) { 5360 pte1_clear_bit(pte1p, PTE1_A); 5361 pmap_tlb_flush(pmap, pv->pv_va); 5362 } 5363 rtval++; 5364 } 5365 PMAP_UNLOCK(pmap); 5366 /* Rotate the PV list if it has more than one entry. */ 5367 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5368 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5369 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5370 } 5371 if (rtval >= PMAP_TS_REFERENCED_MAX) 5372 goto out; 5373 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5374 small_mappings: 5375 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5376 goto out; 5377 pv = pvf; 5378 do { 5379 pmap = PV_PMAP(pv); 5380 PMAP_LOCK(pmap); 5381 pte1p = pmap_pte1(pmap, pv->pv_va); 5382 KASSERT(pte1_is_link(pte1_load(pte1p)), 5383 ("%s: not found a link in page %p's pv list", __func__, m)); 5384 5385 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5386 opte2 = pte2_load(pte2p); 5387 if (pte2_is_dirty(opte2)) 5388 vm_page_dirty(m); 5389 if ((opte2 & PTE2_A) != 0) { 5390 pte2_clear_bit(pte2p, PTE2_A); 5391 pmap_tlb_flush(pmap, pv->pv_va); 5392 rtval++; 5393 } 5394 PMAP_UNLOCK(pmap); 5395 /* Rotate the PV list if it has more than one entry. */ 5396 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5397 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5398 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5399 } 5400 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5401 PMAP_TS_REFERENCED_MAX); 5402 out: 5403 sched_unpin(); 5404 rw_wunlock(&pvh_global_lock); 5405 return (rtval); 5406 } 5407 5408 /* 5409 * Clear the wired attribute from the mappings for the specified range of 5410 * addresses in the given pmap. 
Every valid mapping within that range
5411  * must have the wired attribute set. In contrast, invalid mappings
5412  * cannot have the wired attribute set, so they are ignored.
5413  *
5414  * The wired attribute of the page table entry is not a hardware feature,
5415  * so there is no need to invalidate any TLB entries.
5416  */
5417 void
5418 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5419 {
5420	vm_offset_t nextva;
5421	pt1_entry_t *pte1p, pte1;
5422	pt2_entry_t *pte2p, pte2;
5423	boolean_t pv_lists_locked;
5424
5425	if (pmap_is_current(pmap))
5426		pv_lists_locked = FALSE;
5427	else {
5428		pv_lists_locked = TRUE;
5429 resume:
5430		rw_wlock(&pvh_global_lock);
5431		sched_pin();
5432	}
5433	PMAP_LOCK(pmap);
5434	for (; sva < eva; sva = nextva) {
5435		nextva = pte1_trunc(sva + PTE1_SIZE);
5436		if (nextva < sva)
5437			nextva = eva;
5438
5439		pte1p = pmap_pte1(pmap, sva);
5440		pte1 = pte1_load(pte1p);
5441
5442		/*
5443		 * Weed out invalid mappings. Note: we assume that the L1
5444		 * page table is always allocated, and in kernel virtual.
5445		 */
5446		if (pte1 == 0)
5447			continue;
5448
5449		if (pte1_is_section(pte1)) {
5450			if (!pte1_is_wired(pte1))
5451				panic("%s: pte1 %#x not wired", __func__, pte1);
5452
5453			/*
5454			 * Are we unwiring the entire large page? If not,
5455			 * demote the mapping and fall through.
5456			 */
5457			if (sva + PTE1_SIZE == nextva && eva >= nextva) {
5458				pte1_clear_bit(pte1p, PTE1_W);
5459				pmap->pm_stats.wired_count -= PTE1_SIZE /
5460				    PAGE_SIZE;
5461				continue;
5462			} else {
5463				if (!pv_lists_locked) {
5464					pv_lists_locked = TRUE;
5465					if (!rw_try_wlock(&pvh_global_lock)) {
5466						PMAP_UNLOCK(pmap);
5467						/* Repeat sva. */
5468						goto resume;
5469					}
5470					sched_pin();
5471				}
5472				if (!pmap_demote_pte1(pmap, pte1p, sva))
5473					panic("%s: demotion failed", __func__);
5474 #ifdef INVARIANTS
5475				else {
5476					/* Update pte1 after demotion */
5477					pte1 = pte1_load(pte1p);
5478				}
5479 #endif
5480			}
5481		}
5482
5483		KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p"
5484		    " is not link", __func__, pmap, sva, pte1, pte1p));
5485
5486		/*
5487		 * Limit our scan to either the end of the va represented
5488		 * by the current L2 page table page, or to the end of the
5489		 * range being unwired.
5490		 */
5491		if (nextva > eva)
5492			nextva = eva;
5493
5494		for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++,
5495		    sva += PAGE_SIZE) {
5496			pte2 = pte2_load(pte2p);
5497			if (!pte2_is_valid(pte2))
5498				continue;
5499			if (!pte2_is_wired(pte2))
5500				panic("%s: pte2 %#x is missing PTE2_W",
5501				    __func__, pte2);
5502
5503			/*
5504			 * PTE2_W must be cleared atomically. Although the pmap
5505			 * lock synchronizes access to PTE2_W, another processor
5506			 * could be changing PTE2_NM and/or PTE2_A concurrently.
5507			 */
5508			pte2_clear_bit(pte2p, PTE2_W);
5509			pmap->pm_stats.wired_count--;
5510		}
5511	}
5512	if (pv_lists_locked) {
5513		sched_unpin();
5514		rw_wunlock(&pvh_global_lock);
5515	}
5516	PMAP_UNLOCK(pmap);
5517 }
5518
5519 /*
5520  * Clear the write and modified bits in each of the given page's mappings.
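 * Writeable 1MB sections are first demoted to 4KB pages; each 4KB
 * mapping is then made read-only and marked not-modified by setting
 * PTE2_RO and PTE2_NM.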
5521 */ 5522 void 5523 pmap_remove_write(vm_page_t m) 5524 { 5525 struct md_page *pvh; 5526 pv_entry_t next_pv, pv; 5527 pmap_t pmap; 5528 pt1_entry_t *pte1p; 5529 pt2_entry_t *pte2p, opte2; 5530 vm_offset_t va; 5531 5532 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5533 ("%s: page %p is not managed", __func__, m)); 5534 vm_page_assert_busied(m); 5535 5536 if (!pmap_page_is_write_mapped(m)) 5537 return; 5538 rw_wlock(&pvh_global_lock); 5539 sched_pin(); 5540 if ((m->flags & PG_FICTITIOUS) != 0) 5541 goto small_mappings; 5542 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5543 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5544 va = pv->pv_va; 5545 pmap = PV_PMAP(pv); 5546 PMAP_LOCK(pmap); 5547 pte1p = pmap_pte1(pmap, va); 5548 if (!(pte1_load(pte1p) & PTE1_RO)) 5549 (void)pmap_demote_pte1(pmap, pte1p, va); 5550 PMAP_UNLOCK(pmap); 5551 } 5552 small_mappings: 5553 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5554 pmap = PV_PMAP(pv); 5555 PMAP_LOCK(pmap); 5556 pte1p = pmap_pte1(pmap, pv->pv_va); 5557 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5558 " a section in page %p's pv list", __func__, m)); 5559 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5560 opte2 = pte2_load(pte2p); 5561 if (!(opte2 & PTE2_RO)) { 5562 pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); 5563 if (pte2_is_dirty(opte2)) 5564 vm_page_dirty(m); 5565 pmap_tlb_flush(pmap, pv->pv_va); 5566 } 5567 PMAP_UNLOCK(pmap); 5568 } 5569 vm_page_aflag_clear(m, PGA_WRITEABLE); 5570 sched_unpin(); 5571 rw_wunlock(&pvh_global_lock); 5572 } 5573 5574 /* 5575 * Apply the given advice to the specified range of addresses within the 5576 * given pmap. Depending on the advice, clear the referenced and/or 5577 * modified flags in each mapping and set the mapped page's dirty field. 5578 */ 5579 void 5580 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5581 { 5582 pt1_entry_t *pte1p, opte1; 5583 pt2_entry_t *pte2p, pte2; 5584 vm_offset_t pdnxt; 5585 vm_page_t m; 5586 boolean_t pv_lists_locked; 5587 5588 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5589 return; 5590 if (pmap_is_current(pmap)) 5591 pv_lists_locked = FALSE; 5592 else { 5593 pv_lists_locked = TRUE; 5594 resume: 5595 rw_wlock(&pvh_global_lock); 5596 sched_pin(); 5597 } 5598 PMAP_LOCK(pmap); 5599 for (; sva < eva; sva = pdnxt) { 5600 pdnxt = pte1_trunc(sva + PTE1_SIZE); 5601 if (pdnxt < sva) 5602 pdnxt = eva; 5603 pte1p = pmap_pte1(pmap, sva); 5604 opte1 = pte1_load(pte1p); 5605 if (!pte1_is_valid(opte1)) /* XXX */ 5606 continue; 5607 else if (pte1_is_section(opte1)) { 5608 if (!pte1_is_managed(opte1)) 5609 continue; 5610 if (!pv_lists_locked) { 5611 pv_lists_locked = TRUE; 5612 if (!rw_try_wlock(&pvh_global_lock)) { 5613 PMAP_UNLOCK(pmap); 5614 goto resume; 5615 } 5616 sched_pin(); 5617 } 5618 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5619 /* 5620 * The large page mapping was destroyed. 5621 */ 5622 continue; 5623 } 5624 5625 /* 5626 * Unless the page mappings are wired, remove the 5627 * mapping to a single page so that a subsequent 5628 * access may repromote. Since the underlying L2 page 5629 * table is fully populated, this removal never 5630 * frees a L2 page table page. 
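			 * That is also why a NULL free list can safely be
			 * passed to pmap_remove_pte2() below.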
5631 */ 5632 if (!pte1_is_wired(opte1)) { 5633 pte2p = pmap_pte2_quick(pmap, sva); 5634 KASSERT(pte2_is_valid(pte2_load(pte2p)), 5635 ("%s: invalid PTE2", __func__)); 5636 pmap_remove_pte2(pmap, pte2p, sva, NULL); 5637 } 5638 } 5639 if (pdnxt > eva) 5640 pdnxt = eva; 5641 for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, 5642 sva += PAGE_SIZE) { 5643 pte2 = pte2_load(pte2p); 5644 if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) 5645 continue; 5646 else if (pte2_is_dirty(pte2)) { 5647 if (advice == MADV_DONTNEED) { 5648 /* 5649 * Future calls to pmap_is_modified() 5650 * can be avoided by making the page 5651 * dirty now. 5652 */ 5653 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 5654 vm_page_dirty(m); 5655 } 5656 pte2_set_bit(pte2p, PTE2_NM); 5657 pte2_clear_bit(pte2p, PTE2_A); 5658 } else if ((pte2 & PTE2_A) != 0) 5659 pte2_clear_bit(pte2p, PTE2_A); 5660 else 5661 continue; 5662 pmap_tlb_flush(pmap, sva); 5663 } 5664 } 5665 if (pv_lists_locked) { 5666 sched_unpin(); 5667 rw_wunlock(&pvh_global_lock); 5668 } 5669 PMAP_UNLOCK(pmap); 5670 } 5671 5672 /* 5673 * Clear the modify bits on the specified physical page. 5674 */ 5675 void 5676 pmap_clear_modify(vm_page_t m) 5677 { 5678 struct md_page *pvh; 5679 pv_entry_t next_pv, pv; 5680 pmap_t pmap; 5681 pt1_entry_t *pte1p, opte1; 5682 pt2_entry_t *pte2p, opte2; 5683 vm_offset_t va; 5684 5685 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5686 ("%s: page %p is not managed", __func__, m)); 5687 vm_page_assert_busied(m); 5688 5689 if (!pmap_page_is_write_mapped(m)) 5690 return; 5691 rw_wlock(&pvh_global_lock); 5692 sched_pin(); 5693 if ((m->flags & PG_FICTITIOUS) != 0) 5694 goto small_mappings; 5695 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5696 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5697 va = pv->pv_va; 5698 pmap = PV_PMAP(pv); 5699 PMAP_LOCK(pmap); 5700 pte1p = pmap_pte1(pmap, va); 5701 opte1 = pte1_load(pte1p); 5702 if (!(opte1 & PTE1_RO)) { 5703 if (pmap_demote_pte1(pmap, pte1p, va) && 5704 !pte1_is_wired(opte1)) { 5705 /* 5706 * Write protect the mapping to a 5707 * single page so that a subsequent 5708 * write access may repromote. 5709 */ 5710 va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); 5711 pte2p = pmap_pte2_quick(pmap, va); 5712 opte2 = pte2_load(pte2p); 5713 if ((opte2 & PTE2_V)) { 5714 pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); 5715 vm_page_dirty(m); 5716 pmap_tlb_flush(pmap, va); 5717 } 5718 } 5719 } 5720 PMAP_UNLOCK(pmap); 5721 } 5722 small_mappings: 5723 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5724 pmap = PV_PMAP(pv); 5725 PMAP_LOCK(pmap); 5726 pte1p = pmap_pte1(pmap, pv->pv_va); 5727 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5728 " a section in page %p's pv list", __func__, m)); 5729 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5730 if (pte2_is_dirty(pte2_load(pte2p))) { 5731 pte2_set_bit(pte2p, PTE2_NM); 5732 pmap_tlb_flush(pmap, pv->pv_va); 5733 } 5734 PMAP_UNLOCK(pmap); 5735 } 5736 sched_unpin(); 5737 rw_wunlock(&pvh_global_lock); 5738 } 5739 5740 5741 /* 5742 * Sets the memory attribute for the specified page. 5743 */ 5744 void 5745 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5746 { 5747 pt2_entry_t *cmap2_pte2p; 5748 vm_memattr_t oma; 5749 vm_paddr_t pa; 5750 struct pcpu *pc; 5751 5752 oma = m->md.pat_mode; 5753 m->md.pat_mode = ma; 5754 5755 CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, 5756 VM_PAGE_TO_PHYS(m), oma, ma); 5757 if ((m->flags & PG_FICTITIOUS) != 0) 5758 return; 5759 #if 0 5760 /* 5761 * If "m" is a normal page, flush it from the cache. 
5762 *
5763 * First, try to find an existing mapping of the page by sf
5764 * buffer. sf_buf_invalidate_cache() modifies mapping and
5765 * flushes the cache.
5766 */
5767 if (sf_buf_invalidate_cache(m, oma))
5768 return;
5769 #endif
5770 /*
5771 * If the page is not mapped by an sf buffer, map the page
5772 * transiently and do the invalidation.
5773 */
5774 if (ma != oma) {
5775 pa = VM_PAGE_TO_PHYS(m);
5776 sched_pin();
5777 pc = get_pcpu();
5778 cmap2_pte2p = pc->pc_cmap2_pte2p;
5779 mtx_lock(&pc->pc_cmap_lock);
5780 if (pte2_load(cmap2_pte2p) != 0)
5781 panic("%s: CMAP2 busy", __func__);
5782 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW,
5783 vm_memattr_to_pte2(ma)));
5784 dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE);
5785 pte2_clear(cmap2_pte2p);
5786 tlb_flush((vm_offset_t)pc->pc_cmap2_addr);
5787 sched_unpin();
5788 mtx_unlock(&pc->pc_cmap_lock);
5789 }
5790 }
5791 
5792 /*
5793 * Miscellaneous support routines follow
5794 */
5795 
5796 /*
5797 * Returns TRUE if the given page is mapped individually or as part of
5798 * a 1mpage. Otherwise, returns FALSE.
5799 */
5800 boolean_t
5801 pmap_page_is_mapped(vm_page_t m)
5802 {
5803 boolean_t rv;
5804 
5805 if ((m->oflags & VPO_UNMANAGED) != 0)
5806 return (FALSE);
5807 rw_wlock(&pvh_global_lock);
5808 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5809 ((m->flags & PG_FICTITIOUS) == 0 &&
5810 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5811 rw_wunlock(&pvh_global_lock);
5812 return (rv);
5813 }
5814 
5815 /*
5816 * Returns true if the pmap's pv is one of the first
5817 * 16 pvs linked to from this page. This count may
5818 * be changed upwards or downwards in the future; it
5819 * is only necessary that true be returned for a small
5820 * subset of pmaps for proper page aging.
5821 */
5822 boolean_t
5823 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5824 {
5825 struct md_page *pvh;
5826 pv_entry_t pv;
5827 int loops = 0;
5828 boolean_t rv;
5829 
5830 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5831 ("%s: page %p is not managed", __func__, m));
5832 rv = FALSE;
5833 rw_wlock(&pvh_global_lock);
5834 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5835 if (PV_PMAP(pv) == pmap) {
5836 rv = TRUE;
5837 break;
5838 }
5839 loops++;
5840 if (loops >= 16)
5841 break;
5842 }
5843 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5844 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5845 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5846 if (PV_PMAP(pv) == pmap) {
5847 rv = TRUE;
5848 break;
5849 }
5850 loops++;
5851 if (loops >= 16)
5852 break;
5853 }
5854 }
5855 rw_wunlock(&pvh_global_lock);
5856 return (rv);
5857 }
5858 
5859 /*
5860 * pmap_zero_page zeros the specified hardware page by mapping
5861 * the page into KVM and using pagezero() to clear its contents.
5862 */
5863 void
5864 pmap_zero_page(vm_page_t m)
5865 {
5866 pt2_entry_t *cmap2_pte2p;
5867 struct pcpu *pc;
5868 
5869 sched_pin();
5870 pc = get_pcpu();
5871 cmap2_pte2p = pc->pc_cmap2_pte2p;
5872 mtx_lock(&pc->pc_cmap_lock);
5873 if (pte2_load(cmap2_pte2p) != 0)
5874 panic("%s: CMAP2 busy", __func__);
5875 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW,
5876 vm_page_pte2_attr(m)));
5877 pagezero(pc->pc_cmap2_addr);
5878 pte2_clear(cmap2_pte2p);
5879 tlb_flush((vm_offset_t)pc->pc_cmap2_addr);
5880 sched_unpin();
5881 mtx_unlock(&pc->pc_cmap_lock);
5882 }
5883 
5884 /*
5885 * pmap_zero_page_area zeros the specified hardware page by mapping
5886 * the page into KVM and using bzero to clear its contents.
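 * (When off is 0 and size is PAGE_SIZE, the optimized pagezero()
 * path below is taken instead of bzero().)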
5887 * 5888 * off and size may not cover an area beyond a single hardware page. 5889 */ 5890 void 5891 pmap_zero_page_area(vm_page_t m, int off, int size) 5892 { 5893 pt2_entry_t *cmap2_pte2p; 5894 struct pcpu *pc; 5895 5896 sched_pin(); 5897 pc = get_pcpu(); 5898 cmap2_pte2p = pc->pc_cmap2_pte2p; 5899 mtx_lock(&pc->pc_cmap_lock); 5900 if (pte2_load(cmap2_pte2p) != 0) 5901 panic("%s: CMAP2 busy", __func__); 5902 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5903 vm_page_pte2_attr(m))); 5904 if (off == 0 && size == PAGE_SIZE) 5905 pagezero(pc->pc_cmap2_addr); 5906 else 5907 bzero(pc->pc_cmap2_addr + off, size); 5908 pte2_clear(cmap2_pte2p); 5909 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5910 sched_unpin(); 5911 mtx_unlock(&pc->pc_cmap_lock); 5912 } 5913 5914 /* 5915 * pmap_copy_page copies the specified (machine independent) 5916 * page by mapping the page into virtual memory and using 5917 * bcopy to copy the page, one machine dependent page at a 5918 * time. 5919 */ 5920 void 5921 pmap_copy_page(vm_page_t src, vm_page_t dst) 5922 { 5923 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5924 struct pcpu *pc; 5925 5926 sched_pin(); 5927 pc = get_pcpu(); 5928 cmap1_pte2p = pc->pc_cmap1_pte2p; 5929 cmap2_pte2p = pc->pc_cmap2_pte2p; 5930 mtx_lock(&pc->pc_cmap_lock); 5931 if (pte2_load(cmap1_pte2p) != 0) 5932 panic("%s: CMAP1 busy", __func__); 5933 if (pte2_load(cmap2_pte2p) != 0) 5934 panic("%s: CMAP2 busy", __func__); 5935 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), 5936 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); 5937 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), 5938 PTE2_AP_KRW, vm_page_pte2_attr(dst))); 5939 bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); 5940 pte2_clear(cmap1_pte2p); 5941 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5942 pte2_clear(cmap2_pte2p); 5943 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5944 sched_unpin(); 5945 mtx_unlock(&pc->pc_cmap_lock); 5946 } 5947 5948 int unmapped_buf_allowed = 1; 5949 5950 void 5951 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5952 vm_offset_t b_offset, int xfersize) 5953 { 5954 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5955 vm_page_t a_pg, b_pg; 5956 char *a_cp, *b_cp; 5957 vm_offset_t a_pg_offset, b_pg_offset; 5958 struct pcpu *pc; 5959 int cnt; 5960 5961 sched_pin(); 5962 pc = get_pcpu(); 5963 cmap1_pte2p = pc->pc_cmap1_pte2p; 5964 cmap2_pte2p = pc->pc_cmap2_pte2p; 5965 mtx_lock(&pc->pc_cmap_lock); 5966 if (pte2_load(cmap1_pte2p) != 0) 5967 panic("pmap_copy_pages: CMAP1 busy"); 5968 if (pte2_load(cmap2_pte2p) != 0) 5969 panic("pmap_copy_pages: CMAP2 busy"); 5970 while (xfersize > 0) { 5971 a_pg = ma[a_offset >> PAGE_SHIFT]; 5972 a_pg_offset = a_offset & PAGE_MASK; 5973 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5974 b_pg = mb[b_offset >> PAGE_SHIFT]; 5975 b_pg_offset = b_offset & PAGE_MASK; 5976 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5977 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), 5978 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); 5979 tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); 5980 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), 5981 PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); 5982 tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); 5983 a_cp = pc->pc_cmap1_addr + a_pg_offset; 5984 b_cp = pc->pc_cmap2_addr + b_pg_offset; 5985 bcopy(a_cp, b_cp, cnt); 5986 a_offset += cnt; 5987 b_offset += cnt; 5988 xfersize -= cnt; 5989 } 5990 pte2_clear(cmap1_pte2p); 5991 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5992 pte2_clear(cmap2_pte2p); 
5993 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5994 sched_unpin(); 5995 mtx_unlock(&pc->pc_cmap_lock); 5996 } 5997 5998 vm_offset_t 5999 pmap_quick_enter_page(vm_page_t m) 6000 { 6001 struct pcpu *pc; 6002 pt2_entry_t *pte2p; 6003 6004 critical_enter(); 6005 pc = get_pcpu(); 6006 pte2p = pc->pc_qmap_pte2p; 6007 6008 KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); 6009 6010 pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6011 vm_page_pte2_attr(m))); 6012 return (pc->pc_qmap_addr); 6013 } 6014 6015 void 6016 pmap_quick_remove_page(vm_offset_t addr) 6017 { 6018 struct pcpu *pc; 6019 pt2_entry_t *pte2p; 6020 6021 pc = get_pcpu(); 6022 pte2p = pc->pc_qmap_pte2p; 6023 6024 KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); 6025 KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); 6026 6027 pte2_clear(pte2p); 6028 tlb_flush(pc->pc_qmap_addr); 6029 critical_exit(); 6030 } 6031 6032 /* 6033 * Copy the range specified by src_addr/len 6034 * from the source map to the range dst_addr/len 6035 * in the destination map. 6036 * 6037 * This routine is only advisory and need not do anything. 6038 */ 6039 void 6040 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 6041 vm_offset_t src_addr) 6042 { 6043 struct spglist free; 6044 vm_offset_t addr; 6045 vm_offset_t end_addr = src_addr + len; 6046 vm_offset_t nextva; 6047 6048 if (dst_addr != src_addr) 6049 return; 6050 6051 if (!pmap_is_current(src_pmap)) 6052 return; 6053 6054 rw_wlock(&pvh_global_lock); 6055 if (dst_pmap < src_pmap) { 6056 PMAP_LOCK(dst_pmap); 6057 PMAP_LOCK(src_pmap); 6058 } else { 6059 PMAP_LOCK(src_pmap); 6060 PMAP_LOCK(dst_pmap); 6061 } 6062 sched_pin(); 6063 for (addr = src_addr; addr < end_addr; addr = nextva) { 6064 pt2_entry_t *src_pte2p, *dst_pte2p; 6065 vm_page_t dst_mpt2pg, src_mpt2pg; 6066 pt1_entry_t src_pte1; 6067 u_int pte1_idx; 6068 6069 KASSERT(addr < VM_MAXUSER_ADDRESS, 6070 ("%s: invalid to pmap_copy page tables", __func__)); 6071 6072 nextva = pte1_trunc(addr + PTE1_SIZE); 6073 if (nextva < addr) 6074 nextva = end_addr; 6075 6076 pte1_idx = pte1_index(addr); 6077 src_pte1 = src_pmap->pm_pt1[pte1_idx]; 6078 if (pte1_is_section(src_pte1)) { 6079 if ((addr & PTE1_OFFSET) != 0 || 6080 (addr + PTE1_SIZE) > end_addr) 6081 continue; 6082 if (dst_pmap->pm_pt1[pte1_idx] == 0 && 6083 (!pte1_is_managed(src_pte1) || 6084 pmap_pv_insert_pte1(dst_pmap, addr, src_pte1, 6085 PMAP_ENTER_NORECLAIM))) { 6086 dst_pmap->pm_pt1[pte1_idx] = src_pte1 & 6087 ~PTE1_W; 6088 dst_pmap->pm_stats.resident_count += 6089 PTE1_SIZE / PAGE_SIZE; 6090 pmap_pte1_mappings++; 6091 } 6092 continue; 6093 } else if (!pte1_is_link(src_pte1)) 6094 continue; 6095 6096 src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); 6097 6098 /* 6099 * We leave PT2s to be linked from PT1 even if they are not 6100 * referenced until all PT2s in a page are without reference. 6101 * 6102 * QQQ: It could be changed ... 
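 * (In practice this means a PT2 may stay linked from the PT1 with a
 * zero wire count; the pt2_wirecount_get() check below simply skips
 * such tables.)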
6103 */
6104 #if 0 /* single_pt2_link_is_cleared */
6105 KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0,
6106 ("%s: source page table page is unused", __func__));
6107 #else
6108 if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0)
6109 continue;
6110 #endif
6111 if (nextva > end_addr)
6112 nextva = end_addr;
6113 
6114 src_pte2p = pt2map_entry(addr);
6115 while (addr < nextva) {
6116 pt2_entry_t temp_pte2;
6117 temp_pte2 = pte2_load(src_pte2p);
6118 /*
6119 * We only virtual-copy managed pages.
6120 */
6121 if (pte2_is_managed(temp_pte2)) {
6122 dst_mpt2pg = pmap_allocpte2(dst_pmap, addr,
6123 PMAP_ENTER_NOSLEEP);
6124 if (dst_mpt2pg == NULL)
6125 goto out;
6126 dst_pte2p = pmap_pte2_quick(dst_pmap, addr);
6127 if (!pte2_is_valid(pte2_load(dst_pte2p)) &&
6128 pmap_try_insert_pv_entry(dst_pmap, addr,
6129 PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) {
6130 /*
6131 * Clear the wired, modified, and
6132 * accessed (referenced) bits
6133 * during the copy.
6134 */
6135 temp_pte2 &= ~(PTE2_W | PTE2_A);
6136 temp_pte2 |= PTE2_NM;
6137 pte2_store(dst_pte2p, temp_pte2);
6138 dst_pmap->pm_stats.resident_count++;
6139 } else {
6140 SLIST_INIT(&free);
6141 if (pmap_unwire_pt2(dst_pmap, addr,
6142 dst_mpt2pg, &free)) {
6143 pmap_tlb_flush(dst_pmap, addr);
6144 vm_page_free_pages_toq(&free,
6145 false);
6146 }
6147 goto out;
6148 }
6149 if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >=
6150 pt2_wirecount_get(src_mpt2pg, pte1_idx))
6151 break;
6152 }
6153 addr += PAGE_SIZE;
6154 src_pte2p++;
6155 }
6156 }
6157 out:
6158 sched_unpin();
6159 rw_wunlock(&pvh_global_lock);
6160 PMAP_UNLOCK(src_pmap);
6161 PMAP_UNLOCK(dst_pmap);
6162 }
6163 
6164 /*
6165 * Increase the starting virtual address of the given mapping if a
6166 * different alignment might result in more section mappings.
6167 */
6168 void
6169 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6170 vm_offset_t *addr, vm_size_t size)
6171 {
6172 vm_offset_t pte1_offset;
6173 
6174 if (size < PTE1_SIZE)
6175 return;
6176 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6177 offset += ptoa(object->pg_color);
6178 pte1_offset = offset & PTE1_OFFSET;
6179 if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE ||
6180 (*addr & PTE1_OFFSET) == pte1_offset)
6181 return;
6182 if ((*addr & PTE1_OFFSET) < pte1_offset)
6183 *addr = pte1_trunc(*addr) + pte1_offset;
6184 else
6185 *addr = pte1_roundup(*addr) + pte1_offset;
6186 }
6187 
6188 void
6189 pmap_activate(struct thread *td)
6190 {
6191 pmap_t pmap, oldpmap;
6192 u_int cpuid, ttb;
6193 
6194 PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td));
6195 
6196 critical_enter();
6197 pmap = vmspace_pmap(td->td_proc->p_vmspace);
6198 oldpmap = PCPU_GET(curpmap);
6199 cpuid = PCPU_GET(cpuid);
6200 
6201 #if defined(SMP)
6202 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
6203 CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
6204 #else
6205 CPU_CLR(cpuid, &oldpmap->pm_active);
6206 CPU_SET(cpuid, &pmap->pm_active);
6207 #endif
6208 
6209 ttb = pmap_ttb_get(pmap);
6210 
6211 /*
6212 * pmap_activate is for the current thread on the current CPU.
6213 */
6214 td->td_pcb->pcb_pagedir = ttb;
6215 cp15_ttbr_set(ttb);
6216 PCPU_SET(curpmap, pmap);
6217 critical_exit();
6218 }
6219 
6220 /*
6221 * Perform the pmap work for mincore(2). If the page is not both referenced and
6222 * modified by this pmap, returns its physical address so that the caller can
6223 * find other mappings.
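 * (The physical address is reported through *pap only for managed
 * mappings; the MINCORE_* flags are returned in any case.)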
6224 */ 6225 int 6226 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 6227 { 6228 pt1_entry_t *pte1p, pte1; 6229 pt2_entry_t *pte2p, pte2; 6230 vm_paddr_t pa; 6231 bool managed; 6232 int val; 6233 6234 PMAP_LOCK(pmap); 6235 pte1p = pmap_pte1(pmap, addr); 6236 pte1 = pte1_load(pte1p); 6237 if (pte1_is_section(pte1)) { 6238 pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); 6239 managed = pte1_is_managed(pte1); 6240 val = MINCORE_SUPER | MINCORE_INCORE; 6241 if (pte1_is_dirty(pte1)) 6242 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6243 if (pte1 & PTE1_A) 6244 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6245 } else if (pte1_is_link(pte1)) { 6246 pte2p = pmap_pte2(pmap, addr); 6247 pte2 = pte2_load(pte2p); 6248 pmap_pte2_release(pte2p); 6249 pa = pte2_pa(pte2); 6250 managed = pte2_is_managed(pte2); 6251 val = MINCORE_INCORE; 6252 if (pte2_is_dirty(pte2)) 6253 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6254 if (pte2 & PTE2_A) 6255 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6256 } else { 6257 managed = false; 6258 val = 0; 6259 } 6260 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6261 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 6262 *pap = pa; 6263 } 6264 PMAP_UNLOCK(pmap); 6265 return (val); 6266 } 6267 6268 void 6269 pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) 6270 { 6271 vm_offset_t sva; 6272 uint32_t l2attr; 6273 6274 KASSERT((size & PAGE_MASK) == 0, 6275 ("%s: device mapping not page-sized", __func__)); 6276 6277 sva = va; 6278 l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); 6279 while (size != 0) { 6280 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); 6281 va += PAGE_SIZE; 6282 pa += PAGE_SIZE; 6283 size -= PAGE_SIZE; 6284 } 6285 tlb_flush_range(sva, va - sva); 6286 } 6287 6288 void 6289 pmap_kremove_device(vm_offset_t va, vm_size_t size) 6290 { 6291 vm_offset_t sva; 6292 6293 KASSERT((size & PAGE_MASK) == 0, 6294 ("%s: device mapping not page-sized", __func__)); 6295 6296 sva = va; 6297 while (size != 0) { 6298 pmap_kremove(va); 6299 va += PAGE_SIZE; 6300 size -= PAGE_SIZE; 6301 } 6302 tlb_flush_range(sva, va - sva); 6303 } 6304 6305 void 6306 pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) 6307 { 6308 6309 pcb->pcb_pagedir = pmap_ttb_get(pmap); 6310 } 6311 6312 6313 /* 6314 * Clean L1 data cache range by physical address. 6315 * The range must be within a single page. 6316 */ 6317 static void 6318 pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) 6319 { 6320 pt2_entry_t *cmap2_pte2p; 6321 struct pcpu *pc; 6322 6323 KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, 6324 ("%s: not on single page", __func__)); 6325 6326 sched_pin(); 6327 pc = get_pcpu(); 6328 cmap2_pte2p = pc->pc_cmap2_pte2p; 6329 mtx_lock(&pc->pc_cmap_lock); 6330 if (pte2_load(cmap2_pte2p) != 0) 6331 panic("%s: CMAP2 busy", __func__); 6332 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); 6333 dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size); 6334 pte2_clear(cmap2_pte2p); 6335 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6336 sched_unpin(); 6337 mtx_unlock(&pc->pc_cmap_lock); 6338 } 6339 6340 /* 6341 * Sync instruction cache range which is not mapped yet. 6342 */ 6343 void 6344 cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) 6345 { 6346 uint32_t len, offset; 6347 vm_page_t m; 6348 6349 /* Write back d-cache on given address range. 
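 * The not-yet-mapped range is walked page by page, as
 * pmap_dcache_wb_pou() creates a transient mapping for one page
 * at a time.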
*/
6350 offset = pa & PAGE_MASK;
6351 for ( ; size != 0; size -= len, pa += len, offset = 0) {
6352 len = min(PAGE_SIZE - offset, size);
6353 m = PHYS_TO_VM_PAGE(pa);
6354 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x",
6355 __func__, pa));
6356 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m));
6357 }
6358 /*
6359 * The I-cache is VIPT. The only way to flush all virtual mappings
6360 * of a given physical address is to invalidate the entire I-cache.
6361 */
6362 icache_inv_all();
6363 }
6364 
6365 void
6366 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size)
6367 {
6368 
6369 /* Write back d-cache on given address range. */
6370 if (va >= VM_MIN_KERNEL_ADDRESS) {
6371 dcache_wb_pou(va, size);
6372 } else {
6373 uint32_t len, offset;
6374 vm_paddr_t pa;
6375 vm_page_t m;
6376 
6377 offset = va & PAGE_MASK;
6378 for ( ; size != 0; size -= len, va += len, offset = 0) {
6379 pa = pmap_extract(pmap, va); /* offset is preserved */
6380 len = min(PAGE_SIZE - offset, size);
6381 m = PHYS_TO_VM_PAGE(pa);
6382 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x",
6383 __func__, pa));
6384 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m));
6385 }
6386 }
6387 /*
6388 * The I-cache is VIPT. The only way to flush all virtual mappings
6389 * of a given physical address is to invalidate the entire I-cache.
6390 */
6391 icache_inv_all();
6392 }
6393 
6394 /*
6395 * The implementation of pmap_fault() uses the IN_RANGE2() macro,
6396 * which depends on the fact that the given range size is a power of 2.
6397 */
6398 CTASSERT(powerof2(NB_IN_PT1));
6399 CTASSERT(powerof2(PT2MAP_SIZE));
6400 
6401 #define IN_RANGE2(addr, start, size) \
6402 ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1)))
6403 
6404 /*
6405 * Handle access and R/W emulation faults.
6406 */
6407 int
6408 pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode)
6409 {
6410 pt1_entry_t *pte1p, pte1;
6411 pt2_entry_t *pte2p, pte2;
6412 
6413 if (pmap == NULL)
6414 pmap = kernel_pmap;
6415 
6416 /*
6417 * In the kernel, we should never get an abort with a FAR lying in
6418 * the pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop
6419 * here, print a useful abort message, and possibly drop into the
6420 * debugger; otherwise it likely ends in a never-ending loop of aborts.
6421 */
6422 if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) {
6423 /*
6424 * All L1 tables should always be mapped and present.
6425 * However, we check only the current one here. For user mode,
6426 * only a permission abort from a malicious user is non-fatal,
6427 * as is an alignment abort, which may have higher priority.
6428 */
6429 if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) {
6430 CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x",
6431 __func__, pmap, pmap->pm_pt1, far);
6432 panic("%s: pm_pt1 abort", __func__);
6433 }
6434 return (KERN_INVALID_ADDRESS);
6435 }
6436 if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) {
6437 /*
6438 * PT2MAP should always be mapped and present in the current
6439 * L1 table. However, only existing L2 tables are mapped
6440 * in PT2MAP. For user mode, only an L2 translation abort or
6441 * a permission abort from a malicious user is non-fatal,
6442 * as is an alignment abort, which may have higher priority.
6443 */
6444 if (!usermode || (idx != FAULT_ALIGN &&
6445 idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) {
6446 CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x",
6447 __func__, pmap, PT2MAP, far);
6448 panic("%s: PT2MAP abort", __func__);
6449 }
6450 return (KERN_INVALID_ADDRESS);
6451 }
6452 
6453 /*
6454 * The pmap lock is used below for handling access and R/W emulation
6455 * aborts. They were handled by atomic operations before, so some
6456 * analysis of the new situation is needed to answer the following
6457 * question: Is it safe to use the lock even for these aborts?
6458 *
6459 * In general, two cases may happen:
6460 *
6461 * (1) Aborts while the pmap lock is already held - this should not
6462 * happen, as the pmap lock is not recursive. However, under the pmap
6463 * lock only internal kernel data should be accessed, and such data
6464 * should be mapped with the A bit set and the NM bit cleared. If a
6465 * double abort happens, then the mapping of the data which caused it
6466 * must be fixed. Further, all new mappings are always made with the
6467 * A bit set, and the bit can be cleared only on managed mappings.
6468 *
6469 * (2) Aborts while other lock(s) are held - this can already happen.
6470 * However, it makes no difference here whether it is an access or
6471 * R/W emulation abort, or some other abort.
6472 */
6473 
6474 PMAP_LOCK(pmap);
6475 #ifdef INVARIANTS
6476 pte1 = pte1_load(pmap_pte1(pmap, far));
6477 if (pte1_is_link(pte1)) {
6478 /*
6479 * Check in advance that the associated L2 page table is
6480 * mapped into the PT2MAP space. Note that a faulting access to
6481 * an unmapped L2 page table is caught by the more general check
6482 * above, where "far" is verified not to lie in the PT2MAP space.
6483 * Note also that the L1 page table and PT2TAB always exist and are mapped.
6484 */
6485 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far));
6486 if (!pte2_is_valid(pte2))
6487 panic("%s: missing L2 page table (%p, %#x)",
6488 __func__, pmap, far);
6489 }
6490 #endif
6491 #ifdef SMP
6492 /*
6493 * Special treatment is due to the break-before-make approach used
6494 * when a pte1 is updated for a userland mapping during section
6495 * promotion or demotion. If not caught here, pmap_enter() could
6496 * find a section mapping on the faulting address. That is not allowed.
6497 */
6498 if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) {
6499 PMAP_UNLOCK(pmap);
6500 return (KERN_SUCCESS);
6501 }
6502 #endif
6503 /*
6504 * Handle access bits for page and section. Note that the entry
6505 * is not in the TLB yet, so a TLB flush is not necessary.
6506 *
6507 * QQQ: This is hardware emulation, we do not call userret()
6508 * for aborts from user mode.
6509 */
6510 if (idx == FAULT_ACCESS_L2) {
6511 pte1 = pte1_load(pmap_pte1(pmap, far));
6512 if (pte1_is_link(pte1)) {
6513 /* L2 page table should exist and be mapped. */
6514 pte2p = pt2map_entry(far);
6515 pte2 = pte2_load(pte2p);
6516 if (pte2_is_valid(pte2)) {
6517 pte2_store(pte2p, pte2 | PTE2_A);
6518 PMAP_UNLOCK(pmap);
6519 return (KERN_SUCCESS);
6520 }
6521 } else {
6522 /*
6523 * We got an L2 access fault, but the PTE1 is not a link.
6524 * Probably some race happened; do nothing.
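 * (Returning KERN_SUCCESS just restarts the faulting instruction;
 * if a real problem remains, the abort will be taken again.)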
6525 */
6526 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L2 - pmap %#x far %#x",
6527 __func__, pmap, far);
6528 PMAP_UNLOCK(pmap);
6529 return (KERN_SUCCESS);
6530 }
6531 }
6532 if (idx == FAULT_ACCESS_L1) {
6533 pte1p = pmap_pte1(pmap, far);
6534 pte1 = pte1_load(pte1p);
6535 if (pte1_is_section(pte1)) {
6536 pte1_store(pte1p, pte1 | PTE1_A);
6537 PMAP_UNLOCK(pmap);
6538 return (KERN_SUCCESS);
6539 } else {
6540 /*
6541 * We got an L1 access fault, but the PTE1 is not a
6542 * section mapping. Probably some race happened; do nothing.
6543 */
6544 CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L1 - pmap %#x far %#x",
6545 __func__, pmap, far);
6546 PMAP_UNLOCK(pmap);
6547 return (KERN_SUCCESS);
6548 }
6549 }
6550 
6551 /*
6552 * Handle modify bits for page and section. Note that the modify
6553 * bit is emulated by software, so PTEx_RO is the software read-only
6554 * bit and the PTEx_NM flag is the real hardware read-only bit.
6555 *
6556 * QQQ: This is hardware emulation, we do not call userret()
6557 * for aborts from user mode.
6558 */
6559 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) {
6560 pte1 = pte1_load(pmap_pte1(pmap, far));
6561 if (pte1_is_link(pte1)) {
6562 /* L2 page table should exist and be mapped. */
6563 pte2p = pt2map_entry(far);
6564 pte2 = pte2_load(pte2p);
6565 if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) &&
6566 (pte2 & PTE2_NM)) {
6567 pte2_store(pte2p, pte2 & ~PTE2_NM);
6568 tlb_flush(trunc_page(far));
6569 PMAP_UNLOCK(pmap);
6570 return (KERN_SUCCESS);
6571 }
6572 } else {
6573 /*
6574 * We got an L2 permission fault, but the PTE1 is not
6575 * a link. Probably some race happened; do nothing.
6576 */
6577 CTR3(KTR_PMAP, "%s: FAULT_PERM_L2 - pmap %#x far %#x",
6578 __func__, pmap, far);
6579 PMAP_UNLOCK(pmap);
6580 return (KERN_SUCCESS);
6581 }
6582 }
6583 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) {
6584 pte1p = pmap_pte1(pmap, far);
6585 pte1 = pte1_load(pte1p);
6586 if (pte1_is_section(pte1)) {
6587 if (!(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) {
6588 pte1_store(pte1p, pte1 & ~PTE1_NM);
6589 tlb_flush(pte1_trunc(far));
6590 PMAP_UNLOCK(pmap);
6591 return (KERN_SUCCESS);
6592 }
6593 } else {
6594 /*
6595 * We got an L1 permission fault, but the PTE1 is not
6596 * a section mapping. Probably some race happened; do nothing.
6597 */
6598 CTR3(KTR_PMAP, "%s: FAULT_PERM_L1 - pmap %#x far %#x",
6599 __func__, pmap, far);
6600 PMAP_UNLOCK(pmap);
6601 return (KERN_SUCCESS);
6602 }
6603 }
6604 
6605 /*
6606 * QQQ: The previous code, mainly the fast handling of access and
6607 * modify bit aborts, could be moved to ASM. From here on we
6608 * deal with the aborts that have no fast path.
6609 */
6610 PMAP_UNLOCK(pmap);
6611 return (KERN_FAILURE);
6612 }
6613 
6614 #if defined(PMAP_DEBUG)
6615 /*
6616 * Reuse of the KVA used in the pmap_zero_page() function!
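 * The check goes through the same per-CPU CMAP2 window and
 * pc_cmap_lock as pmap_zero_page(), and likewise panics if the
 * window is busy.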
6617 */ 6618 static void 6619 pmap_zero_page_check(vm_page_t m) 6620 { 6621 pt2_entry_t *cmap2_pte2p; 6622 uint32_t *p, *end; 6623 struct pcpu *pc; 6624 6625 sched_pin(); 6626 pc = get_pcpu(); 6627 cmap2_pte2p = pc->pc_cmap2_pte2p; 6628 mtx_lock(&pc->pc_cmap_lock); 6629 if (pte2_load(cmap2_pte2p) != 0) 6630 panic("%s: CMAP2 busy", __func__); 6631 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6632 vm_page_pte2_attr(m))); 6633 end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); 6634 for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) 6635 if (*p != 0) 6636 panic("%s: page %p not zero, va: %p", __func__, m, 6637 pc->pc_cmap2_addr); 6638 pte2_clear(cmap2_pte2p); 6639 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6640 sched_unpin(); 6641 mtx_unlock(&pc->pc_cmap_lock); 6642 } 6643 6644 int 6645 pmap_pid_dump(int pid) 6646 { 6647 pmap_t pmap; 6648 struct proc *p; 6649 int npte2 = 0; 6650 int i, j, index; 6651 6652 sx_slock(&allproc_lock); 6653 FOREACH_PROC_IN_SYSTEM(p) { 6654 if (p->p_pid != pid || p->p_vmspace == NULL) 6655 continue; 6656 index = 0; 6657 pmap = vmspace_pmap(p->p_vmspace); 6658 for (i = 0; i < NPTE1_IN_PT1; i++) { 6659 pt1_entry_t pte1; 6660 pt2_entry_t *pte2p, pte2; 6661 vm_offset_t base, va; 6662 vm_paddr_t pa; 6663 vm_page_t m; 6664 6665 base = i << PTE1_SHIFT; 6666 pte1 = pte1_load(&pmap->pm_pt1[i]); 6667 6668 if (pte1_is_section(pte1)) { 6669 /* 6670 * QQQ: Do something here! 6671 */ 6672 } else if (pte1_is_link(pte1)) { 6673 for (j = 0; j < NPTE2_IN_PT2; j++) { 6674 va = base + (j << PAGE_SHIFT); 6675 if (va >= VM_MIN_KERNEL_ADDRESS) { 6676 if (index) { 6677 index = 0; 6678 printf("\n"); 6679 } 6680 sx_sunlock(&allproc_lock); 6681 return (npte2); 6682 } 6683 pte2p = pmap_pte2(pmap, va); 6684 pte2 = pte2_load(pte2p); 6685 pmap_pte2_release(pte2p); 6686 if (!pte2_is_valid(pte2)) 6687 continue; 6688 6689 pa = pte2_pa(pte2); 6690 m = PHYS_TO_VM_PAGE(pa); 6691 printf("va: 0x%x, pa: 0x%x, w: %d, " 6692 "f: 0x%x", va, pa, 6693 m->ref_count, m->flags); 6694 npte2++; 6695 index++; 6696 if (index >= 2) { 6697 index = 0; 6698 printf("\n"); 6699 } else { 6700 printf(" "); 6701 } 6702 } 6703 } 6704 } 6705 } 6706 sx_sunlock(&allproc_lock); 6707 return (npte2); 6708 } 6709 6710 #endif 6711 6712 #ifdef DDB 6713 static pt2_entry_t * 6714 pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) 6715 { 6716 pt1_entry_t pte1; 6717 vm_paddr_t pt2pg_pa; 6718 6719 pte1 = pte1_load(pmap_pte1(pmap, va)); 6720 if (!pte1_is_link(pte1)) 6721 return (NULL); 6722 6723 if (pmap_is_current(pmap)) 6724 return (pt2map_entry(va)); 6725 6726 /* Note that L2 page table size is not equal to PAGE_SIZE. 
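 * Four 1 KB L2 page tables share one 4 KB page, so the L2 table's
 * page-aligned physical address is mapped through PMAP3/PADDR3 and
 * the entry is then indexed within the whole page.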
*/ 6727 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 6728 if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { 6729 pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); 6730 #ifdef SMP 6731 PMAP3cpu = PCPU_GET(cpuid); 6732 #endif 6733 tlb_flush_local((vm_offset_t)PADDR3); 6734 } 6735 #ifdef SMP 6736 else if (PMAP3cpu != PCPU_GET(cpuid)) { 6737 PMAP3cpu = PCPU_GET(cpuid); 6738 tlb_flush_local((vm_offset_t)PADDR3); 6739 } 6740 #endif 6741 return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 6742 } 6743 6744 static void 6745 dump_pmap(pmap_t pmap) 6746 { 6747 6748 printf("pmap %p\n", pmap); 6749 printf(" pm_pt1: %p\n", pmap->pm_pt1); 6750 printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); 6751 printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); 6752 } 6753 6754 DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) 6755 { 6756 6757 pmap_t pmap; 6758 LIST_FOREACH(pmap, &allpmaps, pm_list) { 6759 dump_pmap(pmap); 6760 } 6761 } 6762 6763 static int 6764 pte2_class(pt2_entry_t pte2) 6765 { 6766 int cls; 6767 6768 cls = (pte2 >> 2) & 0x03; 6769 cls |= (pte2 >> 4) & 0x04; 6770 return (cls); 6771 } 6772 6773 static void 6774 dump_section(pmap_t pmap, uint32_t pte1_idx) 6775 { 6776 } 6777 6778 static void 6779 dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) 6780 { 6781 uint32_t i; 6782 vm_offset_t va; 6783 pt2_entry_t *pte2p, pte2; 6784 vm_page_t m; 6785 6786 va = pte1_idx << PTE1_SHIFT; 6787 pte2p = pmap_pte2_ddb(pmap, va); 6788 for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { 6789 pte2 = pte2_load(pte2p); 6790 if (pte2 == 0) 6791 continue; 6792 if (!pte2_is_valid(pte2)) { 6793 printf(" 0x%08X: 0x%08X", va, pte2); 6794 if (!invalid_ok) 6795 printf(" - not valid !!!"); 6796 printf("\n"); 6797 continue; 6798 } 6799 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 6800 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, 6801 pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); 6802 if (m != NULL) { 6803 printf(" v:%d w:%d f:0x%04X\n", m->valid, 6804 m->ref_count, m->flags); 6805 } else { 6806 printf("\n"); 6807 } 6808 } 6809 } 6810 6811 static __inline boolean_t 6812 is_pv_chunk_space(vm_offset_t va) 6813 { 6814 6815 if ((((vm_offset_t)pv_chunkbase) <= va) && 6816 (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) 6817 return (TRUE); 6818 return (FALSE); 6819 } 6820 6821 DB_SHOW_COMMAND(pmap, pmap_pmap_print) 6822 { 6823 /* XXX convert args. */ 6824 pmap_t pmap = (pmap_t)addr; 6825 pt1_entry_t pte1; 6826 pt2_entry_t pte2; 6827 vm_offset_t va, eva; 6828 vm_page_t m; 6829 uint32_t i; 6830 boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; 6831 6832 if (have_addr) { 6833 pmap_t pm; 6834 6835 LIST_FOREACH(pm, &allpmaps, pm_list) 6836 if (pm == pmap) break; 6837 if (pm == NULL) { 6838 printf("given pmap %p is not in allpmaps list\n", pmap); 6839 return; 6840 } 6841 } else 6842 pmap = PCPU_GET(curpmap); 6843 6844 eva = (modif[0] == 'u') ? 
VM_MAXUSER_ADDRESS : 0xFFFFFFFF; 6845 dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ 6846 6847 printf("pmap: 0x%08X\n", (uint32_t)pmap); 6848 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6849 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6850 6851 for(i = 0; i < NPTE1_IN_PT1; i++) { 6852 pte1 = pte1_load(&pmap->pm_pt1[i]); 6853 if (pte1 == 0) 6854 continue; 6855 va = i << PTE1_SHIFT; 6856 if (va >= eva) 6857 break; 6858 6859 if (pte1_is_section(pte1)) { 6860 printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, 6861 !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); 6862 dump_section(pmap, i); 6863 } else if (pte1_is_link(pte1)) { 6864 dump_link_ok = TRUE; 6865 invalid_ok = FALSE; 6866 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6867 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 6868 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", 6869 va, pte1, pte2, m); 6870 if (is_pv_chunk_space(va)) { 6871 printf(" - pv_chunk space"); 6872 if (dump_pv_chunk) 6873 invalid_ok = TRUE; 6874 else 6875 dump_link_ok = FALSE; 6876 } 6877 else if (m != NULL) 6878 printf(" w:%d w2:%u", m->ref_count, 6879 pt2_wirecount_get(m, pte1_index(va))); 6880 if (pte2 == 0) 6881 printf(" !!! pt2tab entry is ZERO"); 6882 else if (pte2_pa(pte1) != pte2_pa(pte2)) 6883 printf(" !!! pt2tab entry is DIFFERENT - m: %p", 6884 PHYS_TO_VM_PAGE(pte2_pa(pte2))); 6885 printf("\n"); 6886 if (dump_link_ok) 6887 dump_link(pmap, i, invalid_ok); 6888 } else 6889 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6890 } 6891 } 6892 6893 static void 6894 dump_pt2tab(pmap_t pmap) 6895 { 6896 uint32_t i; 6897 pt2_entry_t pte2; 6898 vm_offset_t va; 6899 vm_paddr_t pa; 6900 vm_page_t m; 6901 6902 printf("PT2TAB:\n"); 6903 for (i = 0; i < PT2TAB_ENTRIES; i++) { 6904 pte2 = pte2_load(&pmap->pm_pt2tab[i]); 6905 if (!pte2_is_valid(pte2)) 6906 continue; 6907 va = i << PT2TAB_SHIFT; 6908 pa = pte2_pa(pte2); 6909 m = PHYS_TO_VM_PAGE(pa); 6910 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, 6911 pte2_class(pte2), !!(pte2 & PTE2_S), m); 6912 if (m != NULL) 6913 printf(" , w: %d, f: 0x%04X pidx: %lld", 6914 m->ref_count, m->flags, m->pindex); 6915 printf("\n"); 6916 } 6917 } 6918 6919 DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) 6920 { 6921 /* XXX convert args. */ 6922 pmap_t pmap = (pmap_t)addr; 6923 pt1_entry_t pte1; 6924 pt2_entry_t pte2; 6925 vm_offset_t va; 6926 uint32_t i, start; 6927 6928 if (have_addr) { 6929 printf("supported only on current pmap\n"); 6930 return; 6931 } 6932 6933 pmap = PCPU_GET(curpmap); 6934 printf("curpmap: 0x%08X\n", (uint32_t)pmap); 6935 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6936 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6937 6938 start = pte1_index((vm_offset_t)PT2MAP); 6939 for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { 6940 pte1 = pte1_load(&pmap->pm_pt1[i]); 6941 if (pte1 == 0) 6942 continue; 6943 va = i << PTE1_SHIFT; 6944 if (pte1_is_section(pte1)) { 6945 printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, 6946 !!(pte1 & PTE1_S)); 6947 dump_section(pmap, i); 6948 } else if (pte1_is_link(pte1)) { 6949 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6950 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, 6951 pte1, pte2); 6952 if (pte2 == 0) 6953 printf(" !!! pt2tab entry is ZERO\n"); 6954 } else 6955 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6956 } 6957 dump_pt2tab(pmap); 6958 } 6959 #endif 6960