1 /*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 6 * Copyright (c) 2014-2016 Svatopluk Kraus <skra@FreeBSD.org> 7 * Copyright (c) 2014-2016 Michal Meloun <mmel@FreeBSD.org> 8 * All rights reserved. 9 * 10 * This code is derived from software contributed to Berkeley by 11 * the Systems Programming Group of the University of Utah Computer 12 * Science Department and William Jolitz of UUNET Technologies Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 39 */ 40 /*- 41 * Copyright (c) 2003 Networks Associates Technology, Inc. 42 * All rights reserved. 43 * 44 * This software was developed for the FreeBSD Project by Jake Burkholder, 45 * Safeport Network Services, and Network Associates Laboratories, the 46 * Security Research Division of Network Associates, Inc. under 47 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 48 * CHATS research program. 49 * 50 * Redistribution and use in source and binary forms, with or without 51 * modification, are permitted provided that the following conditions 52 * are met: 53 * 1. Redistributions of source code must retain the above copyright 54 * notice, this list of conditions and the following disclaimer. 55 * 2. Redistributions in binary form must reproduce the above copyright 56 * notice, this list of conditions and the following disclaimer in the 57 * documentation and/or other materials provided with the distribution. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 */ 71 72 #include <sys/cdefs.h> 73 __FBSDID("$FreeBSD$"); 74 75 /* 76 * Manages physical address maps. 77 * 78 * Since the information managed by this module is 79 * also stored by the logical address mapping module, 80 * this module may throw away valid virtual-to-physical 81 * mappings at almost any time. However, invalidations 82 * of virtual-to-physical mappings must be done as 83 * requested. 84 * 85 * In order to cope with hardware architectures which 86 * make virtual-to-physical map invalidates expensive, 87 * this module may delay invalidate or reduced protection 88 * operations until such time as they are actually 89 * necessary. This module is given full information as 90 * to which processors are currently using which maps, 91 * and to when physical maps must be made correct. 92 */ 93 94 #include "opt_vm.h" 95 #include "opt_pmap.h" 96 #include "opt_ddb.h" 97 98 #include <sys/param.h> 99 #include <sys/systm.h> 100 #include <sys/kernel.h> 101 #include <sys/ktr.h> 102 #include <sys/lock.h> 103 #include <sys/proc.h> 104 #include <sys/rwlock.h> 105 #include <sys/malloc.h> 106 #include <sys/vmmeter.h> 107 #include <sys/malloc.h> 108 #include <sys/mman.h> 109 #include <sys/sf_buf.h> 110 #include <sys/smp.h> 111 #include <sys/sched.h> 112 #include <sys/sysctl.h> 113 114 #ifdef DDB 115 #include <ddb/ddb.h> 116 #endif 117 118 #include <machine/physmem.h> 119 120 #include <vm/vm.h> 121 #include <vm/uma.h> 122 #include <vm/pmap.h> 123 #include <vm/vm_param.h> 124 #include <vm/vm_kern.h> 125 #include <vm/vm_object.h> 126 #include <vm/vm_map.h> 127 #include <vm/vm_page.h> 128 #include <vm/vm_pageout.h> 129 #include <vm/vm_phys.h> 130 #include <vm/vm_extern.h> 131 #include <vm/vm_reserv.h> 132 #include <sys/lock.h> 133 #include <sys/mutex.h> 134 135 #include <machine/md_var.h> 136 #include <machine/pmap_var.h> 137 #include <machine/cpu.h> 138 #include <machine/pcb.h> 139 #include <machine/sf_buf.h> 140 #ifdef SMP 141 #include <machine/smp.h> 142 #endif 143 #ifndef PMAP_SHPGPERPROC 144 #define PMAP_SHPGPERPROC 200 145 #endif 146 147 #ifndef DIAGNOSTIC 148 #define PMAP_INLINE __inline 149 #else 150 #define PMAP_INLINE 151 #endif 152 153 #ifdef PMAP_DEBUG 154 static void pmap_zero_page_check(vm_page_t m); 155 void pmap_debug(int level); 156 int pmap_pid_dump(int pid); 157 158 #define PDEBUG(_lev_,_stat_) \ 159 if (pmap_debug_level >= (_lev_)) \ 160 ((_stat_)) 161 #define dprintf printf 162 int pmap_debug_level = 1; 163 #else /* PMAP_DEBUG */ 164 #define PDEBUG(_lev_,_stat_) /* Nothing */ 165 #define dprintf(x, arg...) 166 #endif /* PMAP_DEBUG */ 167 168 /* 169 * Level 2 page tables map definion ('max' is excluded). 
 */
#define PT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define PT2V_MAX_ADDRESS	((vm_offset_t)PT2MAP + PT2MAP_SIZE)

#define UPT2V_MIN_ADDRESS	((vm_offset_t)PT2MAP)
#define UPT2V_MAX_ADDRESS \
    ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT)))

/*
 * Promotion to a 1MB (PTE1) page mapping requires that the corresponding
 * 4KB (PTE2) page mappings have identical settings for the following fields:
 */
#define PTE2_PROMOTE	(PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG |	\
			 PTE2_NX | PTE2_RO | PTE2_U | PTE2_W |		\
			 PTE2_ATTR_MASK)

#define PTE1_PROMOTE	(PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG |	\
			 PTE1_NX | PTE1_RO | PTE1_U | PTE1_W |		\
			 PTE1_ATTR_MASK)

#define ATTR_TO_L1(l2_attr)	((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \
				 (((l2_attr) & L2_C) ? L1_S_C : 0) | \
				 (((l2_attr) & L2_B) ? L1_S_B : 0) | \
				 (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \
				 (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \
				 (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \
				 (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \
				 (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \
				 (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \
				 (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \
				 (((l2_attr) & PTE2_W) ? PTE1_W : 0))

#define ATTR_TO_L2(l1_attr)	((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \
				 (((l1_attr) & L1_S_C) ? L2_C : 0) | \
				 (((l1_attr) & L1_S_B) ? L2_B : 0) | \
				 (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \
				 (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \
				 (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \
				 (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \
				 (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \
				 (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \
				 (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \
				 (((l1_attr) & PTE1_W) ? PTE2_W : 0))

/*
 * PTE2 descriptors creation macros.
 */
#define PTE2_ATTR_DEFAULT	vm_memattr_to_pte2(VM_MEMATTR_DEFAULT)
#define PTE2_ATTR_PT		vm_memattr_to_pte2(pt_memattr)

#define PTE2_KPT(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT)
#define PTE2_KPT_NG(pa)	PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT)

#define PTE2_KRW(pa)	PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT)
#define PTE2_KRO(pa)	PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT)

#define PV_STATS
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

/*
 * The boot_pt1 is used temporarily in the very early boot stage as the L1
 * page table. We can initialize many things with no memory allocation thanks
 * to its static allocation, which brings two main advantages:
 * (1) other cores can be started very simply,
 * (2) various boot loaders can be supported as their arguments can be
 *     processed in virtual address space and moved to a safe location
 *     before the first allocation happens.
 * The only disadvantage is that boot_pt1 is used only in the very early
 * boot stage. However, the table is uninitialized and thus lies in bss,
 * so the kernel image size is not affected.
 *
 * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and
 *      CPU suspend/resume game.
248 */ 249 extern pt1_entry_t boot_pt1[]; 250 251 vm_paddr_t base_pt1; 252 pt1_entry_t *kern_pt1; 253 pt2_entry_t *kern_pt2tab; 254 pt2_entry_t *PT2MAP; 255 256 static uint32_t ttb_flags; 257 static vm_memattr_t pt_memattr; 258 ttb_entry_t pmap_kern_ttb; 259 260 struct pmap kernel_pmap_store; 261 LIST_HEAD(pmaplist, pmap); 262 static struct pmaplist allpmaps; 263 static struct mtx allpmaps_lock; 264 265 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 266 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 267 268 static vm_offset_t kernel_vm_end_new; 269 vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; 270 vm_offset_t vm_max_kernel_address; 271 vm_paddr_t kernel_l1pa; 272 273 static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; 274 275 /* 276 * Data for the pv entry allocation mechanism 277 */ 278 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 279 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 280 static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ 281 static int shpgperproc = PMAP_SHPGPERPROC; 282 283 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 284 int pv_maxchunks; /* How many chunks we have KVA for */ 285 vm_offset_t pv_vafree; /* freelist stored in the PTE */ 286 287 vm_paddr_t first_managed_pa; 288 #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) 289 290 /* 291 * All those kernel PT submaps that BSD is so fond of 292 */ 293 caddr_t _tmppt = 0; 294 295 struct msgbuf *msgbufp = NULL; /* XXX move it to machdep.c */ 296 297 /* 298 * Crashdump maps. 299 */ 300 static caddr_t crashdumpmap; 301 302 static pt2_entry_t *PMAP1 = NULL, *PMAP2; 303 static pt2_entry_t *PADDR1 = NULL, *PADDR2; 304 #ifdef DDB 305 static pt2_entry_t *PMAP3; 306 static pt2_entry_t *PADDR3; 307 static int PMAP3cpu __unused; /* for SMP only */ 308 #endif 309 #ifdef SMP 310 static int PMAP1cpu; 311 static int PMAP1changedcpu; 312 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 313 &PMAP1changedcpu, 0, 314 "Number of times pmap_pte2_quick changed CPU with same PMAP1"); 315 #endif 316 static int PMAP1changed; 317 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 318 &PMAP1changed, 0, 319 "Number of times pmap_pte2_quick changed PMAP1"); 320 static int PMAP1unchanged; 321 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 322 &PMAP1unchanged, 0, 323 "Number of times pmap_pte2_quick didn't change PMAP1"); 324 static struct mtx PMAP2mutex; 325 326 static __inline void pt2_wirecount_init(vm_page_t m); 327 static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, 328 vm_offset_t va); 329 void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); 330 331 /* 332 * Function to set the debug level of the pmap code. 333 */ 334 #ifdef PMAP_DEBUG 335 void 336 pmap_debug(int level) 337 { 338 339 pmap_debug_level = level; 340 dprintf("pmap_debug: level=%d\n", pmap_debug_level); 341 } 342 #endif /* PMAP_DEBUG */ 343 344 /* 345 * This table must corespond with memory attribute configuration in vm.h. 346 * First entry is used for normal system mapping. 347 * 348 * Device memory is always marked as shared. 349 * Normal memory is shared only in SMP . 350 * Not outer shareable bits are not used yet. 351 * Class 6 cannot be used on ARM11. 
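 *
 * Each entry packs the memory type, inner cache mode, outer cache mode
 * and NOS bit using the TEXDEF_* shifts and the TEX() macro defined
 * below. Indices 0-4 match the VM_MEMATTR_* values checked by the
 * CTASSERTs below.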
352 */ 353 #define TEXDEF_TYPE_SHIFT 0 354 #define TEXDEF_TYPE_MASK 0x3 355 #define TEXDEF_INNER_SHIFT 2 356 #define TEXDEF_INNER_MASK 0x3 357 #define TEXDEF_OUTER_SHIFT 4 358 #define TEXDEF_OUTER_MASK 0x3 359 #define TEXDEF_NOS_SHIFT 6 360 #define TEXDEF_NOS_MASK 0x1 361 362 #define TEX(t, i, o, s) \ 363 ((t) << TEXDEF_TYPE_SHIFT) | \ 364 ((i) << TEXDEF_INNER_SHIFT) | \ 365 ((o) << TEXDEF_OUTER_SHIFT | \ 366 ((s) << TEXDEF_NOS_SHIFT)) 367 368 static uint32_t tex_class[8] = { 369 /* type inner cache outer cache */ 370 TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ 371 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ 372 TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ 373 TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ 374 TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ 375 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ 376 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ 377 TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ 378 }; 379 #undef TEX 380 381 static uint32_t pte2_attr_tab[8] = { 382 PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ 383 PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ 384 PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ 385 PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ 386 PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 387 0, /* 5 - NOT USED YET */ 388 0, /* 6 - NOT USED YET */ 389 0 /* 7 - NOT USED YET */ 390 }; 391 CTASSERT(VM_MEMATTR_WB_WA == 0); 392 CTASSERT(VM_MEMATTR_NOCACHE == 1); 393 CTASSERT(VM_MEMATTR_DEVICE == 2); 394 CTASSERT(VM_MEMATTR_SO == 3); 395 CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); 396 397 static inline uint32_t 398 vm_memattr_to_pte2(vm_memattr_t ma) 399 { 400 401 KASSERT((u_int)ma < 5, ("%s: bad vm_memattr_t %d", __func__, ma)); 402 return (pte2_attr_tab[(u_int)ma]); 403 } 404 405 static inline uint32_t 406 vm_page_pte2_attr(vm_page_t m) 407 { 408 409 return (vm_memattr_to_pte2(m->md.pat_mode)); 410 } 411 412 /* 413 * Convert TEX definition entry to TTB flags. 414 */ 415 static uint32_t 416 encode_ttb_flags(int idx) 417 { 418 uint32_t inner, outer, nos, reg; 419 420 inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & 421 TEXDEF_INNER_MASK; 422 outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & 423 TEXDEF_OUTER_MASK; 424 nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & 425 TEXDEF_NOS_MASK; 426 427 reg = nos << 5; 428 reg |= outer << 3; 429 if (cpuinfo.coherent_walk) 430 reg |= (inner & 0x1) << 6; 431 reg |= (inner & 0x2) >> 1; 432 #ifdef SMP 433 ARM_SMP_UP( 434 reg |= 1 << 1, 435 ); 436 #endif 437 return reg; 438 } 439 440 /* 441 * Set TEX remapping registers in current CPU. 442 */ 443 void 444 pmap_set_tex(void) 445 { 446 uint32_t prrr, nmrr; 447 uint32_t type, inner, outer, nos; 448 int i; 449 450 #ifdef PMAP_PTE_NOCACHE 451 /* XXX fixme */ 452 if (cpuinfo.coherent_walk) { 453 pt_memattr = VM_MEMATTR_WB_WA; 454 ttb_flags = encode_ttb_flags(0); 455 } 456 else { 457 pt_memattr = VM_MEMATTR_NOCACHE; 458 ttb_flags = encode_ttb_flags(1); 459 } 460 #else 461 pt_memattr = VM_MEMATTR_WB_WA; 462 ttb_flags = encode_ttb_flags(0); 463 #endif 464 465 prrr = 0; 466 nmrr = 0; 467 468 /* Build remapping register from TEX classes. 
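 * For class i, the loop below places the memory type in PRRR bits
 * [2i+1:2i] and the NOS bit in PRRR bit (24 + i); the inner cache mode
 * goes to NMRR bits [2i+1:2i] and the outer cache mode to NMRR bits
 * [2i+17:2i+16].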
*/ 469 for (i = 0; i < 8; i++) { 470 type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & 471 TEXDEF_TYPE_MASK; 472 inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & 473 TEXDEF_INNER_MASK; 474 outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & 475 TEXDEF_OUTER_MASK; 476 nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & 477 TEXDEF_NOS_MASK; 478 479 prrr |= type << (i * 2); 480 prrr |= nos << (i + 24); 481 nmrr |= inner << (i * 2); 482 nmrr |= outer << (i * 2 + 16); 483 } 484 /* Add shareable bits for device memory. */ 485 prrr |= PRRR_DS0 | PRRR_DS1; 486 487 /* Add shareable bits for normal memory in SMP case. */ 488 #ifdef SMP 489 ARM_SMP_UP( 490 prrr |= PRRR_NS1, 491 ); 492 #endif 493 cp15_prrr_set(prrr); 494 cp15_nmrr_set(nmrr); 495 496 /* Caches are disabled, so full TLB flush should be enough. */ 497 tlb_flush_all_local(); 498 } 499 500 /* 501 * Remap one vm_meattr class to another one. This can be useful as 502 * workaround for SOC errata, e.g. if devices must be accessed using 503 * SO memory class. 504 * 505 * !!! Please note that this function is absolutely last resort thing. 506 * It should not be used under normal circumstances. !!! 507 * 508 * Usage rules: 509 * - it shall be called after pmap_bootstrap_prepare() and before 510 * cpu_mp_start() (thus only on boot CPU). In practice, it's expected 511 * to be called from platform_attach() or platform_late_init(). 512 * 513 * - if remapping doesn't change caching mode, or until uncached class 514 * is remapped to any kind of cached one, then no other restriction exists. 515 * 516 * - if pmap_remap_vm_attr() changes caching mode, but both (original and 517 * remapped) remain cached, then caller is resposible for calling 518 * of dcache_wbinv_poc_all(). 519 * 520 * - remapping of any kind of cached class to uncached is not permitted. 521 */ 522 void 523 pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr) 524 { 525 int old_idx, new_idx; 526 527 /* Map VM memattrs to indexes to tex_class table. */ 528 old_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)old_attr]); 529 new_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)new_attr]); 530 531 /* Replace TEX attribute and apply it. */ 532 tex_class[old_idx] = tex_class[new_idx]; 533 pmap_set_tex(); 534 } 535 536 /* 537 * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words, 538 * KERNBASE is mapped by first L2 page table in L2 page table page. It 539 * meets same constrain due to PT2MAP being placed just under KERNBASE. 540 */ 541 CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0); 542 CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE); 543 544 /* 545 * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general. 546 * For now, anyhow, the following check must be fulfilled. 547 */ 548 CTASSERT(PAGE_SIZE == PTE2_SIZE); 549 /* 550 * We don't want to mess up MI code with all MMU and PMAP definitions, 551 * so some things, which depend on other ones, are defined independently. 552 * Now, it is time to check that we don't screw up something. 553 */ 554 CTASSERT(PDRSHIFT == PTE1_SHIFT); 555 /* 556 * Check L1 and L2 page table entries definitions consistency. 557 */ 558 CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1)); 559 CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2)); 560 /* 561 * Check L2 page tables page consistency. 562 */ 563 CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2)); 564 CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG); 565 /* 566 * Check PT2TAB consistency. 567 * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG. 
 * This division should leave no remainder.
 */
CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG));

/*
 * A PT2MAP magic.
 *
 * All level 2 page tables (PT2s) are mapped continuously and accordingly
 * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can
 * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page
 * must be used together, but not necessarily at once. The first PT2 in a
 * page must map things on a correctly aligned address and the others must
 * follow in the right order.
 */
#define NB_IN_PT2TAB	(PT2TAB_ENTRIES * sizeof(pt2_entry_t))
#define NPT2_IN_PT2TAB	(NB_IN_PT2TAB / NB_IN_PT2)
#define NPG_IN_PT2TAB	(NB_IN_PT2TAB / PAGE_SIZE)

/*
 * Check PT2TAB consistency.
 * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2.
 * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE.
 * Both divisions should leave no remainder.
 */
CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2));
CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE));
/*
 * The implementation was made general, however, with the assumption
 * below in mind. In case of another value of NPG_IN_PT2TAB,
 * the code should be rechecked once more.
 */
CTASSERT(NPG_IN_PT2TAB == 1);

/*
 * Get offset of PT2 in a page
 * associated with given PT1 index.
 */
static __inline u_int
page_pt2off(u_int pt1_idx)
{

	return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2);
}

/*
 * Get physical address of PT2
 * associated with given PT2s page and PT1 index.
 */
static __inline vm_paddr_t
page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx)
{

	return (pgpa + page_pt2off(pt1_idx));
}

/*
 * Get first entry of PT2
 * associated with given PT2s page and PT1 index.
 */
static __inline pt2_entry_t *
page_pt2(vm_offset_t pgva, u_int pt1_idx)
{

	return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx)));
}

/*
 * Get virtual address of PT2s page (mapped in PT2MAP)
 * which holds the PT2 which holds the entry that maps the given
 * virtual address.
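 *
 * Each PT2s page holds NPT2_IN_PG PT2s and each PT2 maps PTE1_SIZE of VA,
 * so one PT2s page covers NPT2_IN_PG * PTE1_SIZE of VA. Rounding va down
 * to that boundary below selects the proper page within PT2MAP.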
637 */ 638 static __inline vm_offset_t 639 pt2map_pt2pg(vm_offset_t va) 640 { 641 642 va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); 643 return ((vm_offset_t)pt2map_entry(va)); 644 } 645 646 /***************************************************************************** 647 * 648 * THREE pmap initialization milestones exist: 649 * 650 * locore.S 651 * -> fundamental init (including MMU) in ASM 652 * 653 * initarm() 654 * -> fundamental init continues in C 655 * -> first available physical address is known 656 * 657 * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) 658 * -> basic (safe) interface for physical address allocation is made 659 * -> basic (safe) interface for virtual mapping is made 660 * -> limited not SMP coherent work is possible 661 * 662 * -> more fundamental init continues in C 663 * -> locks and some more things are available 664 * -> all fundamental allocations and mappings are done 665 * 666 * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) 667 * -> phys_avail[] and virtual_avail is set 668 * -> control is passed to vm subsystem 669 * -> physical and virtual address allocation are off limit 670 * -> low level mapping functions, some SMP coherent, 671 * are available, which cannot be used before vm subsystem 672 * is being inited 673 * 674 * mi_startup() 675 * -> vm subsystem is being inited 676 * 677 * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) 678 * -> pmap is fully inited 679 * 680 *****************************************************************************/ 681 682 /***************************************************************************** 683 * 684 * PMAP first stage initialization and utility functions 685 * for pre-bootstrap epoch. 686 * 687 * After pmap_bootstrap_prepare() is called, the following functions 688 * can be used: 689 * 690 * (1) strictly only for this stage functions for physical page allocations, 691 * virtual space allocations, and mappings: 692 * 693 * vm_paddr_t pmap_preboot_get_pages(u_int num); 694 * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); 695 * vm_offset_t pmap_preboot_reserve_pages(u_int num); 696 * vm_offset_t pmap_preboot_get_vpages(u_int num); 697 * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 698 * vm_prot_t prot, vm_memattr_t attr); 699 * 700 * (2) for all stages: 701 * 702 * vm_paddr_t pmap_kextract(vm_offset_t va); 703 * 704 * NOTE: This is not SMP coherent stage. 705 * 706 *****************************************************************************/ 707 708 #define KERNEL_P2V(pa) \ 709 ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) 710 #define KERNEL_V2P(va) \ 711 ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) 712 713 static vm_paddr_t last_paddr; 714 715 /* 716 * Pre-bootstrap epoch page allocator. 717 */ 718 vm_paddr_t 719 pmap_preboot_get_pages(u_int num) 720 { 721 vm_paddr_t ret; 722 723 ret = last_paddr; 724 last_paddr += num * PAGE_SIZE; 725 726 return (ret); 727 } 728 729 /* 730 * The fundamental initialization of PMAP stuff. 731 * 732 * Some things already happened in locore.S and some things could happen 733 * before pmap_bootstrap_prepare() is called, so let's recall what is done: 734 * 1. Caches are disabled. 735 * 2. We are running on virtual addresses already with 'boot_pt1' 736 * as L1 page table. 737 * 3. So far, all virtual addresses can be converted to physical ones and 738 * vice versa by the following macros: 739 * KERNEL_P2V(pa) .... physical to virtual ones, 740 * KERNEL_V2P(va) .... 
virtual to physical ones. 741 * 742 * What is done herein: 743 * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. 744 * 2. PT2MAP magic is brought to live. 745 * 3. Basic preboot functions for page allocations and mappings can be used. 746 * 4. Everything is prepared for L1 cache enabling. 747 * 748 * Variations: 749 * 1. To use second TTB register, so kernel and users page tables will be 750 * separated. This way process forking - pmap_pinit() - could be faster, 751 * it saves physical pages and KVA per a process, and it's simple change. 752 * However, it will lead, due to hardware matter, to the following: 753 * (a) 2G space for kernel and 2G space for users. 754 * (b) 1G space for kernel in low addresses and 3G for users above it. 755 * A question is: Is the case (b) really an option? Note that case (b) 756 * does save neither physical memory and KVA. 757 */ 758 void 759 pmap_bootstrap_prepare(vm_paddr_t last) 760 { 761 vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; 762 vm_offset_t pt2pg_va; 763 pt1_entry_t *pte1p; 764 pt2_entry_t *pte2p; 765 u_int i; 766 uint32_t l1_attr; 767 768 /* 769 * Now, we are going to make real kernel mapping. Note that we are 770 * already running on some mapping made in locore.S and we expect 771 * that it's large enough to ensure nofault access to physical memory 772 * allocated herein before switch. 773 * 774 * As kernel image and everything needed before are and will be mapped 775 * by section mappings, we align last physical address to PTE1_SIZE. 776 */ 777 last_paddr = pte1_roundup(last); 778 779 /* 780 * Allocate and zero page(s) for kernel L1 page table. 781 * 782 * Note that it's first allocation on space which was PTE1_SIZE 783 * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. 784 */ 785 base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); 786 kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); 787 bzero((void*)kern_pt1, NB_IN_PT1); 788 pte1_sync_range(kern_pt1, NB_IN_PT1); 789 790 /* Allocate and zero page(s) for kernel PT2TAB. */ 791 pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); 792 kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); 793 bzero(kern_pt2tab, NB_IN_PT2TAB); 794 pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); 795 796 /* Allocate and zero page(s) for kernel L2 page tables. */ 797 pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); 798 pt2pg_va = KERNEL_P2V(pt2pg_pa); 799 size = NKPT2PG * PAGE_SIZE; 800 bzero((void*)pt2pg_va, size); 801 pte2_sync_range((pt2_entry_t *)pt2pg_va, size); 802 803 /* 804 * Add a physical memory segment (vm_phys_seg) corresponding to the 805 * preallocated pages for kernel L2 page tables so that vm_page 806 * structures representing these pages will be created. The vm_page 807 * structures are required for promotion of the corresponding kernel 808 * virtual addresses to section mappings. 809 */ 810 vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); 811 812 /* 813 * Insert allocated L2 page table pages to PT2TAB and make 814 * link to all PT2s in L1 page table. See how kernel_vm_end 815 * is initialized. 816 * 817 * We play simple and safe. So every KVA will have underlaying 818 * L2 page table, even kernel image mapped by sections. 819 */ 820 pte2p = kern_pt2tab_entry(KERNBASE); 821 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) 822 pt2tab_store(pte2p++, PTE2_KPT(pa)); 823 824 pte1p = kern_pte1(KERNBASE); 825 for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) 826 pte1_store(pte1p++, PTE1_LINK(pa)); 827 828 /* Make section mappings for kernel. 
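 *
 * Each PTE1 stored below maps a 1MB section with kernel read-write
 * permission and the default memory attributes, covering the kernel
 * from KERNBASE up to 'last'.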
*/ 829 l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); 830 pte1p = kern_pte1(KERNBASE); 831 for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) 832 pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); 833 834 /* 835 * Get free and aligned space for PT2MAP and make L1 page table links 836 * to L2 page tables held in PT2TAB. 837 * 838 * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t 839 * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus 840 * each entry in PT2TAB maps all PT2s in a page. This implies that 841 * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. 842 */ 843 PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); 844 pte1p = kern_pte1((vm_offset_t)PT2MAP); 845 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 846 pte1_store(pte1p++, PTE1_LINK(pa)); 847 } 848 849 /* 850 * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. 851 * Each pmap will hold own PT2TAB, so the mapping should be not global. 852 */ 853 pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); 854 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 855 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 856 } 857 858 /* 859 * Choose correct L2 page table and make mappings for allocations 860 * made herein which replaces temporary locore.S mappings after a while. 861 * Note that PT2MAP cannot be used until we switch to kern_pt1. 862 * 863 * Note, that these allocations started aligned on 1M section and 864 * kernel PT1 was allocated first. Making of mappings must follow 865 * order of physical allocations as we've used KERNEL_P2V() macro 866 * for virtual addresses resolution. 867 */ 868 pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); 869 pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); 870 871 pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); 872 873 /* Make mapping for kernel L1 page table. */ 874 for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) 875 pte2_store(pte2p++, PTE2_KPT(pa)); 876 877 /* Make mapping for kernel PT2TAB. */ 878 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) 879 pte2_store(pte2p++, PTE2_KPT(pa)); 880 881 /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ 882 pmap_kern_ttb = base_pt1 | ttb_flags; 883 cpuinfo_reinit_mmu(pmap_kern_ttb); 884 /* 885 * Initialize the first available KVA. As kernel image is mapped by 886 * sections, we are leaving some gap behind. 887 */ 888 virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; 889 } 890 891 /* 892 * Setup L2 page table page for given KVA. 893 * Used in pre-bootstrap epoch. 894 * 895 * Note that we have allocated NKPT2PG pages for L2 page tables in advance 896 * and used them for mapping KVA starting from KERNBASE. However, this is not 897 * enough. Vectors and devices need L2 page tables too. Note that they are 898 * even above VM_MAX_KERNEL_ADDRESS. 899 */ 900 static __inline vm_paddr_t 901 pmap_preboot_pt2pg_setup(vm_offset_t va) 902 { 903 pt2_entry_t *pte2p, pte2; 904 vm_paddr_t pt2pg_pa; 905 906 /* Get associated entry in PT2TAB. */ 907 pte2p = kern_pt2tab_entry(va); 908 909 /* Just return, if PT2s page exists already. */ 910 pte2 = pt2tab_load(pte2p); 911 if (pte2_is_valid(pte2)) 912 return (pte2_pa(pte2)); 913 914 KASSERT(va >= VM_MAX_KERNEL_ADDRESS, 915 ("%s: NKPT2PG too small", __func__)); 916 917 /* 918 * Allocate page for PT2s and insert it to PT2TAB. 919 * In other words, map it into PT2MAP space. 
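 *
 * Note that the new page provides all NPT2_IN_PG PT2s that belong to
 * this PT2TAB slot; they are zeroed (made invalid) below and linked
 * into the L1 page table one by one by pmap_preboot_pt2_setup().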
920 */ 921 pt2pg_pa = pmap_preboot_get_pages(1); 922 pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); 923 924 /* Zero all PT2s in allocated page. */ 925 bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); 926 pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); 927 928 return (pt2pg_pa); 929 } 930 931 /* 932 * Setup L2 page table for given KVA. 933 * Used in pre-bootstrap epoch. 934 */ 935 static void 936 pmap_preboot_pt2_setup(vm_offset_t va) 937 { 938 pt1_entry_t *pte1p; 939 vm_paddr_t pt2pg_pa, pt2_pa; 940 941 /* Setup PT2's page. */ 942 pt2pg_pa = pmap_preboot_pt2pg_setup(va); 943 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); 944 945 /* Insert PT2 to PT1. */ 946 pte1p = kern_pte1(va); 947 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 948 } 949 950 /* 951 * Get L2 page entry associated with given KVA. 952 * Used in pre-bootstrap epoch. 953 */ 954 static __inline pt2_entry_t* 955 pmap_preboot_vtopte2(vm_offset_t va) 956 { 957 pt1_entry_t *pte1p; 958 959 /* Setup PT2 if needed. */ 960 pte1p = kern_pte1(va); 961 if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ 962 pmap_preboot_pt2_setup(va); 963 964 return (pt2map_entry(va)); 965 } 966 967 /* 968 * Pre-bootstrap epoch page(s) mapping(s). 969 */ 970 void 971 pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) 972 { 973 u_int i; 974 pt2_entry_t *pte2p; 975 976 /* Map all the pages. */ 977 for (i = 0; i < num; i++) { 978 pte2p = pmap_preboot_vtopte2(va); 979 pte2_store(pte2p, PTE2_KRW(pa)); 980 va += PAGE_SIZE; 981 pa += PAGE_SIZE; 982 } 983 } 984 985 /* 986 * Pre-bootstrap epoch virtual space alocator. 987 */ 988 vm_offset_t 989 pmap_preboot_reserve_pages(u_int num) 990 { 991 u_int i; 992 vm_offset_t start, va; 993 pt2_entry_t *pte2p; 994 995 /* Allocate virtual space. */ 996 start = va = virtual_avail; 997 virtual_avail += num * PAGE_SIZE; 998 999 /* Zero the mapping. */ 1000 for (i = 0; i < num; i++) { 1001 pte2p = pmap_preboot_vtopte2(va); 1002 pte2_store(pte2p, 0); 1003 va += PAGE_SIZE; 1004 } 1005 1006 return (start); 1007 } 1008 1009 /* 1010 * Pre-bootstrap epoch page(s) allocation and mapping(s). 1011 */ 1012 vm_offset_t 1013 pmap_preboot_get_vpages(u_int num) 1014 { 1015 vm_paddr_t pa; 1016 vm_offset_t va; 1017 1018 /* Allocate physical page(s). */ 1019 pa = pmap_preboot_get_pages(num); 1020 1021 /* Allocate virtual space. */ 1022 va = virtual_avail; 1023 virtual_avail += num * PAGE_SIZE; 1024 1025 /* Map and zero all. */ 1026 pmap_preboot_map_pages(pa, va, num); 1027 bzero((void *)va, num * PAGE_SIZE); 1028 1029 return (va); 1030 } 1031 1032 /* 1033 * Pre-bootstrap epoch page mapping(s) with attributes. 1034 */ 1035 void 1036 pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, 1037 vm_prot_t prot, vm_memattr_t attr) 1038 { 1039 u_int num; 1040 u_int l1_attr, l1_prot, l2_prot, l2_attr; 1041 pt1_entry_t *pte1p; 1042 pt2_entry_t *pte2p; 1043 1044 l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; 1045 l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; 1046 l2_attr = vm_memattr_to_pte2(attr); 1047 l1_prot = ATTR_TO_L1(l2_prot); 1048 l1_attr = ATTR_TO_L1(l2_attr); 1049 1050 /* Map all the pages. 
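 * A 1MB PTE1 section is used whenever both va and pa are section
 * aligned and at least PTE1_SIZE remains; otherwise 4KB PTE2 mappings
 * are used.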
*/ 1051 num = round_page(size); 1052 while (num > 0) { 1053 if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { 1054 pte1p = kern_pte1(va); 1055 pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); 1056 va += PTE1_SIZE; 1057 pa += PTE1_SIZE; 1058 num -= PTE1_SIZE; 1059 } else { 1060 pte2p = pmap_preboot_vtopte2(va); 1061 pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); 1062 va += PAGE_SIZE; 1063 pa += PAGE_SIZE; 1064 num -= PAGE_SIZE; 1065 } 1066 } 1067 } 1068 1069 /* 1070 * Extract from the kernel page table the physical address 1071 * that is mapped by the given virtual address "va". 1072 */ 1073 vm_paddr_t 1074 pmap_kextract(vm_offset_t va) 1075 { 1076 vm_paddr_t pa; 1077 pt1_entry_t pte1; 1078 pt2_entry_t pte2; 1079 1080 pte1 = pte1_load(kern_pte1(va)); 1081 if (pte1_is_section(pte1)) { 1082 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1083 } else if (pte1_is_link(pte1)) { 1084 /* 1085 * We should beware of concurrent promotion that changes 1086 * pte1 at this point. However, it's not a problem as PT2 1087 * page is preserved by promotion in PT2TAB. So even if 1088 * it happens, using of PT2MAP is still safe. 1089 * 1090 * QQQ: However, concurrent removing is a problem which 1091 * ends in abort on PT2MAP space. Locking must be used 1092 * to deal with this. 1093 */ 1094 pte2 = pte2_load(pt2map_entry(va)); 1095 pa = pte2_pa(pte2) | (va & PTE2_OFFSET); 1096 } 1097 else { 1098 panic("%s: va %#x pte1 %#x", __func__, va, pte1); 1099 } 1100 return (pa); 1101 } 1102 1103 /* 1104 * Extract from the kernel page table the physical address 1105 * that is mapped by the given virtual address "va". Also 1106 * return L2 page table entry which maps the address. 1107 * 1108 * This is only intended to be used for panic dumps. 1109 */ 1110 vm_paddr_t 1111 pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) 1112 { 1113 vm_paddr_t pa; 1114 pt1_entry_t pte1; 1115 pt2_entry_t pte2; 1116 1117 pte1 = pte1_load(kern_pte1(va)); 1118 if (pte1_is_section(pte1)) { 1119 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1120 pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; 1121 } else if (pte1_is_link(pte1)) { 1122 pte2 = pte2_load(pt2map_entry(va)); 1123 pa = pte2_pa(pte2); 1124 } else { 1125 pte2 = 0; 1126 pa = 0; 1127 } 1128 if (pte2p != NULL) 1129 *pte2p = pte2; 1130 return (pa); 1131 } 1132 1133 /***************************************************************************** 1134 * 1135 * PMAP second stage initialization and utility functions 1136 * for bootstrap epoch. 1137 * 1138 * After pmap_bootstrap() is called, the following functions for 1139 * mappings can be used: 1140 * 1141 * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); 1142 * void pmap_kremove(vm_offset_t va); 1143 * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, 1144 * int prot); 1145 * 1146 * NOTE: This is not SMP coherent stage. And physical page allocation is not 1147 * allowed during this stage. 1148 * 1149 *****************************************************************************/ 1150 1151 /* 1152 * Initialize kernel PMAP locks and lists, kernel_pmap itself, and 1153 * reserve various virtual spaces for temporary mappings. 1154 */ 1155 void 1156 pmap_bootstrap(vm_offset_t firstaddr) 1157 { 1158 pt2_entry_t *unused __unused; 1159 struct pcpu *pc; 1160 1161 /* 1162 * Initialize the kernel pmap (which is statically allocated). 
1163 */ 1164 PMAP_LOCK_INIT(kernel_pmap); 1165 kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ 1166 kernel_pmap->pm_pt1 = kern_pt1; 1167 kernel_pmap->pm_pt2tab = kern_pt2tab; 1168 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 1169 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1170 1171 /* 1172 * Initialize the global pv list lock. 1173 */ 1174 rw_init(&pvh_global_lock, "pmap pv global"); 1175 1176 LIST_INIT(&allpmaps); 1177 1178 /* 1179 * Request a spin mutex so that changes to allpmaps cannot be 1180 * preempted by smp_rendezvous_cpus(). 1181 */ 1182 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 1183 mtx_lock_spin(&allpmaps_lock); 1184 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 1185 mtx_unlock_spin(&allpmaps_lock); 1186 1187 /* 1188 * Reserve some special page table entries/VA space for temporary 1189 * mapping of pages. 1190 */ 1191 #define SYSMAP(c, p, v, n) do { \ 1192 v = (c)pmap_preboot_reserve_pages(n); \ 1193 p = pt2map_entry((vm_offset_t)v); \ 1194 } while (0) 1195 1196 /* 1197 * Local CMAP1/CMAP2 are used for zeroing and copying pages. 1198 * Local CMAP2 is also used for data cache cleaning. 1199 */ 1200 pc = get_pcpu(); 1201 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1202 SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); 1203 SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); 1204 SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); 1205 1206 /* 1207 * Crashdump maps. 1208 */ 1209 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); 1210 1211 /* 1212 * _tmppt is used for reading arbitrary physical pages via /dev/mem. 1213 */ 1214 SYSMAP(caddr_t, unused, _tmppt, 1); 1215 1216 /* 1217 * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), 1218 * respectively. PADDR3 is used by pmap_pte2_ddb(). 1219 */ 1220 SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); 1221 SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); 1222 #ifdef DDB 1223 SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); 1224 #endif 1225 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 1226 1227 /* 1228 * Note that in very short time in initarm(), we are going to 1229 * initialize phys_avail[] array and no further page allocation 1230 * can happen after that until vm subsystem will be initialized. 1231 */ 1232 kernel_vm_end_new = kernel_vm_end; 1233 virtual_end = vm_max_kernel_address; 1234 } 1235 1236 static void 1237 pmap_init_reserved_pages(void) 1238 { 1239 struct pcpu *pc; 1240 vm_offset_t pages; 1241 int i; 1242 1243 CPU_FOREACH(i) { 1244 pc = pcpu_find(i); 1245 /* 1246 * Skip if the mapping has already been initialized, 1247 * i.e. this is the BSP. 1248 */ 1249 if (pc->pc_cmap1_addr != 0) 1250 continue; 1251 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 1252 pages = kva_alloc(PAGE_SIZE * 3); 1253 if (pages == 0) 1254 panic("%s: unable to allocate KVA", __func__); 1255 pc->pc_cmap1_pte2p = pt2map_entry(pages); 1256 pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE); 1257 pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2)); 1258 pc->pc_cmap1_addr = (caddr_t)pages; 1259 pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE); 1260 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); 1261 } 1262 } 1263 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 1264 1265 /* 1266 * The function can already be use in second initialization stage. 1267 * As such, the function DOES NOT call pmap_growkernel() where PT2 1268 * allocation can happen. So if used, be sure that PT2 for given 1269 * virtual address is allocated already! 
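 * (For example, KVA returned by pmap_preboot_reserve_pages() or
 * pmap_preboot_get_vpages() already has its PT2 set up.)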
1270 * 1271 * Add a wired page to the kva. 1272 * Note: not SMP coherent. 1273 */ 1274 static __inline void 1275 pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, 1276 uint32_t attr) 1277 { 1278 pt1_entry_t *pte1p; 1279 pt2_entry_t *pte2p; 1280 1281 pte1p = kern_pte1(va); 1282 if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ 1283 /* 1284 * This is a very low level function, so PT2 and particularly 1285 * PT2PG associated with given virtual address must be already 1286 * allocated. It's a pain mainly during pmap initialization 1287 * stage. However, called after pmap initialization with 1288 * virtual address not under kernel_vm_end will lead to 1289 * the same misery. 1290 */ 1291 if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) 1292 panic("%s: kernel PT2 not allocated!", __func__); 1293 } 1294 1295 pte2p = pt2map_entry(va); 1296 pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); 1297 } 1298 1299 PMAP_INLINE void 1300 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1301 { 1302 1303 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); 1304 } 1305 1306 /* 1307 * Remove a page from the kernel pagetables. 1308 * Note: not SMP coherent. 1309 */ 1310 PMAP_INLINE void 1311 pmap_kremove(vm_offset_t va) 1312 { 1313 pt2_entry_t *pte2p; 1314 1315 pte2p = pt2map_entry(va); 1316 pte2_clear(pte2p); 1317 } 1318 1319 /* 1320 * Share new kernel PT2PG with all pmaps. 1321 * The caller is responsible for maintaining TLB consistency. 1322 */ 1323 static void 1324 pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) 1325 { 1326 pmap_t pmap; 1327 pt2_entry_t *pte2p; 1328 1329 mtx_lock_spin(&allpmaps_lock); 1330 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1331 pte2p = pmap_pt2tab_entry(pmap, va); 1332 pt2tab_store(pte2p, npte2); 1333 } 1334 mtx_unlock_spin(&allpmaps_lock); 1335 } 1336 1337 /* 1338 * Share new kernel PTE1 with all pmaps. 1339 * The caller is responsible for maintaining TLB consistency. 1340 */ 1341 static void 1342 pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) 1343 { 1344 pmap_t pmap; 1345 pt1_entry_t *pte1p; 1346 1347 mtx_lock_spin(&allpmaps_lock); 1348 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1349 pte1p = pmap_pte1(pmap, va); 1350 pte1_store(pte1p, npte1); 1351 } 1352 mtx_unlock_spin(&allpmaps_lock); 1353 } 1354 1355 /* 1356 * Used to map a range of physical addresses into kernel 1357 * virtual address space. 1358 * 1359 * The value passed in '*virt' is a suggested virtual address for 1360 * the mapping. Architectures which can support a direct-mapped 1361 * physical to virtual region can return the appropriate address 1362 * within that region, leaving '*virt' unchanged. Other 1363 * architectures should map the pages starting at '*virt' and 1364 * update '*virt' with the first usable address after the mapped 1365 * region. 1366 * 1367 * NOTE: Read the comments above pmap_kenter_prot_attr() as 1368 * the function is used herein! 1369 */ 1370 vm_offset_t 1371 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1372 { 1373 vm_offset_t va, sva; 1374 vm_paddr_t pte1_offset; 1375 pt1_entry_t npte1; 1376 uint32_t l1prot, l2prot; 1377 uint32_t l1attr, l2attr; 1378 1379 PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," 1380 " prot = %d\n", __func__, *virt, start, end, end - start, prot)); 1381 1382 l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR; 1383 l2prot |= (prot & VM_PROT_EXECUTE) ? 
PTE2_X : PTE2_NX; 1384 l1prot = ATTR_TO_L1(l2prot); 1385 1386 l2attr = PTE2_ATTR_DEFAULT; 1387 l1attr = ATTR_TO_L1(l2attr); 1388 1389 va = *virt; 1390 /* 1391 * Does the physical address range's size and alignment permit at 1392 * least one section mapping to be created? 1393 */ 1394 pte1_offset = start & PTE1_OFFSET; 1395 if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= 1396 PTE1_SIZE) { 1397 /* 1398 * Increase the starting virtual address so that its alignment 1399 * does not preclude the use of section mappings. 1400 */ 1401 if ((va & PTE1_OFFSET) < pte1_offset) 1402 va = pte1_trunc(va) + pte1_offset; 1403 else if ((va & PTE1_OFFSET) > pte1_offset) 1404 va = pte1_roundup(va) + pte1_offset; 1405 } 1406 sva = va; 1407 while (start < end) { 1408 if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { 1409 KASSERT((va & PTE1_OFFSET) == 0, 1410 ("%s: misaligned va %#x", __func__, va)); 1411 npte1 = PTE1_KERN(start, l1prot, l1attr); 1412 pmap_kenter_pte1(va, npte1); 1413 va += PTE1_SIZE; 1414 start += PTE1_SIZE; 1415 } else { 1416 pmap_kenter_prot_attr(va, start, l2prot, l2attr); 1417 va += PAGE_SIZE; 1418 start += PAGE_SIZE; 1419 } 1420 } 1421 tlb_flush_range(sva, va - sva); 1422 *virt = va; 1423 return (sva); 1424 } 1425 1426 /* 1427 * Make a temporary mapping for a physical address. 1428 * This is only intended to be used for panic dumps. 1429 */ 1430 void * 1431 pmap_kenter_temporary(vm_paddr_t pa, int i) 1432 { 1433 vm_offset_t va; 1434 1435 /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ 1436 1437 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 1438 pmap_kenter(va, pa); 1439 tlb_flush_local(va); 1440 return ((void *)crashdumpmap); 1441 } 1442 1443 1444 /************************************* 1445 * 1446 * TLB & cache maintenance routines. 1447 * 1448 *************************************/ 1449 1450 /* 1451 * We inline these within pmap.c for speed. 1452 */ 1453 PMAP_INLINE void 1454 pmap_tlb_flush(pmap_t pmap, vm_offset_t va) 1455 { 1456 1457 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1458 tlb_flush(va); 1459 } 1460 1461 PMAP_INLINE void 1462 pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) 1463 { 1464 1465 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1466 tlb_flush_range(sva, size); 1467 } 1468 1469 /* 1470 * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. 1471 * Requirements: 1472 * - Must deal with pages in order to ensure that none of the PTE2_* bits 1473 * are ever set, PTE2_V in particular. 1474 * - Assumes we can write to pte2s without pte2_store() atomic ops. 1475 * - Assumes nothing will ever test these addresses for 0 to indicate 1476 * no mapping instead of correctly checking PTE2_V. 1477 * - Assumes a vm_offset_t will fit in a pte2 (true for arm). 1478 * Because PTE2_V is never set, there can be no mappings to invalidate. 1479 */ 1480 static vm_offset_t 1481 pmap_pte2list_alloc(vm_offset_t *head) 1482 { 1483 pt2_entry_t *pte2p; 1484 vm_offset_t va; 1485 1486 va = *head; 1487 if (va == 0) 1488 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 1489 pte2p = pt2map_entry(va); 1490 *head = *pte2p; 1491 if (*head & PTE2_V) 1492 panic("%s: va with PTE2_V set!", __func__); 1493 *pte2p = 0; 1494 return (va); 1495 } 1496 1497 static void 1498 pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) 1499 { 1500 pt2_entry_t *pte2p; 1501 1502 if (va & PTE2_V) 1503 panic("%s: freeing va with PTE2_V set!", __func__); 1504 pte2p = pt2map_entry(va); 1505 *pte2p = *head; /* virtual! 
PTE2_V is 0 though */ 1506 *head = va; 1507 } 1508 1509 static void 1510 pmap_pte2list_init(vm_offset_t *head, void *base, int npages) 1511 { 1512 int i; 1513 vm_offset_t va; 1514 1515 *head = 0; 1516 for (i = npages - 1; i >= 0; i--) { 1517 va = (vm_offset_t)base + i * PAGE_SIZE; 1518 pmap_pte2list_free(head, va); 1519 } 1520 } 1521 1522 /***************************************************************************** 1523 * 1524 * PMAP third and final stage initialization. 1525 * 1526 * After pmap_init() is called, PMAP subsystem is fully initialized. 1527 * 1528 *****************************************************************************/ 1529 1530 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 1531 1532 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 1533 "Max number of PV entries"); 1534 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 1535 "Page share factor per proc"); 1536 1537 static u_long nkpt2pg = NKPT2PG; 1538 SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, 1539 &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); 1540 1541 static int sp_enabled = 1; 1542 SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 1543 &sp_enabled, 0, "Are large page mappings enabled?"); 1544 1545 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD, 0, 1546 "1MB page mapping counters"); 1547 1548 static u_long pmap_pte1_demotions; 1549 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, 1550 &pmap_pte1_demotions, 0, "1MB page demotions"); 1551 1552 static u_long pmap_pte1_mappings; 1553 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, 1554 &pmap_pte1_mappings, 0, "1MB page mappings"); 1555 1556 static u_long pmap_pte1_p_failures; 1557 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, 1558 &pmap_pte1_p_failures, 0, "1MB page promotion failures"); 1559 1560 static u_long pmap_pte1_promotions; 1561 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, 1562 &pmap_pte1_promotions, 0, "1MB page promotions"); 1563 1564 static u_long pmap_pte1_kern_demotions; 1565 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, 1566 &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); 1567 1568 static u_long pmap_pte1_kern_promotions; 1569 SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, 1570 &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); 1571 1572 static __inline ttb_entry_t 1573 pmap_ttb_get(pmap_t pmap) 1574 { 1575 1576 return (vtophys(pmap->pm_pt1) | ttb_flags); 1577 } 1578 1579 /* 1580 * Initialize a vm_page's machine-dependent fields. 1581 * 1582 * Variations: 1583 * 1. Pages for L2 page tables are always not managed. So, pv_list and 1584 * pt2_wirecount can share same physical space. However, proper 1585 * initialization on a page alloc for page tables and reinitialization 1586 * on the page free must be ensured. 1587 */ 1588 void 1589 pmap_page_init(vm_page_t m) 1590 { 1591 1592 TAILQ_INIT(&m->md.pv_list); 1593 pt2_wirecount_init(m); 1594 m->md.pat_mode = VM_MEMATTR_DEFAULT; 1595 } 1596 1597 /* 1598 * Virtualization for faster way how to zero whole page. 1599 */ 1600 static __inline void 1601 pagezero(void *page) 1602 { 1603 1604 bzero(page, PAGE_SIZE); 1605 } 1606 1607 /* 1608 * Zero L2 page table page. 1609 * Use same KVA as in pmap_zero_page(). 
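 *
 * The page is mapped through the per-CPU CMAP2 slot as a non-global
 * mapping (PTE2_KERN_NG), which is why the thread stays pinned to the
 * CPU while the mapping is in use.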
1610 */ 1611 static __inline vm_paddr_t 1612 pmap_pt2pg_zero(vm_page_t m) 1613 { 1614 pt2_entry_t *cmap2_pte2p; 1615 vm_paddr_t pa; 1616 struct pcpu *pc; 1617 1618 pa = VM_PAGE_TO_PHYS(m); 1619 1620 /* 1621 * XXX: For now, we map whole page even if it's already zero, 1622 * to sync it even if the sync is only DSB. 1623 */ 1624 sched_pin(); 1625 pc = get_pcpu(); 1626 cmap2_pte2p = pc->pc_cmap2_pte2p; 1627 mtx_lock(&pc->pc_cmap_lock); 1628 if (pte2_load(cmap2_pte2p) != 0) 1629 panic("%s: CMAP2 busy", __func__); 1630 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 1631 vm_page_pte2_attr(m))); 1632 /* Even VM_ALLOC_ZERO request is only advisory. */ 1633 if ((m->flags & PG_ZERO) == 0) 1634 pagezero(pc->pc_cmap2_addr); 1635 pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); 1636 pte2_clear(cmap2_pte2p); 1637 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 1638 1639 /* 1640 * Unpin the thread before releasing the lock. Otherwise the thread 1641 * could be rescheduled while still bound to the current CPU, only 1642 * to unpin itself immediately upon resuming execution. 1643 */ 1644 sched_unpin(); 1645 mtx_unlock(&pc->pc_cmap_lock); 1646 1647 return (pa); 1648 } 1649 1650 /* 1651 * Init just allocated page as L2 page table(s) holder 1652 * and return its physical address. 1653 */ 1654 static __inline vm_paddr_t 1655 pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) 1656 { 1657 vm_paddr_t pa; 1658 pt2_entry_t *pte2p; 1659 1660 /* Check page attributes. */ 1661 if (m->md.pat_mode != pt_memattr) 1662 pmap_page_set_memattr(m, pt_memattr); 1663 1664 /* Zero page and init wire counts. */ 1665 pa = pmap_pt2pg_zero(m); 1666 pt2_wirecount_init(m); 1667 1668 /* 1669 * Map page to PT2MAP address space for given pmap. 1670 * Note that PT2MAP space is shared with all pmaps. 1671 */ 1672 if (pmap == kernel_pmap) 1673 pmap_kenter_pt2tab(va, PTE2_KPT(pa)); 1674 else { 1675 pte2p = pmap_pt2tab_entry(pmap, va); 1676 pt2tab_store(pte2p, PTE2_KPT_NG(pa)); 1677 } 1678 1679 return (pa); 1680 } 1681 1682 /* 1683 * Initialize the pmap module. 1684 * Called by vm_init, to initialize any structures that the pmap 1685 * system needs to map virtual memory. 1686 */ 1687 void 1688 pmap_init(void) 1689 { 1690 vm_size_t s; 1691 pt2_entry_t *pte2p, pte2; 1692 u_int i, pte1_idx, pv_npg; 1693 1694 PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); 1695 1696 /* 1697 * Initialize the vm page array entries for kernel pmap's 1698 * L2 page table pages allocated in advance. 1699 */ 1700 pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); 1701 pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); 1702 for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { 1703 vm_paddr_t pa; 1704 vm_page_t m; 1705 1706 pte2 = pte2_load(pte2p); 1707 KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); 1708 1709 pa = pte2_pa(pte2); 1710 m = PHYS_TO_VM_PAGE(pa); 1711 KASSERT(m >= vm_page_array && 1712 m < &vm_page_array[vm_page_array_size], 1713 ("%s: L2 page table page is out of range", __func__)); 1714 1715 m->pindex = pte1_idx; 1716 m->phys_addr = pa; 1717 pte1_idx += NPT2_IN_PG; 1718 } 1719 1720 /* 1721 * Initialize the address space (zone) for the pv entries. Set a 1722 * high water mark so that the system can recover from excessive 1723 * numbers of pv entries. 
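 *
 * By default, pv_entry_max is shpgperproc * maxproc + v_page_count
 * (tunable via vm.pmap.pv_entries), rounded up to a multiple of _NPCPV;
 * the high water mark is set to 90% of that value.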
1724 */ 1725 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 1726 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 1727 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 1728 pv_entry_max = roundup(pv_entry_max, _NPCPV); 1729 pv_entry_high_water = 9 * (pv_entry_max / 10); 1730 1731 /* 1732 * Are large page mappings enabled? 1733 */ 1734 TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); 1735 if (sp_enabled) { 1736 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1737 ("%s: can't assign to pagesizes[1]", __func__)); 1738 pagesizes[1] = PTE1_SIZE; 1739 } 1740 1741 /* 1742 * Calculate the size of the pv head table for sections. 1743 * Handle the possibility that "vm_phys_segs[...].end" is zero. 1744 * Note that the table is only for sections which could be promoted. 1745 */ 1746 first_managed_pa = pte1_trunc(vm_phys_segs[0].start); 1747 pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) 1748 - first_managed_pa) / PTE1_SIZE + 1; 1749 1750 /* 1751 * Allocate memory for the pv head table for sections. 1752 */ 1753 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1754 s = round_page(s); 1755 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, 1756 M_WAITOK | M_ZERO); 1757 for (i = 0; i < pv_npg; i++) 1758 TAILQ_INIT(&pv_table[i].pv_list); 1759 1760 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 1761 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 1762 if (pv_chunkbase == NULL) 1763 panic("%s: not enough kvm for pv chunks", __func__); 1764 pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 1765 } 1766 1767 /* 1768 * Add a list of wired pages to the kva 1769 * this routine is only used for temporary 1770 * kernel mappings that do not need to have 1771 * page modification or references recorded. 1772 * Note that old mappings are simply written 1773 * over. The page *must* be wired. 1774 * Note: SMP coherent. Uses a ranged shootdown IPI. 1775 */ 1776 void 1777 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1778 { 1779 u_int anychanged; 1780 pt2_entry_t *epte2p, *pte2p, pte2; 1781 vm_page_t m; 1782 vm_paddr_t pa; 1783 1784 anychanged = 0; 1785 pte2p = pt2map_entry(sva); 1786 epte2p = pte2p + count; 1787 while (pte2p < epte2p) { 1788 m = *ma++; 1789 pa = VM_PAGE_TO_PHYS(m); 1790 pte2 = pte2_load(pte2p); 1791 if ((pte2_pa(pte2) != pa) || 1792 (pte2_attr(pte2) != vm_page_pte2_attr(m))) { 1793 anychanged++; 1794 pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, 1795 vm_page_pte2_attr(m))); 1796 } 1797 pte2p++; 1798 } 1799 if (__predict_false(anychanged)) 1800 tlb_flush_range(sva, count * PAGE_SIZE); 1801 } 1802 1803 /* 1804 * This routine tears out page mappings from the 1805 * kernel -- it is meant only for temporary mappings. 1806 * Note: SMP coherent. Uses a ranged shootdown IPI. 1807 */ 1808 void 1809 pmap_qremove(vm_offset_t sva, int count) 1810 { 1811 vm_offset_t va; 1812 1813 va = sva; 1814 while (count-- > 0) { 1815 pmap_kremove(va); 1816 va += PAGE_SIZE; 1817 } 1818 tlb_flush_range(sva, va - sva); 1819 } 1820 1821 /* 1822 * Are we current address space or kernel? 1823 */ 1824 static __inline int 1825 pmap_is_current(pmap_t pmap) 1826 { 1827 1828 return (pmap == kernel_pmap || 1829 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); 1830 } 1831 1832 /* 1833 * If the given pmap is not the current or kernel pmap, the returned 1834 * pte2 must be released by passing it to pmap_pte2_release(). 
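 *
 * Typical use (see pmap_extract()):
 *
 *	pte2p = pmap_pte2(pmap, va);
 *	pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET);
 *	pmap_pte2_release(pte2p);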
1835 */ 1836 static pt2_entry_t * 1837 pmap_pte2(pmap_t pmap, vm_offset_t va) 1838 { 1839 pt1_entry_t pte1; 1840 vm_paddr_t pt2pg_pa; 1841 1842 pte1 = pte1_load(pmap_pte1(pmap, va)); 1843 if (pte1_is_section(pte1)) 1844 panic("%s: attempt to map PTE1", __func__); 1845 if (pte1_is_link(pte1)) { 1846 /* Are we current address space or kernel? */ 1847 if (pmap_is_current(pmap)) 1848 return (pt2map_entry(va)); 1849 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1850 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1851 mtx_lock(&PMAP2mutex); 1852 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 1853 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 1854 tlb_flush((vm_offset_t)PADDR2); 1855 } 1856 return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1857 } 1858 return (NULL); 1859 } 1860 1861 /* 1862 * Releases a pte2 that was obtained from pmap_pte2(). 1863 * Be prepared for the pte2p being NULL. 1864 */ 1865 static __inline void 1866 pmap_pte2_release(pt2_entry_t *pte2p) 1867 { 1868 1869 if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { 1870 mtx_unlock(&PMAP2mutex); 1871 } 1872 } 1873 1874 /* 1875 * Super fast pmap_pte2 routine best used when scanning 1876 * the pv lists. This eliminates many coarse-grained 1877 * invltlb calls. Note that many of the pv list 1878 * scans are across different pmaps. It is very wasteful 1879 * to do an entire tlb flush for checking a single mapping. 1880 * 1881 * If the given pmap is not the current pmap, pvh_global_lock 1882 * must be held and curthread pinned to a CPU. 1883 */ 1884 static pt2_entry_t * 1885 pmap_pte2_quick(pmap_t pmap, vm_offset_t va) 1886 { 1887 pt1_entry_t pte1; 1888 vm_paddr_t pt2pg_pa; 1889 1890 pte1 = pte1_load(pmap_pte1(pmap, va)); 1891 if (pte1_is_section(pte1)) 1892 panic("%s: attempt to map PTE1", __func__); 1893 if (pte1_is_link(pte1)) { 1894 /* Are we current address space or kernel? */ 1895 if (pmap_is_current(pmap)) 1896 return (pt2map_entry(va)); 1897 rw_assert(&pvh_global_lock, RA_WLOCKED); 1898 KASSERT(curthread->td_pinned > 0, 1899 ("%s: curthread not pinned", __func__)); 1900 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 1901 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 1902 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 1903 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 1904 #ifdef SMP 1905 PMAP1cpu = PCPU_GET(cpuid); 1906 #endif 1907 tlb_flush_local((vm_offset_t)PADDR1); 1908 PMAP1changed++; 1909 } else 1910 #ifdef SMP 1911 if (PMAP1cpu != PCPU_GET(cpuid)) { 1912 PMAP1cpu = PCPU_GET(cpuid); 1913 tlb_flush_local((vm_offset_t)PADDR1); 1914 PMAP1changedcpu++; 1915 } else 1916 #endif 1917 PMAP1unchanged++; 1918 return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 1919 } 1920 return (NULL); 1921 } 1922 1923 /* 1924 * Routine: pmap_extract 1925 * Function: 1926 * Extract the physical page address associated 1927 * with the given map/virtual_address pair. 
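 *	Returns zero if no valid mapping exists.  For a 1MB section
 *	the address is composed as pte1_pa(pte1) | (va & PTE1_OFFSET),
 *	for a small page as pte2_pa(pte2) | (va & PTE2_OFFSET).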
1928 */ 1929 vm_paddr_t 1930 pmap_extract(pmap_t pmap, vm_offset_t va) 1931 { 1932 vm_paddr_t pa; 1933 pt1_entry_t pte1; 1934 pt2_entry_t *pte2p; 1935 1936 PMAP_LOCK(pmap); 1937 pte1 = pte1_load(pmap_pte1(pmap, va)); 1938 if (pte1_is_section(pte1)) 1939 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1940 else if (pte1_is_link(pte1)) { 1941 pte2p = pmap_pte2(pmap, va); 1942 pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); 1943 pmap_pte2_release(pte2p); 1944 } else 1945 pa = 0; 1946 PMAP_UNLOCK(pmap); 1947 return (pa); 1948 } 1949 1950 /* 1951 * Routine: pmap_extract_and_hold 1952 * Function: 1953 * Atomically extract and hold the physical page 1954 * with the given pmap and virtual address pair 1955 * if that mapping permits the given protection. 1956 */ 1957 vm_page_t 1958 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1959 { 1960 vm_paddr_t pa, lockpa; 1961 pt1_entry_t pte1; 1962 pt2_entry_t pte2, *pte2p; 1963 vm_page_t m; 1964 1965 lockpa = 0; 1966 m = NULL; 1967 PMAP_LOCK(pmap); 1968 retry: 1969 pte1 = pte1_load(pmap_pte1(pmap, va)); 1970 if (pte1_is_section(pte1)) { 1971 if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { 1972 pa = pte1_pa(pte1) | (va & PTE1_OFFSET); 1973 if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) 1974 goto retry; 1975 m = PHYS_TO_VM_PAGE(pa); 1976 vm_page_hold(m); 1977 } 1978 } else if (pte1_is_link(pte1)) { 1979 pte2p = pmap_pte2(pmap, va); 1980 pte2 = pte2_load(pte2p); 1981 pmap_pte2_release(pte2p); 1982 if (pte2_is_valid(pte2) && 1983 (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { 1984 pa = pte2_pa(pte2); 1985 if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) 1986 goto retry; 1987 m = PHYS_TO_VM_PAGE(pa); 1988 vm_page_hold(m); 1989 } 1990 } 1991 PA_UNLOCK_COND(lockpa); 1992 PMAP_UNLOCK(pmap); 1993 return (m); 1994 } 1995 1996 /* 1997 * Grow the number of kernel L2 page table entries, if needed. 1998 */ 1999 void 2000 pmap_growkernel(vm_offset_t addr) 2001 { 2002 vm_page_t m; 2003 vm_paddr_t pt2pg_pa, pt2_pa; 2004 pt1_entry_t pte1; 2005 pt2_entry_t pte2; 2006 2007 PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); 2008 /* 2009 * All the time kernel_vm_end is first KVA for which underlying 2010 * L2 page table is either not allocated or linked from L1 page table 2011 * (not considering sections). Except for two possible cases: 2012 * 2013 * (1) in the very beginning as long as pmap_growkernel() was 2014 * not called, it could be first unused KVA (which is not 2015 * rounded up to PTE1_SIZE), 2016 * 2017 * (2) when all KVA space is mapped and kernel_map->max_offset 2018 * address is not rounded up to PTE1_SIZE. (For example, 2019 * it could be 0xFFFFFFFF.) 2020 */ 2021 kernel_vm_end = pte1_roundup(kernel_vm_end); 2022 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2023 addr = roundup2(addr, PTE1_SIZE); 2024 if (addr - 1 >= kernel_map->max_offset) 2025 addr = kernel_map->max_offset; 2026 while (kernel_vm_end < addr) { 2027 pte1 = pte1_load(kern_pte1(kernel_vm_end)); 2028 if (pte1_is_valid(pte1)) { 2029 kernel_vm_end += PTE1_SIZE; 2030 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2031 kernel_vm_end = kernel_map->max_offset; 2032 break; 2033 } 2034 continue; 2035 } 2036 2037 /* 2038 * kernel_vm_end_new is used in pmap_pinit() when kernel 2039 * mappings are entered to new pmap all at once to avoid race 2040 * between pmap_kenter_pte1() and kernel_vm_end increase. 2041 * The same aplies to pmap_kenter_pt2tab(). 
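 *
 * In other words, kernel_vm_end_new is raised to the new limit
 * before the fresh L2 page table is installed and linked below,
 * and kernel_vm_end itself is advanced only afterwards; pmap_pinit()
 * copies kernel mappings up to kernel_vm_end_new - 1, so a pmap
 * created while the kernel map is growing still covers the range
 * being added.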
2042 */ 2043 kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; 2044 2045 pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); 2046 if (!pte2_is_valid(pte2)) { 2047 /* 2048 * Install new PT2s page into kernel PT2TAB. 2049 */ 2050 m = vm_page_alloc(NULL, 2051 pte1_index(kernel_vm_end) & ~PT2PG_MASK, 2052 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2053 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2054 if (m == NULL) 2055 panic("%s: no memory to grow kernel", __func__); 2056 /* 2057 * QQQ: To link all new L2 page tables from L1 page 2058 * table now and so pmap_kenter_pte1() them 2059 * at once together with pmap_kenter_pt2tab() 2060 * could be nice speed up. However, 2061 * pmap_growkernel() does not happen so often... 2062 * QQQ: The other TTBR is another option. 2063 */ 2064 pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, 2065 m); 2066 } else 2067 pt2pg_pa = pte2_pa(pte2); 2068 2069 pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); 2070 pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); 2071 2072 kernel_vm_end = kernel_vm_end_new; 2073 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2074 kernel_vm_end = kernel_map->max_offset; 2075 break; 2076 } 2077 } 2078 } 2079 2080 static int 2081 kvm_size(SYSCTL_HANDLER_ARGS) 2082 { 2083 unsigned long ksize = vm_max_kernel_address - KERNBASE; 2084 2085 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2086 } 2087 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2088 0, 0, kvm_size, "IU", "Size of KVM"); 2089 2090 static int 2091 kvm_free(SYSCTL_HANDLER_ARGS) 2092 { 2093 unsigned long kfree = vm_max_kernel_address - kernel_vm_end; 2094 2095 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2096 } 2097 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2098 0, 0, kvm_free, "IU", "Amount of KVM free"); 2099 2100 /*********************************************** 2101 * 2102 * Pmap allocation/deallocation routines. 2103 * 2104 ***********************************************/ 2105 2106 /* 2107 * Initialize the pmap for the swapper process. 2108 */ 2109 void 2110 pmap_pinit0(pmap_t pmap) 2111 { 2112 PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); 2113 2114 PMAP_LOCK_INIT(pmap); 2115 2116 /* 2117 * Kernel page table directory and pmap stuff around is already 2118 * initialized, we are using it right now and here. So, finish 2119 * only PMAP structures initialization for process0 ... 2120 * 2121 * Since the L1 page table and PT2TAB is shared with the kernel pmap, 2122 * which is already included in the list "allpmaps", this pmap does 2123 * not need to be inserted into that list. 
2124 */ 2125 pmap->pm_pt1 = kern_pt1; 2126 pmap->pm_pt2tab = kern_pt2tab; 2127 CPU_ZERO(&pmap->pm_active); 2128 PCPU_SET(curpmap, pmap); 2129 TAILQ_INIT(&pmap->pm_pvchunk); 2130 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2131 CPU_SET(0, &pmap->pm_active); 2132 } 2133 2134 static __inline void 2135 pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, 2136 vm_offset_t eva) 2137 { 2138 u_int idx, count; 2139 2140 idx = pte1_index(sva); 2141 count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); 2142 bcopy(spte1p + idx, dpte1p + idx, count); 2143 } 2144 2145 static __inline void 2146 pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, 2147 vm_offset_t eva) 2148 { 2149 u_int idx, count; 2150 2151 idx = pt2tab_index(sva); 2152 count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); 2153 bcopy(spte2p + idx, dpte2p + idx, count); 2154 } 2155 2156 /* 2157 * Initialize a preallocated and zeroed pmap structure, 2158 * such as one in a vmspace structure. 2159 */ 2160 int 2161 pmap_pinit(pmap_t pmap) 2162 { 2163 pt1_entry_t *pte1p; 2164 pt2_entry_t *pte2p; 2165 vm_paddr_t pa, pt2tab_pa; 2166 u_int i; 2167 2168 PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, 2169 pmap->pm_pt1)); 2170 2171 /* 2172 * No need to allocate L2 page table space yet but we do need 2173 * a valid L1 page table and PT2TAB table. 2174 * 2175 * Install shared kernel mappings to these tables. It's a little 2176 * tricky as some parts of KVA are reserved for vectors, devices, 2177 * and whatever else. These parts are supposed to be above 2178 * vm_max_kernel_address. Thus two regions should be installed: 2179 * 2180 * (1) <KERNBASE, kernel_vm_end), 2181 * (2) <vm_max_kernel_address, 0xFFFFFFFF>. 2182 * 2183 * QQQ: The second region should be stable enough to be installed 2184 * only once in time when the tables are allocated. 2185 * QQQ: Maybe copy of both regions at once could be faster ... 2186 * QQQ: Maybe the other TTBR is an option. 2187 * 2188 * Finally, install own PT2TAB table to these tables. 2189 */ 2190 2191 if (pmap->pm_pt1 == NULL) { 2192 pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(kernel_arena, 2193 NB_IN_PT1, M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, 2194 pt_memattr); 2195 if (pmap->pm_pt1 == NULL) 2196 return (0); 2197 } 2198 if (pmap->pm_pt2tab == NULL) { 2199 /* 2200 * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page 2201 * only, what should be the only size for 32 bit systems, 2202 * then we could allocate it with vm_page_alloc() and all 2203 * the stuff needed as other L2 page table pages. 2204 * (2) Note that a process PT2TAB is special L2 page table 2205 * page. Its mapping in kernel_arena is permanent and can 2206 * be used no matter which process is current. Its mapping 2207 * in PT2MAP can be used only for current process. 2208 */ 2209 pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(kernel_arena, 2210 NB_IN_PT2TAB, M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); 2211 if (pmap->pm_pt2tab == NULL) { 2212 /* 2213 * QQQ: As struct pmap is allocated from UMA with 2214 * UMA_ZONE_NOFREE flag, it's important to leave 2215 * no allocation in pmap if initialization failed. 2216 */ 2217 kmem_free(kernel_arena, (vm_offset_t)pmap->pm_pt1, 2218 NB_IN_PT1); 2219 pmap->pm_pt1 = NULL; 2220 return (0); 2221 } 2222 /* 2223 * QQQ: Each L2 page table page vm_page_t has pindex set to 2224 * pte1 index of virtual address mapped by this page. 2225 * It's not valid for non kernel PT2TABs themselves. 
2226 * The pindex of these pages can not be altered because 2227 * of the way how they are allocated now. However, it 2228 * should not be a problem. 2229 */ 2230 } 2231 2232 mtx_lock_spin(&allpmaps_lock); 2233 /* 2234 * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), 2235 * kernel_vm_end_new is used here instead of kernel_vm_end. 2236 */ 2237 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, 2238 kernel_vm_end_new - 1); 2239 pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 2240 0xFFFFFFFF); 2241 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, 2242 kernel_vm_end_new - 1); 2243 pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 2244 0xFFFFFFFF); 2245 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 2246 mtx_unlock_spin(&allpmaps_lock); 2247 2248 /* 2249 * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. 2250 * I.e. self reference mapping. The PT2TAB is private, however mapped 2251 * into shared PT2MAP space, so the mapping should be not global. 2252 */ 2253 pt2tab_pa = vtophys(pmap->pm_pt2tab); 2254 pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); 2255 for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { 2256 pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); 2257 } 2258 2259 /* Insert PT2MAP PT2s into pmap PT1. */ 2260 pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); 2261 for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { 2262 pte1_store(pte1p++, PTE1_LINK(pa)); 2263 } 2264 2265 /* 2266 * Now synchronize new mapping which was made above. 2267 */ 2268 pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); 2269 pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); 2270 2271 CPU_ZERO(&pmap->pm_active); 2272 TAILQ_INIT(&pmap->pm_pvchunk); 2273 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2274 2275 return (1); 2276 } 2277 2278 #ifdef INVARIANTS 2279 static boolean_t 2280 pt2tab_user_is_empty(pt2_entry_t *tab) 2281 { 2282 u_int i, end; 2283 2284 end = pt2tab_index(VM_MAXUSER_ADDRESS); 2285 for (i = 0; i < end; i++) 2286 if (tab[i] != 0) return (FALSE); 2287 return (TRUE); 2288 } 2289 #endif 2290 /* 2291 * Release any resources held by the given physical map. 2292 * Called when a pmap initialized by pmap_pinit is being released. 2293 * Should only be called if the map contains no valid mappings. 2294 */ 2295 void 2296 pmap_release(pmap_t pmap) 2297 { 2298 #ifdef INVARIANTS 2299 vm_offset_t start, end; 2300 #endif 2301 KASSERT(pmap->pm_stats.resident_count == 0, 2302 ("%s: pmap resident count %ld != 0", __func__, 2303 pmap->pm_stats.resident_count)); 2304 KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), 2305 ("%s: has allocated user PT2(s)", __func__)); 2306 KASSERT(CPU_EMPTY(&pmap->pm_active), 2307 ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); 2308 2309 mtx_lock_spin(&allpmaps_lock); 2310 LIST_REMOVE(pmap, pm_list); 2311 mtx_unlock_spin(&allpmaps_lock); 2312 2313 #ifdef INVARIANTS 2314 start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); 2315 end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); 2316 bzero((char *)pmap->pm_pt1 + start, end - start); 2317 2318 start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); 2319 end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); 2320 bzero((char *)pmap->pm_pt2tab + start, end - start); 2321 #endif 2322 /* 2323 * We are leaving PT1 and PT2TAB allocated on released pmap, 2324 * so hopefully UMA vmspace_zone will always be inited with 2325 * UMA_ZONE_NOFREE flag. 
2326 */ 2327 } 2328 2329 /********************************************************* 2330 * 2331 * L2 table pages and their pages management routines. 2332 * 2333 *********************************************************/ 2334 2335 /* 2336 * Virtual interface for L2 page table wire counting. 2337 * 2338 * Each L2 page table in a page has own counter which counts a number of 2339 * valid mappings in a table. Global page counter counts mappings in all 2340 * tables in a page plus a single itself mapping in PT2TAB. 2341 * 2342 * During a promotion we leave the associated L2 page table counter 2343 * untouched, so the table (strictly speaking a page which holds it) 2344 * is never freed if promoted. 2345 * 2346 * If a page m->wire_count == 1 then no valid mappings exist in any L2 page 2347 * table in the page and the page itself is only mapped in PT2TAB. 2348 */ 2349 2350 static __inline void 2351 pt2_wirecount_init(vm_page_t m) 2352 { 2353 u_int i; 2354 2355 /* 2356 * Note: A page m is allocated with VM_ALLOC_WIRED flag and 2357 * m->wire_count should be already set correctly. 2358 * So, there is no need to set it again herein. 2359 */ 2360 for (i = 0; i < NPT2_IN_PG; i++) 2361 m->md.pt2_wirecount[i] = 0; 2362 } 2363 2364 static __inline void 2365 pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) 2366 { 2367 2368 /* 2369 * Note: A just modificated pte2 (i.e. already allocated) 2370 * is acquiring one extra reference which must be 2371 * explicitly cleared. It influences the KASSERTs herein. 2372 * All L2 page tables in a page always belong to the same 2373 * pmap, so we allow only one extra reference for the page. 2374 */ 2375 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), 2376 ("%s: PT2 is overflowing ...", __func__)); 2377 KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), 2378 ("%s: PT2PG is overflowing ...", __func__)); 2379 2380 m->wire_count++; 2381 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; 2382 } 2383 2384 static __inline void 2385 pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) 2386 { 2387 2388 KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, 2389 ("%s: PT2 is underflowing ...", __func__)); 2390 KASSERT(m->wire_count > 1, 2391 ("%s: PT2PG is underflowing ...", __func__)); 2392 2393 m->wire_count--; 2394 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; 2395 } 2396 2397 static __inline void 2398 pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) 2399 { 2400 2401 KASSERT(count <= NPTE2_IN_PT2, 2402 ("%s: invalid count %u", __func__, count)); 2403 KASSERT(m->wire_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], 2404 ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->wire_count, 2405 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); 2406 2407 m->wire_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; 2408 m->wire_count += count; 2409 m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; 2410 2411 KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), 2412 ("%s: PT2PG is overflowed (%u) ...", __func__, m->wire_count)); 2413 } 2414 2415 static __inline uint32_t 2416 pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) 2417 { 2418 2419 return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); 2420 } 2421 2422 static __inline boolean_t 2423 pt2_is_empty(vm_page_t m, vm_offset_t va) 2424 { 2425 2426 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); 2427 } 2428 2429 static __inline boolean_t 2430 pt2_is_full(vm_page_t m, vm_offset_t va) 2431 { 2432 2433 return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 2434 NPTE2_IN_PT2); 2435 } 2436 2437 static 
__inline boolean_t 2438 pt2pg_is_empty(vm_page_t m) 2439 { 2440 2441 return (m->wire_count == 1); 2442 } 2443 2444 /* 2445 * This routine is called if the L2 page table 2446 * is not mapped correctly. 2447 */ 2448 static vm_page_t 2449 _pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2450 { 2451 uint32_t pte1_idx; 2452 pt1_entry_t *pte1p; 2453 pt2_entry_t pte2; 2454 vm_page_t m; 2455 vm_paddr_t pt2pg_pa, pt2_pa; 2456 2457 pte1_idx = pte1_index(va); 2458 pte1p = pmap->pm_pt1 + pte1_idx; 2459 2460 KASSERT(pte1_load(pte1p) == 0, 2461 ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, 2462 pte1_load(pte1p))); 2463 2464 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); 2465 if (!pte2_is_valid(pte2)) { 2466 /* 2467 * Install new PT2s page into pmap PT2TAB. 2468 */ 2469 m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK, 2470 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2471 if (m == NULL) { 2472 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 2473 PMAP_UNLOCK(pmap); 2474 rw_wunlock(&pvh_global_lock); 2475 VM_WAIT; 2476 rw_wlock(&pvh_global_lock); 2477 PMAP_LOCK(pmap); 2478 } 2479 2480 /* 2481 * Indicate the need to retry. While waiting, 2482 * the L2 page table page may have been allocated. 2483 */ 2484 return (NULL); 2485 } 2486 pmap->pm_stats.resident_count++; 2487 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 2488 } else { 2489 pt2pg_pa = pte2_pa(pte2); 2490 m = PHYS_TO_VM_PAGE(pt2pg_pa); 2491 } 2492 2493 pt2_wirecount_inc(m, pte1_idx); 2494 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 2495 pte1_store(pte1p, PTE1_LINK(pt2_pa)); 2496 2497 return (m); 2498 } 2499 2500 static vm_page_t 2501 pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) 2502 { 2503 u_int pte1_idx; 2504 pt1_entry_t *pte1p, pte1; 2505 vm_page_t m; 2506 2507 pte1_idx = pte1_index(va); 2508 retry: 2509 pte1p = pmap->pm_pt1 + pte1_idx; 2510 pte1 = pte1_load(pte1p); 2511 2512 /* 2513 * This supports switching from a 1MB page to a 2514 * normal 4K page. 2515 */ 2516 if (pte1_is_section(pte1)) { 2517 (void)pmap_demote_pte1(pmap, pte1p, va); 2518 /* 2519 * Reload pte1 after demotion. 2520 * 2521 * Note: Demotion can even fail as either PT2 is not find for 2522 * the virtual address or PT2PG can not be allocated. 2523 */ 2524 pte1 = pte1_load(pte1p); 2525 } 2526 2527 /* 2528 * If the L2 page table page is mapped, we just increment the 2529 * hold count, and activate it. 2530 */ 2531 if (pte1_is_link(pte1)) { 2532 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2533 pt2_wirecount_inc(m, pte1_idx); 2534 } else { 2535 /* 2536 * Here if the PT2 isn't mapped, or if it has 2537 * been deallocated. 2538 */ 2539 m = _pmap_allocpte2(pmap, va, flags); 2540 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2541 goto retry; 2542 } 2543 2544 return (m); 2545 } 2546 2547 static __inline void 2548 pmap_free_zero_pages(struct spglist *free) 2549 { 2550 vm_page_t m; 2551 2552 while ((m = SLIST_FIRST(free)) != NULL) { 2553 SLIST_REMOVE_HEAD(free, plinks.s.ss); 2554 /* Preserve the page's PG_ZERO setting. */ 2555 vm_page_free_toq(m); 2556 } 2557 } 2558 2559 /* 2560 * Schedule the specified unused L2 page table page to be freed. Specifically, 2561 * add the page to the specified list of pages that will be released to the 2562 * physical memory manager after the TLB has been updated. 
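 *
 * A typical caller builds the list roughly like this (see
 * pmap_pv_reclaim() below):
 *
 *	SLIST_INIT(&free);
 *	... pmap_unuse_pt2(pmap, va, &free); ...
 *	pmap_free_zero_pages(&free);
 *
 * i.e. the pages are handed back to the VM system only after all
 * related TLB invalidations have been issued.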
2563 */ 2564 static __inline void 2565 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) 2566 { 2567 2568 /* 2569 * Put page on a list so that it is released after 2570 * *ALL* TLB shootdown is done 2571 */ 2572 #ifdef PMAP_DEBUG 2573 pmap_zero_page_check(m); 2574 #endif 2575 m->flags |= PG_ZERO; 2576 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2577 } 2578 2579 /* 2580 * Unwire L2 page tables page. 2581 */ 2582 static void 2583 pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) 2584 { 2585 pt1_entry_t *pte1p, opte1 __unused; 2586 pt2_entry_t *pte2p; 2587 uint32_t i; 2588 2589 KASSERT(pt2pg_is_empty(m), 2590 ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); 2591 2592 /* 2593 * Unmap all L2 page tables in the page from L1 page table. 2594 * 2595 * QQQ: Individual L2 page tables (except the last one) can be unmapped 2596 * earlier. However, we are doing that this way. 2597 */ 2598 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 2599 ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); 2600 pte1p = pmap->pm_pt1 + m->pindex; 2601 for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { 2602 KASSERT(m->md.pt2_wirecount[i] == 0, 2603 ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); 2604 opte1 = pte1_load(pte1p); 2605 if (pte1_is_link(opte1)) { 2606 pte1_clear(pte1p); 2607 /* 2608 * Flush intermediate TLB cache. 2609 */ 2610 pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); 2611 } 2612 #ifdef INVARIANTS 2613 else 2614 KASSERT((opte1 == 0) || pte1_is_section(opte1), 2615 ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, 2616 pmap, va, opte1, i)); 2617 #endif 2618 } 2619 2620 /* 2621 * Unmap the page from PT2TAB. 2622 */ 2623 pte2p = pmap_pt2tab_entry(pmap, va); 2624 (void)pt2tab_load_clear(pte2p); 2625 pmap_tlb_flush(pmap, pt2map_pt2pg(va)); 2626 2627 m->wire_count = 0; 2628 pmap->pm_stats.resident_count--; 2629 2630 /* 2631 * This is a release store so that the ordinary store unmapping 2632 * the L2 page table page is globally performed before TLB shoot- 2633 * down is begun. 2634 */ 2635 atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); 2636 } 2637 2638 /* 2639 * Decrements a L2 page table page's wire count, which is used to record the 2640 * number of valid page table entries within the page. If the wire count 2641 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2642 * page table page was unmapped and FALSE otherwise. 2643 */ 2644 static __inline boolean_t 2645 pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2646 { 2647 pt2_wirecount_dec(m, pte1_index(va)); 2648 if (pt2pg_is_empty(m)) { 2649 /* 2650 * QQQ: Wire count is zero, so whole page should be zero and 2651 * we can set PG_ZERO flag to it. 2652 * Note that when promotion is enabled, it takes some 2653 * more efforts. See pmap_unwire_pt2_all() below. 2654 */ 2655 pmap_unwire_pt2pg(pmap, va, m); 2656 pmap_add_delayed_free_list(m, free); 2657 return (TRUE); 2658 } else 2659 return (FALSE); 2660 } 2661 2662 /* 2663 * Drop a L2 page table page's wire count at once, which is used to record 2664 * the number of valid L2 page table entries within the page. If the wire 2665 * count drops to zero, then the L2 page table page is unmapped. 
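 *
 * Note that "drops to zero" refers to the per-PT2 counter; the page
 * itself is recognized as unused by pt2pg_is_empty(), i.e. when its
 * global wire_count drops back to 1 and only the PT2TAB self
 * mapping remains.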
2666 */ 2667 static __inline void 2668 pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, 2669 struct spglist *free) 2670 { 2671 u_int pte1_idx = pte1_index(va); 2672 2673 KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), 2674 ("%s: PT2 page's pindex is wrong", __func__)); 2675 KASSERT(m->wire_count > pt2_wirecount_get(m, pte1_idx), 2676 ("%s: bad pt2 wire count %u > %u", __func__, m->wire_count, 2677 pt2_wirecount_get(m, pte1_idx))); 2678 2679 /* 2680 * It's possible that the L2 page table was never used. 2681 * This can happen when a section was created without promotion. 2682 */ 2683 if (pt2_is_full(m, va)) { 2684 pt2_wirecount_set(m, pte1_idx, 0); 2685 2686 /* 2687 * QQQ: We clear L2 page table now, so when L2 page table page 2688 * is going to be freed, we can set the PG_ZERO flag on it ... 2689 * This function is called only on section mappings, so 2690 * hopefully it is not too big an overhead. 2691 * 2692 * XXX: If pmap is current, existing PT2MAP mapping could be 2693 * used for zeroing. 2694 */ 2695 pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); 2696 } 2697 #ifdef INVARIANTS 2698 else 2699 KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", 2700 __func__, pt2_wirecount_get(m, pte1_idx))); 2701 #endif 2702 if (pt2pg_is_empty(m)) { 2703 pmap_unwire_pt2pg(pmap, va, m); 2704 pmap_add_delayed_free_list(m, free); 2705 } 2706 } 2707 2708 /* 2709 * After removing an L2 page table entry, this routine is used to 2710 * conditionally free the page, and manage the hold/wire counts. 2711 */ 2712 static boolean_t 2713 pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) 2714 { 2715 pt1_entry_t pte1; 2716 vm_page_t mpte; 2717 2718 if (va >= VM_MAXUSER_ADDRESS) 2719 return (FALSE); 2720 pte1 = pte1_load(pmap_pte1(pmap, va)); 2721 mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 2722 return (pmap_unwire_pt2(pmap, va, mpte, free)); 2723 } 2724 2725 /************************************* 2726 * 2727 * Page management routines.
2728 * 2729 *************************************/ 2730 2731 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2732 CTASSERT(_NPCM == 11); 2733 CTASSERT(_NPCPV == 336); 2734 2735 static __inline struct pv_chunk * 2736 pv_to_chunk(pv_entry_t pv) 2737 { 2738 2739 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2740 } 2741 2742 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2743 2744 #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2745 #define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2746 2747 static const uint32_t pc_freemask[_NPCM] = { 2748 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2749 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2750 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2751 PC_FREE0_9, PC_FREE10 2752 }; 2753 2754 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2755 "Current number of pv entries"); 2756 2757 #ifdef PV_STATS 2758 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2759 2760 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2761 "Current number of pv entry chunks"); 2762 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2763 "Current number of pv entry chunks allocated"); 2764 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2765 "Current number of pv entry chunks frees"); 2766 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 2767 0, "Number of times tried to get a chunk page but failed."); 2768 2769 static long pv_entry_frees, pv_entry_allocs; 2770 static int pv_entry_spare; 2771 2772 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2773 "Current number of pv entry frees"); 2774 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 2775 0, "Current number of pv entry allocs"); 2776 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2777 "Current number of spare pv entries"); 2778 #endif 2779 2780 /* 2781 * Is given page managed? 2782 */ 2783 static __inline bool 2784 is_managed(vm_paddr_t pa) 2785 { 2786 vm_page_t m; 2787 2788 m = PHYS_TO_VM_PAGE(pa); 2789 if (m == NULL) 2790 return (false); 2791 return ((m->oflags & VPO_UNMANAGED) == 0); 2792 } 2793 2794 static __inline bool 2795 pte1_is_managed(pt1_entry_t pte1) 2796 { 2797 2798 return (is_managed(pte1_pa(pte1))); 2799 } 2800 2801 static __inline bool 2802 pte2_is_managed(pt2_entry_t pte2) 2803 { 2804 2805 return (is_managed(pte2_pa(pte2))); 2806 } 2807 2808 /* 2809 * We are in a serious low memory condition. Resort to 2810 * drastic measures to free some pages so we can allocate 2811 * another pv entry chunk. 
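 *
 * The strategy, in short: walk the global pv_chunks LRU list, lock
 * each owning pmap (using trylock where blocking could deadlock
 * against the caller's pmap lock), tear down every unwired 4 KB
 * mapping in a chunk and, if a chunk ends up completely free,
 * recycle its backing page as the new chunk page.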
2812 */ 2813 static vm_page_t 2814 pmap_pv_reclaim(pmap_t locked_pmap) 2815 { 2816 struct pch newtail; 2817 struct pv_chunk *pc; 2818 struct md_page *pvh; 2819 pt1_entry_t *pte1p; 2820 pmap_t pmap; 2821 pt2_entry_t *pte2p, tpte2; 2822 pv_entry_t pv; 2823 vm_offset_t va; 2824 vm_page_t m, m_pc; 2825 struct spglist free; 2826 uint32_t inuse; 2827 int bit, field, freed; 2828 2829 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2830 pmap = NULL; 2831 m_pc = NULL; 2832 SLIST_INIT(&free); 2833 TAILQ_INIT(&newtail); 2834 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2835 SLIST_EMPTY(&free))) { 2836 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2837 if (pmap != pc->pc_pmap) { 2838 if (pmap != NULL) { 2839 if (pmap != locked_pmap) 2840 PMAP_UNLOCK(pmap); 2841 } 2842 pmap = pc->pc_pmap; 2843 /* Avoid deadlock and lock recursion. */ 2844 if (pmap > locked_pmap) 2845 PMAP_LOCK(pmap); 2846 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2847 pmap = NULL; 2848 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2849 continue; 2850 } 2851 } 2852 2853 /* 2854 * Destroy every non-wired, 4 KB page mapping in the chunk. 2855 */ 2856 freed = 0; 2857 for (field = 0; field < _NPCM; field++) { 2858 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2859 inuse != 0; inuse &= ~(1UL << bit)) { 2860 bit = ffs(inuse) - 1; 2861 pv = &pc->pc_pventry[field * 32 + bit]; 2862 va = pv->pv_va; 2863 pte1p = pmap_pte1(pmap, va); 2864 if (pte1_is_section(pte1_load(pte1p))) 2865 continue; 2866 pte2p = pmap_pte2(pmap, va); 2867 tpte2 = pte2_load(pte2p); 2868 if ((tpte2 & PTE2_W) == 0) 2869 tpte2 = pte2_load_clear(pte2p); 2870 pmap_pte2_release(pte2p); 2871 if ((tpte2 & PTE2_W) != 0) 2872 continue; 2873 KASSERT(tpte2 != 0, 2874 ("pmap_pv_reclaim: pmap %p va %#x zero pte", 2875 pmap, va)); 2876 pmap_tlb_flush(pmap, va); 2877 m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); 2878 if (pte2_is_dirty(tpte2)) 2879 vm_page_dirty(m); 2880 if ((tpte2 & PTE2_A) != 0) 2881 vm_page_aflag_set(m, PGA_REFERENCED); 2882 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2883 if (TAILQ_EMPTY(&m->md.pv_list) && 2884 (m->flags & PG_FICTITIOUS) == 0) { 2885 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2886 if (TAILQ_EMPTY(&pvh->pv_list)) { 2887 vm_page_aflag_clear(m, 2888 PGA_WRITEABLE); 2889 } 2890 } 2891 pc->pc_map[field] |= 1UL << bit; 2892 pmap_unuse_pt2(pmap, va, &free); 2893 freed++; 2894 } 2895 } 2896 if (freed == 0) { 2897 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2898 continue; 2899 } 2900 /* Every freed mapping is for a 4 KB page. */ 2901 pmap->pm_stats.resident_count -= freed; 2902 PV_STAT(pv_entry_frees += freed); 2903 PV_STAT(pv_entry_spare += freed); 2904 pv_entry_count -= freed; 2905 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2906 for (field = 0; field < _NPCM; field++) 2907 if (pc->pc_map[field] != pc_freemask[field]) { 2908 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2909 pc_list); 2910 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2911 2912 /* 2913 * One freed pv entry in locked_pmap is 2914 * sufficient. 2915 */ 2916 if (pmap == locked_pmap) 2917 goto out; 2918 break; 2919 } 2920 if (field == _NPCM) { 2921 PV_STAT(pv_entry_spare -= _NPCPV); 2922 PV_STAT(pc_chunk_count--); 2923 PV_STAT(pc_chunk_frees++); 2924 /* Entire chunk is free; return it. 
*/ 2925 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2926 pmap_qremove((vm_offset_t)pc, 1); 2927 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2928 break; 2929 } 2930 } 2931 out: 2932 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2933 if (pmap != NULL) { 2934 if (pmap != locked_pmap) 2935 PMAP_UNLOCK(pmap); 2936 } 2937 if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { 2938 m_pc = SLIST_FIRST(&free); 2939 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2940 /* Recycle a freed page table page. */ 2941 m_pc->wire_count = 1; 2942 atomic_add_int(&vm_cnt.v_wire_count, 1); 2943 } 2944 pmap_free_zero_pages(&free); 2945 return (m_pc); 2946 } 2947 2948 static void 2949 free_pv_chunk(struct pv_chunk *pc) 2950 { 2951 vm_page_t m; 2952 2953 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2954 PV_STAT(pv_entry_spare -= _NPCPV); 2955 PV_STAT(pc_chunk_count--); 2956 PV_STAT(pc_chunk_frees++); 2957 /* entire chunk is free, return it */ 2958 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2959 pmap_qremove((vm_offset_t)pc, 1); 2960 vm_page_unwire(m, PQ_NONE); 2961 vm_page_free(m); 2962 pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); 2963 } 2964 2965 /* 2966 * Free the pv_entry back to the free list. 2967 */ 2968 static void 2969 free_pv_entry(pmap_t pmap, pv_entry_t pv) 2970 { 2971 struct pv_chunk *pc; 2972 int idx, field, bit; 2973 2974 rw_assert(&pvh_global_lock, RA_WLOCKED); 2975 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2976 PV_STAT(pv_entry_frees++); 2977 PV_STAT(pv_entry_spare++); 2978 pv_entry_count--; 2979 pc = pv_to_chunk(pv); 2980 idx = pv - &pc->pc_pventry[0]; 2981 field = idx / 32; 2982 bit = idx % 32; 2983 pc->pc_map[field] |= 1ul << bit; 2984 for (idx = 0; idx < _NPCM; idx++) 2985 if (pc->pc_map[idx] != pc_freemask[idx]) { 2986 /* 2987 * 98% of the time, pc is already at the head of the 2988 * list. If it isn't already, move it to the head. 2989 */ 2990 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2991 pc)) { 2992 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2993 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2994 pc_list); 2995 } 2996 return; 2997 } 2998 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2999 free_pv_chunk(pc); 3000 } 3001 3002 /* 3003 * Get a new pv_entry, allocating a block from the system 3004 * when needed. 
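 *
 * Entries are carved out of pv chunks: each chunk is one page (see
 * the CTASSERTs above) holding _NPCPV (336) entries whose state is
 * tracked by the _NPCM (11) bitmap words in pc_map.  A new chunk
 * page is allocated, or stolen via pmap_pv_reclaim(), only when no
 * chunk of this pmap has a free slot left.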
3005 */ 3006 static pv_entry_t 3007 get_pv_entry(pmap_t pmap, boolean_t try) 3008 { 3009 static const struct timeval printinterval = { 60, 0 }; 3010 static struct timeval lastprint; 3011 int bit, field; 3012 pv_entry_t pv; 3013 struct pv_chunk *pc; 3014 vm_page_t m; 3015 3016 rw_assert(&pvh_global_lock, RA_WLOCKED); 3017 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3018 PV_STAT(pv_entry_allocs++); 3019 pv_entry_count++; 3020 if (pv_entry_count > pv_entry_high_water) 3021 if (ratecheck(&lastprint, &printinterval)) 3022 printf("Approaching the limit on PV entries, consider " 3023 "increasing either the vm.pmap.shpgperproc or the " 3024 "vm.pmap.pv_entry_max tunable.\n"); 3025 retry: 3026 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3027 if (pc != NULL) { 3028 for (field = 0; field < _NPCM; field++) { 3029 if (pc->pc_map[field]) { 3030 bit = ffs(pc->pc_map[field]) - 1; 3031 break; 3032 } 3033 } 3034 if (field < _NPCM) { 3035 pv = &pc->pc_pventry[field * 32 + bit]; 3036 pc->pc_map[field] &= ~(1ul << bit); 3037 /* If this was the last item, move it to tail */ 3038 for (field = 0; field < _NPCM; field++) 3039 if (pc->pc_map[field] != 0) { 3040 PV_STAT(pv_entry_spare--); 3041 return (pv); /* not full, return */ 3042 } 3043 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3044 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3045 PV_STAT(pv_entry_spare--); 3046 return (pv); 3047 } 3048 } 3049 /* 3050 * Access to the pte2list "pv_vafree" is synchronized by the pvh 3051 * global lock. If "pv_vafree" is currently non-empty, it will 3052 * remain non-empty until pmap_pte2list_alloc() completes. 3053 */ 3054 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 3055 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3056 if (try) { 3057 pv_entry_count--; 3058 PV_STAT(pc_chunk_tryfail++); 3059 return (NULL); 3060 } 3061 m = pmap_pv_reclaim(pmap); 3062 if (m == NULL) 3063 goto retry; 3064 } 3065 PV_STAT(pc_chunk_count++); 3066 PV_STAT(pc_chunk_allocs++); 3067 pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); 3068 pmap_qenter((vm_offset_t)pc, &m, 1); 3069 pc->pc_pmap = pmap; 3070 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 3071 for (field = 1; field < _NPCM; field++) 3072 pc->pc_map[field] = pc_freemask[field]; 3073 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 3074 pv = &pc->pc_pventry[0]; 3075 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3076 PV_STAT(pv_entry_spare += _NPCPV - 1); 3077 return (pv); 3078 } 3079 3080 /* 3081 * Create a pv entry for page at pa for 3082 * (pmap, va). 
3083 */ 3084 static void 3085 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3086 { 3087 pv_entry_t pv; 3088 3089 rw_assert(&pvh_global_lock, RA_WLOCKED); 3090 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3091 pv = get_pv_entry(pmap, FALSE); 3092 pv->pv_va = va; 3093 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3094 } 3095 3096 static __inline pv_entry_t 3097 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3098 { 3099 pv_entry_t pv; 3100 3101 rw_assert(&pvh_global_lock, RA_WLOCKED); 3102 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3103 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3104 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3105 break; 3106 } 3107 } 3108 return (pv); 3109 } 3110 3111 static void 3112 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3113 { 3114 pv_entry_t pv; 3115 3116 pv = pmap_pvh_remove(pvh, pmap, va); 3117 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3118 free_pv_entry(pmap, pv); 3119 } 3120 3121 static void 3122 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 3123 { 3124 struct md_page *pvh; 3125 3126 rw_assert(&pvh_global_lock, RA_WLOCKED); 3127 pmap_pvh_free(&m->md, pmap, va); 3128 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 3129 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3130 if (TAILQ_EMPTY(&pvh->pv_list)) 3131 vm_page_aflag_clear(m, PGA_WRITEABLE); 3132 } 3133 } 3134 3135 static void 3136 pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3137 { 3138 struct md_page *pvh; 3139 pv_entry_t pv; 3140 vm_offset_t va_last; 3141 vm_page_t m; 3142 3143 rw_assert(&pvh_global_lock, RA_WLOCKED); 3144 KASSERT((pa & PTE1_OFFSET) == 0, 3145 ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); 3146 3147 /* 3148 * Transfer the 1mpage's pv entry for this mapping to the first 3149 * page's pv list. 3150 */ 3151 pvh = pa_to_pvh(pa); 3152 va = pte1_trunc(va); 3153 pv = pmap_pvh_remove(pvh, pmap, va); 3154 KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); 3155 m = PHYS_TO_VM_PAGE(pa); 3156 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3157 /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ 3158 va_last = va + PTE1_SIZE - PAGE_SIZE; 3159 do { 3160 m++; 3161 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3162 ("pmap_pv_demote_pte1: page %p is not managed", m)); 3163 va += PAGE_SIZE; 3164 pmap_insert_entry(pmap, va, m); 3165 } while (va < va_last); 3166 } 3167 3168 static void 3169 pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3170 { 3171 struct md_page *pvh; 3172 pv_entry_t pv; 3173 vm_offset_t va_last; 3174 vm_page_t m; 3175 3176 rw_assert(&pvh_global_lock, RA_WLOCKED); 3177 KASSERT((pa & PTE1_OFFSET) == 0, 3178 ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); 3179 3180 /* 3181 * Transfer the first page's pv entry for this mapping to the 3182 * 1mpage's pv list. Aside from avoiding the cost of a call 3183 * to get_pv_entry(), a transfer avoids the possibility that 3184 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() 3185 * removes one of the mappings that is being promoted. 3186 */ 3187 m = PHYS_TO_VM_PAGE(pa); 3188 va = pte1_trunc(va); 3189 pv = pmap_pvh_remove(&m->md, pmap, va); 3190 KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); 3191 pvh = pa_to_pvh(pa); 3192 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3193 /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. 
*/ 3194 va_last = va + PTE1_SIZE - PAGE_SIZE; 3195 do { 3196 m++; 3197 va += PAGE_SIZE; 3198 pmap_pvh_free(&m->md, pmap, va); 3199 } while (va < va_last); 3200 } 3201 3202 /* 3203 * Conditionally create a pv entry. 3204 */ 3205 static boolean_t 3206 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 3207 { 3208 pv_entry_t pv; 3209 3210 rw_assert(&pvh_global_lock, RA_WLOCKED); 3211 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3212 if (pv_entry_count < pv_entry_high_water && 3213 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 3214 pv->pv_va = va; 3215 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3216 return (TRUE); 3217 } else 3218 return (FALSE); 3219 } 3220 3221 /* 3222 * Create the pv entries for each of the pages within a section. 3223 */ 3224 static boolean_t 3225 pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 3226 { 3227 struct md_page *pvh; 3228 pv_entry_t pv; 3229 3230 rw_assert(&pvh_global_lock, RA_WLOCKED); 3231 if (pv_entry_count < pv_entry_high_water && 3232 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 3233 pv->pv_va = va; 3234 pvh = pa_to_pvh(pa); 3235 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3236 return (TRUE); 3237 } else 3238 return (FALSE); 3239 } 3240 3241 static inline void 3242 pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) 3243 { 3244 3245 /* Kill all the small mappings or the big one only. */ 3246 if (pte1_is_section(npte1)) 3247 pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); 3248 else 3249 pmap_tlb_flush(pmap, pte1_trunc(va)); 3250 } 3251 3252 /* 3253 * Update kernel pte1 on all pmaps. 3254 * 3255 * The following function is called only on one cpu with disabled interrupts. 3256 * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way 3257 * nobody can invoke explicit hardware table walk during the update of pte1. 3258 * Unsolicited hardware table walk can still happen, invoked by speculative 3259 * data or instruction prefetch or even by speculative hardware table walk. 3260 * 3261 * The break-before-make approach should be implemented here. However, it's 3262 * not so easy to do that for kernel mappings as it would be unhappy to unmap 3263 * itself unexpectedly but voluntarily. 3264 */ 3265 static void 3266 pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) 3267 { 3268 pmap_t pmap; 3269 pt1_entry_t *pte1p; 3270 3271 /* 3272 * Get current pmap. Interrupts should be disabled here 3273 * so PCPU_GET() is done atomically. 3274 */ 3275 pmap = PCPU_GET(curpmap); 3276 if (pmap == NULL) 3277 pmap = kernel_pmap; 3278 3279 /* 3280 * (1) Change pte1 on current pmap. 3281 * (2) Flush all obsolete TLB entries on current CPU. 3282 * (3) Change pte1 on all pmaps. 3283 * (4) Flush all obsolete TLB entries on all CPUs in SMP case. 3284 */ 3285 3286 pte1p = pmap_pte1(pmap, va); 3287 pte1_store(pte1p, npte1); 3288 3289 /* Kill all the small mappings or the big one only. */ 3290 if (pte1_is_section(npte1)) { 3291 pmap_pte1_kern_promotions++; 3292 tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); 3293 } else { 3294 pmap_pte1_kern_demotions++; 3295 tlb_flush_local(pte1_trunc(va)); 3296 } 3297 3298 /* 3299 * In SMP case, this function is called when all cpus are at smp 3300 * rendezvous, so there is no need to use 'allpmaps_lock' lock here. 3301 * In UP case, the function is called with this lock locked. 3302 */ 3303 LIST_FOREACH(pmap, &allpmaps, pm_list) { 3304 pte1p = pmap_pte1(pmap, va); 3305 pte1_store(pte1p, npte1); 3306 } 3307 3308 #ifdef SMP 3309 /* Kill all the small mappings or the big one only. 
*/ 3310 if (pte1_is_section(npte1)) 3311 tlb_flush_range(pte1_trunc(va), PTE1_SIZE); 3312 else 3313 tlb_flush(pte1_trunc(va)); 3314 #endif 3315 } 3316 3317 #ifdef SMP 3318 struct pte1_action { 3319 vm_offset_t va; 3320 pt1_entry_t npte1; 3321 u_int update; /* CPU that updates the PTE1 */ 3322 }; 3323 3324 static void 3325 pmap_update_pte1_action(void *arg) 3326 { 3327 struct pte1_action *act = arg; 3328 3329 if (act->update == PCPU_GET(cpuid)) 3330 pmap_update_pte1_kernel(act->va, act->npte1); 3331 } 3332 3333 /* 3334 * Change pte1 on current pmap. 3335 * Note that kernel pte1 must be changed on all pmaps. 3336 * 3337 * According to the architecture reference manual published by ARM, 3338 * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. 3339 * According to this manual, UNPREDICTABLE behaviours must never happen in 3340 * a viable system. In contrast, on x86 processors, it is not specified which 3341 * TLB entry mapping the virtual address will be used, but the MMU doesn't 3342 * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone 3343 * Black). 3344 * 3345 * It's a problem when either promotion or demotion is being done. The pte1 3346 * update and appropriate TLB flush must be done atomically in general. 3347 */ 3348 static void 3349 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3350 pt1_entry_t npte1) 3351 { 3352 3353 if (pmap == kernel_pmap) { 3354 struct pte1_action act; 3355 3356 sched_pin(); 3357 act.va = va; 3358 act.npte1 = npte1; 3359 act.update = PCPU_GET(cpuid); 3360 smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier, 3361 pmap_update_pte1_action, NULL, &act); 3362 sched_unpin(); 3363 } else { 3364 register_t cspr; 3365 3366 /* 3367 * Use break-before-make approach for changing userland 3368 * mappings. It can cause L1 translation aborts on other 3369 * cores in SMP case. So, special treatment is implemented 3370 * in pmap_fault(). To reduce the likelihood that another core 3371 * will be affected by the broken mapping, disable interrupts 3372 * until the mapping change is completed. 3373 */ 3374 cspr = disable_interrupts(PSR_I | PSR_F); 3375 pte1_clear(pte1p); 3376 pmap_tlb_flush_pte1(pmap, va, npte1); 3377 pte1_store(pte1p, npte1); 3378 restore_interrupts(cspr); 3379 } 3380 } 3381 #else 3382 static void 3383 pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, 3384 pt1_entry_t npte1) 3385 { 3386 3387 if (pmap == kernel_pmap) { 3388 mtx_lock_spin(&allpmaps_lock); 3389 pmap_update_pte1_kernel(va, npte1); 3390 mtx_unlock_spin(&allpmaps_lock); 3391 } else { 3392 register_t cspr; 3393 3394 /* 3395 * Use break-before-make approach for changing userland 3396 * mappings. It's absolutely safe in UP case when interrupts 3397 * are disabled. 3398 */ 3399 cspr = disable_interrupts(PSR_I | PSR_F); 3400 pte1_clear(pte1p); 3401 pmap_tlb_flush_pte1(pmap, va, npte1); 3402 pte1_store(pte1p, npte1); 3403 restore_interrupts(cspr); 3404 } 3405 } 3406 #endif 3407 3408 /* 3409 * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are 3410 * within a single page table page (PT2) to a single 1MB page mapping. 3411 * For promotion to occur, two conditions must be met: (1) the 4KB page 3412 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3413 * mappings must have identical characteristics. 3414 * 3415 * Managed (PG_MANAGED) mappings within the kernel address space are not 3416 * promoted. 
The reason is that kernel PTE1s are replicated in each pmap but 3417 * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only 3418 * read the PTE1 from the kernel pmap. 3419 */ 3420 static void 3421 pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3422 { 3423 pt1_entry_t npte1; 3424 pt2_entry_t *fpte2p, fpte2, fpte2_fav; 3425 pt2_entry_t *pte2p, pte2; 3426 vm_offset_t pteva __unused; 3427 vm_page_t m __unused; 3428 3429 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3430 pmap, va, pte1_load(pte1p), pte1p)); 3431 3432 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3433 3434 /* 3435 * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is 3436 * either invalid, unused, or does not map the first 4KB physical page 3437 * within a 1MB page. 3438 */ 3439 fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); 3440 fpte2 = pte2_load(fpte2p); 3441 if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != 3442 (PTE2_A | PTE2_V)) { 3443 pmap_pte1_p_failures++; 3444 CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", 3445 __func__, va, pmap); 3446 return; 3447 } 3448 if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { 3449 pmap_pte1_p_failures++; 3450 CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", 3451 __func__, va, pmap); 3452 return; 3453 } 3454 if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3455 /* 3456 * When page is not modified, PTE2_RO can be set without 3457 * a TLB invalidation. 3458 */ 3459 fpte2 |= PTE2_RO; 3460 pte2_store(fpte2p, fpte2); 3461 } 3462 3463 /* 3464 * Examine each of the other PTE2s in the specified PT2. Abort if this 3465 * PTE2 maps an unexpected 4KB physical page or does not have identical 3466 * characteristics to the first PTE2. 3467 */ 3468 fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); 3469 fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ 3470 for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { 3471 pte2 = pte2_load(pte2p); 3472 if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { 3473 pmap_pte1_p_failures++; 3474 CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", 3475 __func__, va, pmap); 3476 return; 3477 } 3478 if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { 3479 /* 3480 * When page is not modified, PTE2_RO can be set 3481 * without a TLB invalidation. See note above. 3482 */ 3483 pte2 |= PTE2_RO; 3484 pte2_store(pte2p, pte2); 3485 pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & 3486 PTE2_FRAME); 3487 CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", 3488 __func__, pteva, pmap); 3489 } 3490 if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { 3491 pmap_pte1_p_failures++; 3492 CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", 3493 __func__, va, pmap); 3494 return; 3495 } 3496 3497 fpte2_fav -= PTE2_SIZE; 3498 } 3499 /* 3500 * The page table page in its current state will stay in PT2TAB 3501 * until the PTE1 mapping the section is demoted by pmap_demote_pte1() 3502 * or destroyed by pmap_remove_pte1(). 3503 * 3504 * Note that L2 page table size is not equal to PAGE_SIZE. 3505 */ 3506 m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); 3507 KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], 3508 ("%s: PT2 page is out of range", __func__)); 3509 KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), 3510 ("%s: PT2 page's pindex is wrong", __func__)); 3511 3512 /* 3513 * Get pte1 from pte2 format. 3514 */ 3515 npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; 3516 3517 /* 3518 * Promote the pv entries. 
3519 */ 3520 if (pte2_is_managed(fpte2)) 3521 pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); 3522 3523 /* 3524 * Promote the mappings. 3525 */ 3526 pmap_change_pte1(pmap, pte1p, va, npte1); 3527 3528 pmap_pte1_promotions++; 3529 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3530 __func__, va, pmap); 3531 3532 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3533 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3534 } 3535 3536 /* 3537 * Zero L2 page table page. 3538 */ 3539 static __inline void 3540 pmap_clear_pt2(pt2_entry_t *fpte2p) 3541 { 3542 pt2_entry_t *pte2p; 3543 3544 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) 3545 pte2_clear(pte2p); 3546 3547 } 3548 3549 /* 3550 * Removes a 1MB page mapping from the kernel pmap. 3551 */ 3552 static void 3553 pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3554 { 3555 vm_page_t m; 3556 uint32_t pte1_idx; 3557 pt2_entry_t *fpte2p; 3558 vm_paddr_t pt2_pa; 3559 3560 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3561 m = pmap_pt2_page(pmap, va); 3562 if (m == NULL) 3563 /* 3564 * QQQ: Is this function called only on promoted pte1? 3565 * We certainly do section mappings directly 3566 * (without promotion) in kernel !!! 3567 */ 3568 panic("%s: missing pt2 page", __func__); 3569 3570 pte1_idx = pte1_index(va); 3571 3572 /* 3573 * Initialize the L2 page table. 3574 */ 3575 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3576 pmap_clear_pt2(fpte2p); 3577 3578 /* 3579 * Remove the mapping. 3580 */ 3581 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); 3582 pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); 3583 3584 /* 3585 * QQQ: We do not need to invalidate PT2MAP mapping 3586 * as we did not change it. I.e. the L2 page table page 3587 * was and still is mapped the same way. 3588 */ 3589 } 3590 3591 /* 3592 * Do the things to unmap a section in a process 3593 */ 3594 static void 3595 pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, 3596 struct spglist *free) 3597 { 3598 pt1_entry_t opte1; 3599 struct md_page *pvh; 3600 vm_offset_t eva, va; 3601 vm_page_t m; 3602 3603 PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, 3604 pte1_load(pte1p), pte1p)); 3605 3606 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3607 KASSERT((sva & PTE1_OFFSET) == 0, 3608 ("%s: sva is not 1mpage aligned", __func__)); 3609 3610 /* 3611 * Clear and invalidate the mapping. It should occupy one and only TLB 3612 * entry. So, pmap_tlb_flush() called with aligned address should be 3613 * sufficient. 3614 */ 3615 opte1 = pte1_load_clear(pte1p); 3616 pmap_tlb_flush(pmap, sva); 3617 3618 if (pte1_is_wired(opte1)) 3619 pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; 3620 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; 3621 if (pte1_is_managed(opte1)) { 3622 pvh = pa_to_pvh(pte1_pa(opte1)); 3623 pmap_pvh_free(pvh, pmap, sva); 3624 eva = sva + PTE1_SIZE; 3625 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); 3626 va < eva; va += PAGE_SIZE, m++) { 3627 if (pte1_is_dirty(opte1)) 3628 vm_page_dirty(m); 3629 if (opte1 & PTE1_A) 3630 vm_page_aflag_set(m, PGA_REFERENCED); 3631 if (TAILQ_EMPTY(&m->md.pv_list) && 3632 TAILQ_EMPTY(&pvh->pv_list)) 3633 vm_page_aflag_clear(m, PGA_WRITEABLE); 3634 } 3635 } 3636 if (pmap == kernel_pmap) { 3637 /* 3638 * L2 page table(s) can't be removed from kernel map as 3639 * kernel counts on it (stuff around pmap_growkernel()). 3640 */ 3641 pmap_remove_kernel_pte1(pmap, pte1p, sva); 3642 } else { 3643 /* 3644 * Get associated L2 page table page. 
3645 * It's possible that the page was never allocated. 3646 */ 3647 m = pmap_pt2_page(pmap, sva); 3648 if (m != NULL) 3649 pmap_unwire_pt2_all(pmap, sva, m, free); 3650 } 3651 } 3652 3653 /* 3654 * Fills L2 page table page with mappings to consecutive physical pages. 3655 */ 3656 static __inline void 3657 pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) 3658 { 3659 pt2_entry_t *pte2p; 3660 3661 for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { 3662 pte2_store(pte2p, npte2); 3663 npte2 += PTE2_SIZE; 3664 } 3665 } 3666 3667 /* 3668 * Tries to demote a 1MB page mapping. If demotion fails, the 3669 * 1MB page mapping is invalidated. 3670 */ 3671 static boolean_t 3672 pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) 3673 { 3674 pt1_entry_t opte1, npte1; 3675 pt2_entry_t *fpte2p, npte2; 3676 vm_paddr_t pt2pg_pa, pt2_pa; 3677 vm_page_t m; 3678 struct spglist free; 3679 uint32_t pte1_idx, isnew = 0; 3680 3681 PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, 3682 pmap, va, pte1_load(pte1p), pte1p)); 3683 3684 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3685 3686 opte1 = pte1_load(pte1p); 3687 KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); 3688 3689 if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { 3690 KASSERT(!pte1_is_wired(opte1), 3691 ("%s: PT2 page for a wired mapping is missing", __func__)); 3692 3693 /* 3694 * Invalidate the 1MB page mapping and return 3695 * "failure" if the mapping was never accessed or the 3696 * allocation of the new page table page fails. 3697 */ 3698 if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, 3699 pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | 3700 VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { 3701 SLIST_INIT(&free); 3702 pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); 3703 pmap_free_zero_pages(&free); 3704 CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", 3705 __func__, va, pmap); 3706 return (FALSE); 3707 } 3708 if (va < VM_MAXUSER_ADDRESS) 3709 pmap->pm_stats.resident_count++; 3710 3711 isnew = 1; 3712 3713 /* 3714 * We init all L2 page tables in the page even if 3715 * we are going to change everything for one L2 page 3716 * table in a while. 3717 */ 3718 pt2pg_pa = pmap_pt2pg_init(pmap, va, m); 3719 } else { 3720 if (va < VM_MAXUSER_ADDRESS) { 3721 if (pt2_is_empty(m, va)) 3722 isnew = 1; /* Demoting section w/o promotion. */ 3723 #ifdef INVARIANTS 3724 else 3725 KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" 3726 " count %u", __func__, 3727 pt2_wirecount_get(m, pte1_index(va)))); 3728 #endif 3729 } 3730 } 3731 3732 pt2pg_pa = VM_PAGE_TO_PHYS(m); 3733 pte1_idx = pte1_index(va); 3734 /* 3735 * If the pmap is current, then the PT2MAP can provide access to 3736 * the page table page (promoted L2 page tables are not unmapped). 3737 * Otherwise, temporarily map the L2 page table page (m) into 3738 * the kernel's address space at either PADDR1 or PADDR2. 3739 * 3740 * Note that L2 page table size is not equal to PAGE_SIZE. 
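 *
 * PADDR1 is used when the caller already holds the pvh global lock
 * and has the thread pinned (matching the pmap_pte2_quick() rules);
 * otherwise the PADDR2 window protected by PMAP2mutex is taken.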
3741 */ 3742 if (pmap_is_current(pmap)) 3743 fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); 3744 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 3745 if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { 3746 pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); 3747 #ifdef SMP 3748 PMAP1cpu = PCPU_GET(cpuid); 3749 #endif 3750 tlb_flush_local((vm_offset_t)PADDR1); 3751 PMAP1changed++; 3752 } else 3753 #ifdef SMP 3754 if (PMAP1cpu != PCPU_GET(cpuid)) { 3755 PMAP1cpu = PCPU_GET(cpuid); 3756 tlb_flush_local((vm_offset_t)PADDR1); 3757 PMAP1changedcpu++; 3758 } else 3759 #endif 3760 PMAP1unchanged++; 3761 fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); 3762 } else { 3763 mtx_lock(&PMAP2mutex); 3764 if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { 3765 pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); 3766 tlb_flush((vm_offset_t)PADDR2); 3767 } 3768 fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); 3769 } 3770 pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); 3771 npte1 = PTE1_LINK(pt2_pa); 3772 3773 KASSERT((opte1 & PTE1_A) != 0, 3774 ("%s: opte1 is missing PTE1_A", __func__)); 3775 KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, 3776 ("%s: opte1 has PTE1_NM", __func__)); 3777 3778 /* 3779 * Get pte2 from pte1 format. 3780 */ 3781 npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; 3782 3783 /* 3784 * If the L2 page table page is new, initialize it. If the mapping 3785 * has changed attributes, update the page table entries. 3786 */ 3787 if (isnew != 0) { 3788 pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); 3789 pmap_fill_pt2(fpte2p, npte2); 3790 } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != 3791 (npte2 & PTE2_PROMOTE)) 3792 pmap_fill_pt2(fpte2p, npte2); 3793 3794 KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), 3795 ("%s: fpte2p and npte2 map different physical addresses", 3796 __func__)); 3797 3798 if (fpte2p == PADDR2) 3799 mtx_unlock(&PMAP2mutex); 3800 3801 /* 3802 * Demote the mapping. This pmap is locked. The old PTE1 has 3803 * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also 3804 * has not PTE1_NM set. Thus, there is no danger of a race with 3805 * another processor changing the setting of PTE1_A and/or PTE1_NM 3806 * between the read above and the store below. 3807 */ 3808 pmap_change_pte1(pmap, pte1p, va, npte1); 3809 3810 /* 3811 * Demote the pv entry. This depends on the earlier demotion 3812 * of the mapping. Specifically, the (re)creation of a per- 3813 * page pv entry might trigger the execution of pmap_pv_reclaim(), 3814 * which might reclaim a newly (re)created per-page pv entry 3815 * and destroy the associated mapping. In order to destroy 3816 * the mapping, the PTE1 must have already changed from mapping 3817 * the 1mpage to referencing the page table page. 3818 */ 3819 if (pte1_is_managed(opte1)) 3820 pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); 3821 3822 pmap_pte1_demotions++; 3823 CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", 3824 __func__, va, pmap); 3825 3826 PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", 3827 __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); 3828 return (TRUE); 3829 } 3830 3831 /* 3832 * Insert the given physical page (p) at 3833 * the specified virtual address (v) in the 3834 * target physical map with the protection requested. 3835 * 3836 * If specified, the page will be wired down, meaning 3837 * that the related pte can not be reclaimed. 3838 * 3839 * NB: This is the only routine which MAY NOT lazy-evaluate 3840 * or lose information. That is, this routine must actually 3841 * insert this page into the given map NOW. 
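 *
 * An illustrative (hypothetical) call; for a managed page the caller
 * holds the page's object lock or has the page exclusive busied, the
 * access type is passed in "flags" together with the PMAP_ENTER_*
 * bits, and psind 0 requests a small (4KB) mapping:
 *
 *	(void)pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_READ | VM_PROT_WRITE | PMAP_ENTER_WIRED, 0);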
3842 */ 3843 int 3844 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3845 u_int flags, int8_t psind) 3846 { 3847 pt1_entry_t *pte1p; 3848 pt2_entry_t *pte2p; 3849 pt2_entry_t npte2, opte2; 3850 pv_entry_t pv; 3851 vm_paddr_t opa, pa; 3852 vm_page_t mpte2, om; 3853 boolean_t wired; 3854 3855 va = trunc_page(va); 3856 mpte2 = NULL; 3857 wired = (flags & PMAP_ENTER_WIRED) != 0; 3858 3859 KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); 3860 KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, 3861 ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, 3862 va)); 3863 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 3864 VM_OBJECT_ASSERT_LOCKED(m->object); 3865 3866 rw_wlock(&pvh_global_lock); 3867 PMAP_LOCK(pmap); 3868 sched_pin(); 3869 3870 /* 3871 * In the case that a page table page is not 3872 * resident, we are creating it here. 3873 */ 3874 if (va < VM_MAXUSER_ADDRESS) { 3875 mpte2 = pmap_allocpte2(pmap, va, flags); 3876 if (mpte2 == NULL) { 3877 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3878 ("pmap_allocpte2 failed with sleep allowed")); 3879 sched_unpin(); 3880 rw_wunlock(&pvh_global_lock); 3881 PMAP_UNLOCK(pmap); 3882 return (KERN_RESOURCE_SHORTAGE); 3883 } 3884 } 3885 pte1p = pmap_pte1(pmap, va); 3886 if (pte1_is_section(pte1_load(pte1p))) 3887 panic("%s: attempted on 1MB page", __func__); 3888 pte2p = pmap_pte2_quick(pmap, va); 3889 if (pte2p == NULL) 3890 panic("%s: invalid L1 page table entry va=%#x", __func__, va); 3891 3892 om = NULL; 3893 pa = VM_PAGE_TO_PHYS(m); 3894 opte2 = pte2_load(pte2p); 3895 opa = pte2_pa(opte2); 3896 /* 3897 * Mapping has not changed, must be protection or wiring change. 3898 */ 3899 if (pte2_is_valid(opte2) && (opa == pa)) { 3900 /* 3901 * Wiring change, just update stats. We don't worry about 3902 * wiring PT2 pages as they remain resident as long as there 3903 * are valid mappings in them. Hence, if a user page is wired, 3904 * the PT2 page will be also. 3905 */ 3906 if (wired && !pte2_is_wired(opte2)) 3907 pmap->pm_stats.wired_count++; 3908 else if (!wired && pte2_is_wired(opte2)) 3909 pmap->pm_stats.wired_count--; 3910 3911 /* 3912 * Remove extra pte2 reference 3913 */ 3914 if (mpte2) 3915 pt2_wirecount_dec(mpte2, pte1_index(va)); 3916 if (pte2_is_managed(opte2)) 3917 om = m; 3918 goto validate; 3919 } 3920 3921 /* 3922 * QQQ: We think that changing physical address on writeable mapping 3923 * is not safe. Well, maybe on kernel address space with correct 3924 * locking, it can make a sense. However, we have no idea why 3925 * anyone should do that on user address space. Are we wrong? 3926 */ 3927 KASSERT((opa == 0) || (opa == pa) || 3928 !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), 3929 ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", 3930 __func__, pmap, va, opte2, opa, pa, flags, prot)); 3931 3932 pv = NULL; 3933 3934 /* 3935 * Mapping has changed, invalidate old range and fall through to 3936 * handle validating new mapping. 3937 */ 3938 if (opa) { 3939 if (pte2_is_wired(opte2)) 3940 pmap->pm_stats.wired_count--; 3941 if (pte2_is_managed(opte2)) { 3942 om = PHYS_TO_VM_PAGE(opa); 3943 pv = pmap_pvh_remove(&om->md, pmap, va); 3944 } 3945 /* 3946 * Remove extra pte2 reference 3947 */ 3948 if (mpte2 != NULL) 3949 pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); 3950 } else 3951 pmap->pm_stats.resident_count++; 3952 3953 /* 3954 * Enter on the PV list if part of our managed memory. 
3955 */
3956 if ((m->oflags & VPO_UNMANAGED) == 0) {
3957 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3958 ("%s: managed mapping within the clean submap", __func__));
3959 if (pv == NULL)
3960 pv = get_pv_entry(pmap, FALSE);
3961 pv->pv_va = va;
3962 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3963 } else if (pv != NULL)
3964 free_pv_entry(pmap, pv);
3965
3966 /*
3967 * Increment counters
3968 */
3969 if (wired)
3970 pmap->pm_stats.wired_count++;
3971
3972 validate:
3973 /*
3974 * Now validate mapping with desired protection/wiring.
3975 */
3976 npte2 = PTE2(pa, PTE2_NM, vm_page_pte2_attr(m));
3977 if (prot & VM_PROT_WRITE) {
3978 if (pte2_is_managed(npte2))
3979 vm_page_aflag_set(m, PGA_WRITEABLE);
3980 }
3981 else
3982 npte2 |= PTE2_RO;
3983 if ((prot & VM_PROT_EXECUTE) == 0)
3984 npte2 |= PTE2_NX;
3985 if (wired)
3986 npte2 |= PTE2_W;
3987 if (va < VM_MAXUSER_ADDRESS)
3988 npte2 |= PTE2_U;
3989 if (pmap != kernel_pmap)
3990 npte2 |= PTE2_NG;
3991
3992 /*
3993 * If the mapping or permission bits are different, we need
3994 * to update the pte2.
3995 *
3996 * QQQ: Think again and again what to do
3997 * if the mapping is going to be changed!
3998 */
3999 if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) {
4000 /*
4001 * Sync the icache if exec permission and the attribute
4002 * VM_MEMATTR_WB_WA are set. Do it now, before the mapping is stored
4003 * and made valid for hardware table walk. If done later, there is a
4004 * race for other threads of the current process in the lazy-loading case.
4005 * Don't do it for kernel memory which is mapped with exec
4006 * permission even if the memory isn't going to hold executable
4007 * code. The only time when an icache sync is needed is after a
4008 * kernel module is loaded and the relocation info is processed.
4009 * And that's done in elf_cpu_load_file().
4010 *
4011 * QQQ: (1) Is there any better way or place
4012 * to sync the icache?
4013 * (2) Now, we do it on a page basis.
4014 */
4015 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
4016 m->md.pat_mode == VM_MEMATTR_WB_WA &&
4017 (opa != pa || (opte2 & PTE2_NX)))
4018 cache_icache_sync_fresh(va, pa, PAGE_SIZE);
4019
4020 npte2 |= PTE2_A;
4021 if (flags & VM_PROT_WRITE)
4022 npte2 &= ~PTE2_NM;
4023 if (opte2 & PTE2_V) {
4024 /* Change mapping with break-before-make approach. */
4025 opte2 = pte2_load_clear(pte2p);
4026 pmap_tlb_flush(pmap, va);
4027 pte2_store(pte2p, npte2);
4028 if (opte2 & PTE2_A) {
4029 if (pte2_is_managed(opte2))
4030 vm_page_aflag_set(om, PGA_REFERENCED);
4031 }
4032 if (pte2_is_dirty(opte2)) {
4033 if (pte2_is_managed(opte2))
4034 vm_page_dirty(om);
4035 }
4036 if (pte2_is_managed(opte2) &&
4037 TAILQ_EMPTY(&om->md.pv_list) &&
4038 ((om->flags & PG_FICTITIOUS) != 0 ||
4039 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4040 vm_page_aflag_clear(om, PGA_WRITEABLE);
4041 } else
4042 pte2_store(pte2p, npte2);
4043 }
4044 #if 0
4045 else {
4046 /*
4047 * QQQ: In times when both the access and modified bits are
4048 * emulated by software, this should not happen. Some
4049 * analysis is needed if this really happens. A missing
4050 * TLB flush somewhere could be the reason.
4051 */
4052 panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap,
4053 va, opte2, npte2);
4054 }
4055 #endif
4056 /*
4057 * If both the L2 page table page and the reservation are fully
4058 * populated, then attempt promotion.
4059 */ 4060 if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && 4061 sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && 4062 vm_reserv_level_iffullpop(m) == 0) 4063 pmap_promote_pte1(pmap, pte1p, va); 4064 sched_unpin(); 4065 rw_wunlock(&pvh_global_lock); 4066 PMAP_UNLOCK(pmap); 4067 return (KERN_SUCCESS); 4068 } 4069 4070 /* 4071 * Do the things to unmap a page in a process. 4072 */ 4073 static int 4074 pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, 4075 struct spglist *free) 4076 { 4077 pt2_entry_t opte2; 4078 vm_page_t m; 4079 4080 rw_assert(&pvh_global_lock, RA_WLOCKED); 4081 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4082 4083 /* Clear and invalidate the mapping. */ 4084 opte2 = pte2_load_clear(pte2p); 4085 pmap_tlb_flush(pmap, va); 4086 4087 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", 4088 __func__, pmap, va, opte2)); 4089 4090 if (opte2 & PTE2_W) 4091 pmap->pm_stats.wired_count -= 1; 4092 pmap->pm_stats.resident_count -= 1; 4093 if (pte2_is_managed(opte2)) { 4094 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 4095 if (pte2_is_dirty(opte2)) 4096 vm_page_dirty(m); 4097 if (opte2 & PTE2_A) 4098 vm_page_aflag_set(m, PGA_REFERENCED); 4099 pmap_remove_entry(pmap, m, va); 4100 } 4101 return (pmap_unuse_pt2(pmap, va, free)); 4102 } 4103 4104 /* 4105 * Remove a single page from a process address space. 4106 */ 4107 static void 4108 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 4109 { 4110 pt2_entry_t *pte2p; 4111 4112 rw_assert(&pvh_global_lock, RA_WLOCKED); 4113 KASSERT(curthread->td_pinned > 0, 4114 ("%s: curthread not pinned", __func__)); 4115 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4116 if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || 4117 !pte2_is_valid(pte2_load(pte2p))) 4118 return; 4119 pmap_remove_pte2(pmap, pte2p, va, free); 4120 } 4121 4122 /* 4123 * Remove the given range of addresses from the specified map. 4124 * 4125 * It is assumed that the start and end are properly 4126 * rounded to the page size. 4127 */ 4128 void 4129 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4130 { 4131 vm_offset_t nextva; 4132 pt1_entry_t *pte1p, pte1; 4133 pt2_entry_t *pte2p, pte2; 4134 struct spglist free; 4135 4136 /* 4137 * Perform an unsynchronized read. This is, however, safe. 4138 */ 4139 if (pmap->pm_stats.resident_count == 0) 4140 return; 4141 4142 SLIST_INIT(&free); 4143 4144 rw_wlock(&pvh_global_lock); 4145 sched_pin(); 4146 PMAP_LOCK(pmap); 4147 4148 /* 4149 * Special handling of removing one page. A very common 4150 * operation and easy to short circuit some code. 4151 */ 4152 if (sva + PAGE_SIZE == eva) { 4153 pte1 = pte1_load(pmap_pte1(pmap, sva)); 4154 if (pte1_is_link(pte1)) { 4155 pmap_remove_page(pmap, sva, &free); 4156 goto out; 4157 } 4158 } 4159 4160 for (; sva < eva; sva = nextva) { 4161 /* 4162 * Calculate address for next L2 page table. 4163 */ 4164 nextva = pte1_trunc(sva + PTE1_SIZE); 4165 if (nextva < sva) 4166 nextva = eva; 4167 if (pmap->pm_stats.resident_count == 0) 4168 break; 4169 4170 pte1p = pmap_pte1(pmap, sva); 4171 pte1 = pte1_load(pte1p); 4172 4173 /* 4174 * Weed out invalid mappings. Note: we assume that the L1 page 4175 * table is always allocated, and in kernel virtual. 4176 */ 4177 if (pte1 == 0) 4178 continue; 4179 4180 if (pte1_is_section(pte1)) { 4181 /* 4182 * Are we removing the entire large page? If not, 4183 * demote the mapping and fall through. 
4184 */ 4185 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4186 pmap_remove_pte1(pmap, pte1p, sva, &free); 4187 continue; 4188 } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4189 /* The large page mapping was destroyed. */ 4190 continue; 4191 } 4192 #ifdef INVARIANTS 4193 else { 4194 /* Update pte1 after demotion. */ 4195 pte1 = pte1_load(pte1p); 4196 } 4197 #endif 4198 } 4199 4200 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4201 " is not link", __func__, pmap, sva, pte1, pte1p)); 4202 4203 /* 4204 * Limit our scan to either the end of the va represented 4205 * by the current L2 page table page, or to the end of the 4206 * range being removed. 4207 */ 4208 if (nextva > eva) 4209 nextva = eva; 4210 4211 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; 4212 pte2p++, sva += PAGE_SIZE) { 4213 pte2 = pte2_load(pte2p); 4214 if (!pte2_is_valid(pte2)) 4215 continue; 4216 if (pmap_remove_pte2(pmap, pte2p, sva, &free)) 4217 break; 4218 } 4219 } 4220 out: 4221 sched_unpin(); 4222 rw_wunlock(&pvh_global_lock); 4223 PMAP_UNLOCK(pmap); 4224 pmap_free_zero_pages(&free); 4225 } 4226 4227 /* 4228 * Routine: pmap_remove_all 4229 * Function: 4230 * Removes this physical page from 4231 * all physical maps in which it resides. 4232 * Reflects back modify bits to the pager. 4233 * 4234 * Notes: 4235 * Original versions of this routine were very 4236 * inefficient because they iteratively called 4237 * pmap_remove (slow...) 4238 */ 4239 4240 void 4241 pmap_remove_all(vm_page_t m) 4242 { 4243 struct md_page *pvh; 4244 pv_entry_t pv; 4245 pmap_t pmap; 4246 pt2_entry_t *pte2p, opte2; 4247 pt1_entry_t *pte1p; 4248 vm_offset_t va; 4249 struct spglist free; 4250 4251 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4252 ("%s: page %p is not managed", __func__, m)); 4253 SLIST_INIT(&free); 4254 rw_wlock(&pvh_global_lock); 4255 sched_pin(); 4256 if ((m->flags & PG_FICTITIOUS) != 0) 4257 goto small_mappings; 4258 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4259 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4260 va = pv->pv_va; 4261 pmap = PV_PMAP(pv); 4262 PMAP_LOCK(pmap); 4263 pte1p = pmap_pte1(pmap, va); 4264 (void)pmap_demote_pte1(pmap, pte1p, va); 4265 PMAP_UNLOCK(pmap); 4266 } 4267 small_mappings: 4268 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4269 pmap = PV_PMAP(pv); 4270 PMAP_LOCK(pmap); 4271 pmap->pm_stats.resident_count--; 4272 pte1p = pmap_pte1(pmap, pv->pv_va); 4273 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " 4274 "a 1mpage in page %p's pv list", __func__, m)); 4275 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 4276 opte2 = pte2_load_clear(pte2p); 4277 pmap_tlb_flush(pmap, pv->pv_va); 4278 KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", 4279 __func__, pmap, pv->pv_va)); 4280 if (pte2_is_wired(opte2)) 4281 pmap->pm_stats.wired_count--; 4282 if (opte2 & PTE2_A) 4283 vm_page_aflag_set(m, PGA_REFERENCED); 4284 4285 /* 4286 * Update the vm_page_t clean and reference bits. 4287 */ 4288 if (pte2_is_dirty(opte2)) 4289 vm_page_dirty(m); 4290 pmap_unuse_pt2(pmap, pv->pv_va, &free); 4291 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4292 free_pv_entry(pmap, pv); 4293 PMAP_UNLOCK(pmap); 4294 } 4295 vm_page_aflag_clear(m, PGA_WRITEABLE); 4296 sched_unpin(); 4297 rw_wunlock(&pvh_global_lock); 4298 pmap_free_zero_pages(&free); 4299 } 4300 4301 /* 4302 * Just subroutine for pmap_remove_pages() to reasonably satisfy 4303 * good coding style, a.k.a. 80 character line width limit hell. 
4304 */
4305 static __inline void
4306 pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv,
4307 struct spglist *free)
4308 {
4309 vm_paddr_t pa;
4310 vm_page_t m, mt, mpt2pg;
4311 struct md_page *pvh;
4312
4313 pa = pte1_pa(pte1);
4314 m = PHYS_TO_VM_PAGE(pa);
4315
4316 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x",
4317 __func__, m, m->phys_addr, pa));
4318 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4319 m < &vm_page_array[vm_page_array_size],
4320 ("%s: bad pte1 %#x", __func__, pte1));
4321
4322 if (pte1_is_dirty(pte1)) {
4323 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++)
4324 vm_page_dirty(mt);
4325 }
4326
4327 pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE;
4328 pvh = pa_to_pvh(pa);
4329 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4330 if (TAILQ_EMPTY(&pvh->pv_list)) {
4331 for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++)
4332 if (TAILQ_EMPTY(&mt->md.pv_list))
4333 vm_page_aflag_clear(mt, PGA_WRITEABLE);
4334 }
4335 mpt2pg = pmap_pt2_page(pmap, pv->pv_va);
4336 if (mpt2pg != NULL)
4337 pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free);
4338 }
4339
4340 /*
4341 * Just subroutine for pmap_remove_pages() to reasonably satisfy
4342 * good coding style, a.k.a. 80 character line width limit hell.
4343 */
4344 static __inline void
4345 pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv,
4346 struct spglist *free)
4347 {
4348 vm_paddr_t pa;
4349 vm_page_t m;
4350 struct md_page *pvh;
4351
4352 pa = pte2_pa(pte2);
4353 m = PHYS_TO_VM_PAGE(pa);
4354
4355 KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x",
4356 __func__, m, m->phys_addr, pa));
4357 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4358 m < &vm_page_array[vm_page_array_size],
4359 ("%s: bad pte2 %#x", __func__, pte2));
4360
4361 if (pte2_is_dirty(pte2))
4362 vm_page_dirty(m);
4363
4364 pmap->pm_stats.resident_count--;
4365 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4366 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
4367 pvh = pa_to_pvh(pa);
4368 if (TAILQ_EMPTY(&pvh->pv_list))
4369 vm_page_aflag_clear(m, PGA_WRITEABLE);
4370 }
4371 pmap_unuse_pt2(pmap, pv->pv_va, free);
4372 }
4373
4374 /*
4375 * Remove all pages from the specified address space; this aids process
4376 * exit speeds. Also, this code is special cased for the current process
4377 * only, but can have the more generic (and slightly slower) mode enabled.
4378 * This is much faster than pmap_remove() in the case of running down
4379 * an entire address space.
4380 */
4381 void
4382 pmap_remove_pages(pmap_t pmap)
4383 {
4384 pt1_entry_t *pte1p, pte1;
4385 pt2_entry_t *pte2p, pte2;
4386 pv_entry_t pv;
4387 struct pv_chunk *pc, *npc;
4388 struct spglist free;
4389 int field, idx;
4390 int32_t bit;
4391 uint32_t inuse, bitmask;
4392 boolean_t allfree;
4393
4394 /*
4395 * Assert that the given pmap is only active on the current
4396 * CPU. Unfortunately, we cannot block another CPU from
4397 * activating the pmap while this function is executing.
4398 */ 4399 KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), 4400 ("%s: non-current pmap %p", __func__, pmap)); 4401 #if defined(SMP) && defined(INVARIANTS) 4402 { 4403 cpuset_t other_cpus; 4404 4405 sched_pin(); 4406 other_cpus = pmap->pm_active; 4407 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 4408 sched_unpin(); 4409 KASSERT(CPU_EMPTY(&other_cpus), 4410 ("%s: pmap %p active on other cpus", __func__, pmap)); 4411 } 4412 #endif 4413 SLIST_INIT(&free); 4414 rw_wlock(&pvh_global_lock); 4415 PMAP_LOCK(pmap); 4416 sched_pin(); 4417 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4418 KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", 4419 __func__, pmap, pc->pc_pmap)); 4420 allfree = TRUE; 4421 for (field = 0; field < _NPCM; field++) { 4422 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 4423 while (inuse != 0) { 4424 bit = ffs(inuse) - 1; 4425 bitmask = 1UL << bit; 4426 idx = field * 32 + bit; 4427 pv = &pc->pc_pventry[idx]; 4428 inuse &= ~bitmask; 4429 4430 /* 4431 * Note that we cannot remove wired pages 4432 * from a process' mapping at this time 4433 */ 4434 pte1p = pmap_pte1(pmap, pv->pv_va); 4435 pte1 = pte1_load(pte1p); 4436 if (pte1_is_section(pte1)) { 4437 if (pte1_is_wired(pte1)) { 4438 allfree = FALSE; 4439 continue; 4440 } 4441 pte1_clear(pte1p); 4442 pmap_remove_pte1_quick(pmap, pte1, pv, 4443 &free); 4444 } 4445 else if (pte1_is_link(pte1)) { 4446 pte2p = pt2map_entry(pv->pv_va); 4447 pte2 = pte2_load(pte2p); 4448 4449 if (!pte2_is_valid(pte2)) { 4450 printf("%s: pmap %p va %#x " 4451 "pte2 %#x\n", __func__, 4452 pmap, pv->pv_va, pte2); 4453 panic("bad pte2"); 4454 } 4455 4456 if (pte2_is_wired(pte2)) { 4457 allfree = FALSE; 4458 continue; 4459 } 4460 pte2_clear(pte2p); 4461 pmap_remove_pte2_quick(pmap, pte2, pv, 4462 &free); 4463 } else { 4464 printf("%s: pmap %p va %#x pte1 %#x\n", 4465 __func__, pmap, pv->pv_va, pte1); 4466 panic("bad pte1"); 4467 } 4468 4469 /* Mark free */ 4470 PV_STAT(pv_entry_frees++); 4471 PV_STAT(pv_entry_spare++); 4472 pv_entry_count--; 4473 pc->pc_map[field] |= bitmask; 4474 } 4475 } 4476 if (allfree) { 4477 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4478 free_pv_chunk(pc); 4479 } 4480 } 4481 tlb_flush_all_ng_local(); 4482 sched_unpin(); 4483 rw_wunlock(&pvh_global_lock); 4484 PMAP_UNLOCK(pmap); 4485 pmap_free_zero_pages(&free); 4486 } 4487 4488 /* 4489 * This code makes some *MAJOR* assumptions: 4490 * 1. Current pmap & pmap exists. 4491 * 2. Not wired. 4492 * 3. Read access. 4493 * 4. No L2 page table pages. 4494 * but is *MUCH* faster than pmap_enter... 4495 */ 4496 static vm_page_t 4497 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4498 vm_prot_t prot, vm_page_t mpt2pg) 4499 { 4500 pt2_entry_t *pte2p, pte2; 4501 vm_paddr_t pa; 4502 struct spglist free; 4503 uint32_t l2prot; 4504 4505 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 4506 (m->oflags & VPO_UNMANAGED) != 0, 4507 ("%s: managed mapping within the clean submap", __func__)); 4508 rw_assert(&pvh_global_lock, RA_WLOCKED); 4509 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4510 4511 /* 4512 * In the case that a L2 page table page is not 4513 * resident, we are creating it here. 4514 */ 4515 if (va < VM_MAXUSER_ADDRESS) { 4516 u_int pte1_idx; 4517 pt1_entry_t pte1, *pte1p; 4518 vm_paddr_t pt2_pa; 4519 4520 /* 4521 * Get L1 page table things. 
4522 */
4523 pte1_idx = pte1_index(va);
4524 pte1p = pmap_pte1(pmap, va);
4525 pte1 = pte1_load(pte1p);
4526
4527 if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) {
4528 /*
4529 * Each of NPT2_IN_PG L2 page tables on the page can
4530 * come here. Make sure that associated L1 page table
4531 * link is established.
4532 *
4533 * QQQ: It turns out that we don't establish all links to
4534 * L2 page tables for a newly allocated L2 page
4535 * table page.
4536 */
4537 KASSERT(!pte1_is_section(pte1),
4538 ("%s: pte1 %#x is section", __func__, pte1));
4539 if (!pte1_is_link(pte1)) {
4540 pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg),
4541 pte1_idx);
4542 pte1_store(pte1p, PTE1_LINK(pt2_pa));
4543 }
4544 pt2_wirecount_inc(mpt2pg, pte1_idx);
4545 } else {
4546 /*
4547 * If the L2 page table page is mapped, we just
4548 * increment the hold count, and activate it.
4549 */
4550 if (pte1_is_section(pte1)) {
4551 return (NULL);
4552 } else if (pte1_is_link(pte1)) {
4553 mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
4554 pt2_wirecount_inc(mpt2pg, pte1_idx);
4555 } else {
4556 mpt2pg = _pmap_allocpte2(pmap, va,
4557 PMAP_ENTER_NOSLEEP);
4558 if (mpt2pg == NULL)
4559 return (NULL);
4560 }
4561 }
4562 } else {
4563 mpt2pg = NULL;
4564 }
4565
4566 /*
4567 * This call to pt2map_entry() makes the assumption that we are
4568 * entering the page into the current pmap. In order to support
4569 * quick entry into any pmap, one would likely use pmap_pte2_quick().
4570 * But that isn't as quick as pt2map_entry().
4571 */
4572 pte2p = pt2map_entry(va);
4573 pte2 = pte2_load(pte2p);
4574 if (pte2_is_valid(pte2)) {
4575 if (mpt2pg != NULL) {
4576 /*
4577 * Remove extra pte2 reference
4578 */
4579 pt2_wirecount_dec(mpt2pg, pte1_index(va));
4580 mpt2pg = NULL;
4581 }
4582 return (NULL);
4583 }
4584
4585 /*
4586 * Enter on the PV list if part of our managed memory.
4587 */
4588 if ((m->oflags & VPO_UNMANAGED) == 0 &&
4589 !pmap_try_insert_pv_entry(pmap, va, m)) {
4590 if (mpt2pg != NULL) {
4591 SLIST_INIT(&free);
4592 if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) {
4593 pmap_tlb_flush(pmap, va);
4594 pmap_free_zero_pages(&free);
4595 }
4596
4597 mpt2pg = NULL;
4598 }
4599 return (NULL);
4600 }
4601
4602 /*
4603 * Increment counters
4604 */
4605 pmap->pm_stats.resident_count++;
4606
4607 /*
4608 * Now validate mapping with RO protection
4609 */
4610 pa = VM_PAGE_TO_PHYS(m);
4611 l2prot = PTE2_RO | PTE2_NM;
4612 if (va < VM_MAXUSER_ADDRESS)
4613 l2prot |= PTE2_U | PTE2_NG;
4614 if ((prot & VM_PROT_EXECUTE) == 0)
4615 l2prot |= PTE2_NX;
4616 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) {
4617 /*
4618 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA
4619 * is set. QQQ: For more info, see comments in pmap_enter().
4620 */
4621 cache_icache_sync_fresh(va, pa, PAGE_SIZE);
4622 }
4623 pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m)));
4624
4625 return (mpt2pg);
4626 }
4627
4628 void
4629 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4630 {
4631
4632 rw_wlock(&pvh_global_lock);
4633 PMAP_LOCK(pmap);
4634 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
4635 rw_wunlock(&pvh_global_lock);
4636 PMAP_UNLOCK(pmap);
4637 }
4638
4639 /*
4640 * Tries to create a 1MB page mapping. Returns TRUE if successful and
4641 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without
4642 * blocking, (2) a mapping already exists at the specified virtual address, or
4643 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
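 *
 * A sketch of the caller-side precondition check, as done in
 * pmap_enter_object() below (only an illustration; the names are the
 * ones used in this file):
 *
 *	if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end &&
 *	    m->psind == 1 && sp_enabled &&
 *	    pmap_enter_pte1(pmap, va, m, prot))
 *		m = &m[PTE1_SIZE / PAGE_SIZE - 1];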
4644 */ 4645 static boolean_t 4646 pmap_enter_pte1(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4647 { 4648 pt1_entry_t *pte1p; 4649 vm_paddr_t pa; 4650 uint32_t l1prot; 4651 4652 rw_assert(&pvh_global_lock, RA_WLOCKED); 4653 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4654 pte1p = pmap_pte1(pmap, va); 4655 if (pte1_is_valid(pte1_load(pte1p))) { 4656 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, 4657 va, pmap); 4658 return (FALSE); 4659 } 4660 if ((m->oflags & VPO_UNMANAGED) == 0) { 4661 /* 4662 * Abort this mapping if its PV entry could not be created. 4663 */ 4664 if (!pmap_pv_insert_pte1(pmap, va, VM_PAGE_TO_PHYS(m))) { 4665 CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", 4666 __func__, va, pmap); 4667 return (FALSE); 4668 } 4669 } 4670 /* 4671 * Increment counters. 4672 */ 4673 pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; 4674 4675 /* 4676 * Map the section. 4677 * 4678 * QQQ: Why VM_PROT_WRITE is not evaluated and the mapping is 4679 * made readonly? 4680 */ 4681 pa = VM_PAGE_TO_PHYS(m); 4682 l1prot = PTE1_RO | PTE1_NM; 4683 if (va < VM_MAXUSER_ADDRESS) 4684 l1prot |= PTE1_U | PTE1_NG; 4685 if ((prot & VM_PROT_EXECUTE) == 0) 4686 l1prot |= PTE1_NX; 4687 else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { 4688 /* 4689 * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA 4690 * is set. QQQ: For more info, see comments in pmap_enter(). 4691 */ 4692 cache_icache_sync_fresh(va, pa, PTE1_SIZE); 4693 } 4694 pte1_store(pte1p, PTE1(pa, l1prot, ATTR_TO_L1(vm_page_pte2_attr(m)))); 4695 4696 pmap_pte1_mappings++; 4697 CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, 4698 pmap); 4699 return (TRUE); 4700 } 4701 4702 /* 4703 * Maps a sequence of resident pages belonging to the same object. 4704 * The sequence begins with the given page m_start. This page is 4705 * mapped at the given virtual address start. Each subsequent page is 4706 * mapped at a virtual address that is offset from start by the same 4707 * amount as the page is offset from m_start within the object. The 4708 * last page in the sequence is the page with the largest offset from 4709 * m_start that can be mapped at a virtual address less than the given 4710 * virtual address end. Not every virtual page between start and end 4711 * is mapped; only those for which a resident page exists with the 4712 * corresponding offset from m_start are mapped. 4713 */ 4714 void 4715 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4716 vm_page_t m_start, vm_prot_t prot) 4717 { 4718 vm_offset_t va; 4719 vm_page_t m, mpt2pg; 4720 vm_pindex_t diff, psize; 4721 4722 PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", 4723 __func__, pmap, start, end, m_start, prot)); 4724 4725 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4726 psize = atop(end - start); 4727 mpt2pg = NULL; 4728 m = m_start; 4729 rw_wlock(&pvh_global_lock); 4730 PMAP_LOCK(pmap); 4731 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4732 va = start + ptoa(diff); 4733 if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && 4734 m->psind == 1 && sp_enabled && 4735 pmap_enter_pte1(pmap, va, m, prot)) 4736 m = &m[PTE1_SIZE / PAGE_SIZE - 1]; 4737 else 4738 mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, 4739 mpt2pg); 4740 m = TAILQ_NEXT(m, listq); 4741 } 4742 rw_wunlock(&pvh_global_lock); 4743 PMAP_UNLOCK(pmap); 4744 } 4745 4746 /* 4747 * This code maps large physical mmap regions into the 4748 * processor address space. 
Note that some shortcuts
4749 * are taken, but the code works.
4750 */
4751 void
4752 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4753 vm_pindex_t pindex, vm_size_t size)
4754 {
4755 pt1_entry_t *pte1p;
4756 vm_paddr_t pa, pte2_pa;
4757 vm_page_t p;
4758 vm_memattr_t pat_mode;
4759 u_int l1attr, l1prot;
4760
4761 VM_OBJECT_ASSERT_WLOCKED(object);
4762 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4763 ("%s: non-device object", __func__));
4764 if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) {
4765 if (!vm_object_populate(object, pindex, pindex + atop(size)))
4766 return;
4767 p = vm_page_lookup(object, pindex);
4768 KASSERT(p->valid == VM_PAGE_BITS_ALL,
4769 ("%s: invalid page %p", __func__, p));
4770 pat_mode = p->md.pat_mode;
4771
4772 /*
4773 * Abort the mapping if the first page is not physically
4774 * aligned to a 1MB page boundary.
4775 */
4776 pte2_pa = VM_PAGE_TO_PHYS(p);
4777 if (pte2_pa & PTE1_OFFSET)
4778 return;
4779
4780 /*
4781 * Skip the first page. Abort the mapping if the rest of
4782 * the pages are not physically contiguous or have differing
4783 * memory attributes.
4784 */
4785 p = TAILQ_NEXT(p, listq);
4786 for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size;
4787 pa += PAGE_SIZE) {
4788 KASSERT(p->valid == VM_PAGE_BITS_ALL,
4789 ("%s: invalid page %p", __func__, p));
4790 if (pa != VM_PAGE_TO_PHYS(p) ||
4791 pat_mode != p->md.pat_mode)
4792 return;
4793 p = TAILQ_NEXT(p, listq);
4794 }
4795
4796 /*
4797 * Map using 1MB pages.
4798 *
4799 * QQQ: Well, we are mapping a section, so the same conditions must
4800 * hold as during promotion. It looks like only RW mappings
4801 * are done here, so read-only mappings must be done elsewhere.
4802 */
4803 l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A;
4804 l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode));
4805 PMAP_LOCK(pmap);
4806 for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) {
4807 pte1p = pmap_pte1(pmap, addr);
4808 if (!pte1_is_valid(pte1_load(pte1p))) {
4809 pte1_store(pte1p, PTE1(pa, l1prot, l1attr));
4810 pmap->pm_stats.resident_count += PTE1_SIZE /
4811 PAGE_SIZE;
4812 pmap_pte1_mappings++;
4813 }
4814 /* Else continue on if the PTE1 is already valid. */
4815 addr += PTE1_SIZE;
4816 }
4817 PMAP_UNLOCK(pmap);
4818 }
4819 }
4820
4821 /*
4822 * Do the things to protect a 1mpage in a process.
4823 */
4824 static void
4825 pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva,
4826 vm_prot_t prot)
4827 {
4828 pt1_entry_t npte1, opte1;
4829 vm_offset_t eva, va;
4830 vm_page_t m;
4831
4832 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4833 KASSERT((sva & PTE1_OFFSET) == 0,
4834 ("%s: sva is not 1mpage aligned", __func__));
4835
4836 opte1 = npte1 = pte1_load(pte1p);
4837 if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) {
4838 eva = sva + PTE1_SIZE;
4839 for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1));
4840 va < eva; va += PAGE_SIZE, m++)
4841 vm_page_dirty(m);
4842 }
4843 if ((prot & VM_PROT_WRITE) == 0)
4844 npte1 |= PTE1_RO | PTE1_NM;
4845 if ((prot & VM_PROT_EXECUTE) == 0)
4846 npte1 |= PTE1_NX;
4847
4848 /*
4849 * QQQ: Herein, execute permission is never set.
4850 * It can only be cleared. So, no icache
4851 * syncing is needed.
4852 */
4853
4854 if (npte1 != opte1) {
4855 pte1_store(pte1p, npte1);
4856 pmap_tlb_flush(pmap, sva);
4857 }
4858 }
4859
4860 /*
4861 * Set the physical protection on the
4862 * specified range of this map as requested.
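 *
 * An illustrative (hypothetical) call that write protects a range
 * while leaving it readable and executable:
 *
 *	pmap_protect(pmap, sva, eva, VM_PROT_READ | VM_PROT_EXECUTE);
 *
 * VM_PROT_NONE is handled by calling pmap_remove(), and a request
 * that takes no permission away returns immediately.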
4863 */ 4864 void 4865 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4866 { 4867 boolean_t pv_lists_locked; 4868 vm_offset_t nextva; 4869 pt1_entry_t *pte1p, pte1; 4870 pt2_entry_t *pte2p, opte2, npte2; 4871 4872 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4873 if (prot == VM_PROT_NONE) { 4874 pmap_remove(pmap, sva, eva); 4875 return; 4876 } 4877 4878 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 4879 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 4880 return; 4881 4882 if (pmap_is_current(pmap)) 4883 pv_lists_locked = FALSE; 4884 else { 4885 pv_lists_locked = TRUE; 4886 resume: 4887 rw_wlock(&pvh_global_lock); 4888 sched_pin(); 4889 } 4890 4891 PMAP_LOCK(pmap); 4892 for (; sva < eva; sva = nextva) { 4893 /* 4894 * Calculate address for next L2 page table. 4895 */ 4896 nextva = pte1_trunc(sva + PTE1_SIZE); 4897 if (nextva < sva) 4898 nextva = eva; 4899 4900 pte1p = pmap_pte1(pmap, sva); 4901 pte1 = pte1_load(pte1p); 4902 4903 /* 4904 * Weed out invalid mappings. Note: we assume that L1 page 4905 * page table is always allocated, and in kernel virtual. 4906 */ 4907 if (pte1 == 0) 4908 continue; 4909 4910 if (pte1_is_section(pte1)) { 4911 /* 4912 * Are we protecting the entire large page? If not, 4913 * demote the mapping and fall through. 4914 */ 4915 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 4916 pmap_protect_pte1(pmap, pte1p, sva, prot); 4917 continue; 4918 } else { 4919 if (!pv_lists_locked) { 4920 pv_lists_locked = TRUE; 4921 if (!rw_try_wlock(&pvh_global_lock)) { 4922 PMAP_UNLOCK(pmap); 4923 goto resume; 4924 } 4925 sched_pin(); 4926 } 4927 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 4928 /* 4929 * The large page mapping 4930 * was destroyed. 4931 */ 4932 continue; 4933 } 4934 #ifdef INVARIANTS 4935 else { 4936 /* Update pte1 after demotion */ 4937 pte1 = pte1_load(pte1p); 4938 } 4939 #endif 4940 } 4941 } 4942 4943 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 4944 " is not link", __func__, pmap, sva, pte1, pte1p)); 4945 4946 /* 4947 * Limit our scan to either the end of the va represented 4948 * by the current L2 page table page, or to the end of the 4949 * range being protected. 4950 */ 4951 if (nextva > eva) 4952 nextva = eva; 4953 4954 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 4955 sva += PAGE_SIZE) { 4956 vm_page_t m; 4957 4958 opte2 = npte2 = pte2_load(pte2p); 4959 if (!pte2_is_valid(opte2)) 4960 continue; 4961 4962 if ((prot & VM_PROT_WRITE) == 0) { 4963 if (pte2_is_managed(opte2) && 4964 pte2_is_dirty(opte2)) { 4965 m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); 4966 vm_page_dirty(m); 4967 } 4968 npte2 |= PTE2_RO | PTE2_NM; 4969 } 4970 4971 if ((prot & VM_PROT_EXECUTE) == 0) 4972 npte2 |= PTE2_NX; 4973 4974 /* 4975 * QQQ: Herein, execute permission is never set. 4976 * It only can be cleared. So, no icache 4977 * syncing is needed. 4978 */ 4979 4980 if (npte2 != opte2) { 4981 pte2_store(pte2p, npte2); 4982 pmap_tlb_flush(pmap, sva); 4983 } 4984 } 4985 } 4986 if (pv_lists_locked) { 4987 sched_unpin(); 4988 rw_wunlock(&pvh_global_lock); 4989 } 4990 PMAP_UNLOCK(pmap); 4991 } 4992 4993 /* 4994 * pmap_pvh_wired_mappings: 4995 * 4996 * Return the updated number "count" of managed mappings that are wired. 
4997 */ 4998 static int 4999 pmap_pvh_wired_mappings(struct md_page *pvh, int count) 5000 { 5001 pmap_t pmap; 5002 pt1_entry_t pte1; 5003 pt2_entry_t pte2; 5004 pv_entry_t pv; 5005 5006 rw_assert(&pvh_global_lock, RA_WLOCKED); 5007 sched_pin(); 5008 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5009 pmap = PV_PMAP(pv); 5010 PMAP_LOCK(pmap); 5011 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5012 if (pte1_is_section(pte1)) { 5013 if (pte1_is_wired(pte1)) 5014 count++; 5015 } else { 5016 KASSERT(pte1_is_link(pte1), 5017 ("%s: pte1 %#x is not link", __func__, pte1)); 5018 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5019 if (pte2_is_wired(pte2)) 5020 count++; 5021 } 5022 PMAP_UNLOCK(pmap); 5023 } 5024 sched_unpin(); 5025 return (count); 5026 } 5027 5028 /* 5029 * pmap_page_wired_mappings: 5030 * 5031 * Return the number of managed mappings to the given physical page 5032 * that are wired. 5033 */ 5034 int 5035 pmap_page_wired_mappings(vm_page_t m) 5036 { 5037 int count; 5038 5039 count = 0; 5040 if ((m->oflags & VPO_UNMANAGED) != 0) 5041 return (count); 5042 rw_wlock(&pvh_global_lock); 5043 count = pmap_pvh_wired_mappings(&m->md, count); 5044 if ((m->flags & PG_FICTITIOUS) == 0) { 5045 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 5046 count); 5047 } 5048 rw_wunlock(&pvh_global_lock); 5049 return (count); 5050 } 5051 5052 /* 5053 * Returns TRUE if any of the given mappings were used to modify 5054 * physical memory. Otherwise, returns FALSE. Both page and 1mpage 5055 * mappings are supported. 5056 */ 5057 static boolean_t 5058 pmap_is_modified_pvh(struct md_page *pvh) 5059 { 5060 pv_entry_t pv; 5061 pt1_entry_t pte1; 5062 pt2_entry_t pte2; 5063 pmap_t pmap; 5064 boolean_t rv; 5065 5066 rw_assert(&pvh_global_lock, RA_WLOCKED); 5067 rv = FALSE; 5068 sched_pin(); 5069 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5070 pmap = PV_PMAP(pv); 5071 PMAP_LOCK(pmap); 5072 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5073 if (pte1_is_section(pte1)) { 5074 rv = pte1_is_dirty(pte1); 5075 } else { 5076 KASSERT(pte1_is_link(pte1), 5077 ("%s: pte1 %#x is not link", __func__, pte1)); 5078 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5079 rv = pte2_is_dirty(pte2); 5080 } 5081 PMAP_UNLOCK(pmap); 5082 if (rv) 5083 break; 5084 } 5085 sched_unpin(); 5086 return (rv); 5087 } 5088 5089 /* 5090 * pmap_is_modified: 5091 * 5092 * Return whether or not the specified physical page was modified 5093 * in any physical maps. 5094 */ 5095 boolean_t 5096 pmap_is_modified(vm_page_t m) 5097 { 5098 boolean_t rv; 5099 5100 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5101 ("%s: page %p is not managed", __func__, m)); 5102 5103 /* 5104 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5105 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 5106 * is clear, no PTE2s can have PG_M set. 5107 */ 5108 VM_OBJECT_ASSERT_WLOCKED(m->object); 5109 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5110 return (FALSE); 5111 rw_wlock(&pvh_global_lock); 5112 rv = pmap_is_modified_pvh(&m->md) || 5113 ((m->flags & PG_FICTITIOUS) == 0 && 5114 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5115 rw_wunlock(&pvh_global_lock); 5116 return (rv); 5117 } 5118 5119 /* 5120 * pmap_is_prefaultable: 5121 * 5122 * Return whether or not the specified virtual address is eligible 5123 * for prefault. 
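 *
 * A hypothetical caller, modeled loosely on the fault handler's
 * prefault path, might use it like this:
 *
 *	if (pmap_is_prefaultable(pmap, addr))
 *		pmap_enter_quick(pmap, addr, m, prot);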
5124 */ 5125 boolean_t 5126 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5127 { 5128 pt1_entry_t pte1; 5129 pt2_entry_t pte2; 5130 boolean_t rv; 5131 5132 rv = FALSE; 5133 PMAP_LOCK(pmap); 5134 pte1 = pte1_load(pmap_pte1(pmap, addr)); 5135 if (pte1_is_link(pte1)) { 5136 pte2 = pte2_load(pt2map_entry(addr)); 5137 rv = !pte2_is_valid(pte2) ; 5138 } 5139 PMAP_UNLOCK(pmap); 5140 return (rv); 5141 } 5142 5143 /* 5144 * Returns TRUE if any of the given mappings were referenced and FALSE 5145 * otherwise. Both page and 1mpage mappings are supported. 5146 */ 5147 static boolean_t 5148 pmap_is_referenced_pvh(struct md_page *pvh) 5149 { 5150 5151 pv_entry_t pv; 5152 pt1_entry_t pte1; 5153 pt2_entry_t pte2; 5154 pmap_t pmap; 5155 boolean_t rv; 5156 5157 rw_assert(&pvh_global_lock, RA_WLOCKED); 5158 rv = FALSE; 5159 sched_pin(); 5160 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5161 pmap = PV_PMAP(pv); 5162 PMAP_LOCK(pmap); 5163 pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); 5164 if (pte1_is_section(pte1)) { 5165 rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); 5166 } else { 5167 pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); 5168 rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); 5169 } 5170 PMAP_UNLOCK(pmap); 5171 if (rv) 5172 break; 5173 } 5174 sched_unpin(); 5175 return (rv); 5176 } 5177 5178 /* 5179 * pmap_is_referenced: 5180 * 5181 * Return whether or not the specified physical page was referenced 5182 * in any physical maps. 5183 */ 5184 boolean_t 5185 pmap_is_referenced(vm_page_t m) 5186 { 5187 boolean_t rv; 5188 5189 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5190 ("%s: page %p is not managed", __func__, m)); 5191 rw_wlock(&pvh_global_lock); 5192 rv = pmap_is_referenced_pvh(&m->md) || 5193 ((m->flags & PG_FICTITIOUS) == 0 && 5194 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 5195 rw_wunlock(&pvh_global_lock); 5196 return (rv); 5197 } 5198 5199 /* 5200 * pmap_ts_referenced: 5201 * 5202 * Return a count of reference bits for a page, clearing those bits. 5203 * It is not necessary for every reference bit to be cleared, but it 5204 * is necessary that 0 only be returned when there are truly no 5205 * reference bits set. 5206 * 5207 * As an optimization, update the page's dirty field if a modified bit is 5208 * found while counting reference bits. This opportunistic update can be 5209 * performed at low cost and can eliminate the need for some future calls 5210 * to pmap_is_modified(). However, since this function stops after 5211 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 5212 * dirty pages. Those dirty pages will only be detected by a future call 5213 * to pmap_is_modified(). 
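 *
 * A hypothetical caller, loosely modeled on the page daemon's
 * active queue scan, only needs the returned count:
 *
 *	act_delta = pmap_ts_referenced(m);
 *	if (act_delta != 0)
 *		... treat the page as recently referenced ...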
5214 */ 5215 int 5216 pmap_ts_referenced(vm_page_t m) 5217 { 5218 struct md_page *pvh; 5219 pv_entry_t pv, pvf; 5220 pmap_t pmap; 5221 pt1_entry_t *pte1p, opte1; 5222 pt2_entry_t *pte2p, opte2; 5223 vm_paddr_t pa; 5224 int rtval = 0; 5225 5226 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5227 ("%s: page %p is not managed", __func__, m)); 5228 pa = VM_PAGE_TO_PHYS(m); 5229 pvh = pa_to_pvh(pa); 5230 rw_wlock(&pvh_global_lock); 5231 sched_pin(); 5232 if ((m->flags & PG_FICTITIOUS) != 0 || 5233 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5234 goto small_mappings; 5235 pv = pvf; 5236 do { 5237 pmap = PV_PMAP(pv); 5238 PMAP_LOCK(pmap); 5239 pte1p = pmap_pte1(pmap, pv->pv_va); 5240 opte1 = pte1_load(pte1p); 5241 if (pte1_is_dirty(opte1)) { 5242 /* 5243 * Although "opte1" is mapping a 1MB page, because 5244 * this function is called at a 4KB page granularity, 5245 * we only update the 4KB page under test. 5246 */ 5247 vm_page_dirty(m); 5248 } 5249 if ((opte1 & PTE1_A) != 0) { 5250 /* 5251 * Since this reference bit is shared by 256 4KB pages, 5252 * it should not be cleared every time it is tested. 5253 * Apply a simple "hash" function on the physical page 5254 * number, the virtual section number, and the pmap 5255 * address to select one 4KB page out of the 256 5256 * on which testing the reference bit will result 5257 * in clearing that bit. This function is designed 5258 * to avoid the selection of the same 4KB page 5259 * for every 1MB page mapping. 5260 * 5261 * On demotion, a mapping that hasn't been referenced 5262 * is simply destroyed. To avoid the possibility of a 5263 * subsequent page fault on a demoted wired mapping, 5264 * always leave its reference bit set. Moreover, 5265 * since the section is wired, the current state of 5266 * its reference bit won't affect page replacement. 5267 */ 5268 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ 5269 (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && 5270 !pte1_is_wired(opte1)) { 5271 pte1_clear_bit(pte1p, PTE1_A); 5272 pmap_tlb_flush(pmap, pv->pv_va); 5273 } 5274 rtval++; 5275 } 5276 PMAP_UNLOCK(pmap); 5277 /* Rotate the PV list if it has more than one entry. */ 5278 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5279 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5280 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5281 } 5282 if (rtval >= PMAP_TS_REFERENCED_MAX) 5283 goto out; 5284 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5285 small_mappings: 5286 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5287 goto out; 5288 pv = pvf; 5289 do { 5290 pmap = PV_PMAP(pv); 5291 PMAP_LOCK(pmap); 5292 pte1p = pmap_pte1(pmap, pv->pv_va); 5293 KASSERT(pte1_is_link(pte1_load(pte1p)), 5294 ("%s: not found a link in page %p's pv list", __func__, m)); 5295 5296 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5297 opte2 = pte2_load(pte2p); 5298 if (pte2_is_dirty(opte2)) 5299 vm_page_dirty(m); 5300 if ((opte2 & PTE2_A) != 0) { 5301 pte2_clear_bit(pte2p, PTE2_A); 5302 pmap_tlb_flush(pmap, pv->pv_va); 5303 rtval++; 5304 } 5305 PMAP_UNLOCK(pmap); 5306 /* Rotate the PV list if it has more than one entry. */ 5307 if (TAILQ_NEXT(pv, pv_next) != NULL) { 5308 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5309 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5310 } 5311 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 5312 PMAP_TS_REFERENCED_MAX); 5313 out: 5314 sched_unpin(); 5315 rw_wunlock(&pvh_global_lock); 5316 return (rtval); 5317 } 5318 5319 /* 5320 * Clear the wired attribute from the mappings for the specified range of 5321 * addresses in the given pmap. 
Every valid mapping within that range 5322 * must have the wired attribute set. In contrast, invalid mappings 5323 * cannot have the wired attribute set, so they are ignored. 5324 * 5325 * The wired attribute of the page table entry is not a hardware feature, 5326 * so there is no need to invalidate any TLB entries. 5327 */ 5328 void 5329 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5330 { 5331 vm_offset_t nextva; 5332 pt1_entry_t *pte1p, pte1; 5333 pt2_entry_t *pte2p, pte2; 5334 boolean_t pv_lists_locked; 5335 5336 if (pmap_is_current(pmap)) 5337 pv_lists_locked = FALSE; 5338 else { 5339 pv_lists_locked = TRUE; 5340 resume: 5341 rw_wlock(&pvh_global_lock); 5342 sched_pin(); 5343 } 5344 PMAP_LOCK(pmap); 5345 for (; sva < eva; sva = nextva) { 5346 nextva = pte1_trunc(sva + PTE1_SIZE); 5347 if (nextva < sva) 5348 nextva = eva; 5349 5350 pte1p = pmap_pte1(pmap, sva); 5351 pte1 = pte1_load(pte1p); 5352 5353 /* 5354 * Weed out invalid mappings. Note: we assume that L1 page 5355 * page table is always allocated, and in kernel virtual. 5356 */ 5357 if (pte1 == 0) 5358 continue; 5359 5360 if (pte1_is_section(pte1)) { 5361 if (!pte1_is_wired(pte1)) 5362 panic("%s: pte1 %#x not wired", __func__, pte1); 5363 5364 /* 5365 * Are we unwiring the entire large page? If not, 5366 * demote the mapping and fall through. 5367 */ 5368 if (sva + PTE1_SIZE == nextva && eva >= nextva) { 5369 pte1_clear_bit(pte1p, PTE1_W); 5370 pmap->pm_stats.wired_count -= PTE1_SIZE / 5371 PAGE_SIZE; 5372 continue; 5373 } else { 5374 if (!pv_lists_locked) { 5375 pv_lists_locked = TRUE; 5376 if (!rw_try_wlock(&pvh_global_lock)) { 5377 PMAP_UNLOCK(pmap); 5378 /* Repeat sva. */ 5379 goto resume; 5380 } 5381 sched_pin(); 5382 } 5383 if (!pmap_demote_pte1(pmap, pte1p, sva)) 5384 panic("%s: demotion failed", __func__); 5385 #ifdef INVARIANTS 5386 else { 5387 /* Update pte1 after demotion */ 5388 pte1 = pte1_load(pte1p); 5389 } 5390 #endif 5391 } 5392 } 5393 5394 KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" 5395 " is not link", __func__, pmap, sva, pte1, pte1p)); 5396 5397 /* 5398 * Limit our scan to either the end of the va represented 5399 * by the current L2 page table page, or to the end of the 5400 * range being protected. 5401 */ 5402 if (nextva > eva) 5403 nextva = eva; 5404 5405 for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, 5406 sva += PAGE_SIZE) { 5407 pte2 = pte2_load(pte2p); 5408 if (!pte2_is_valid(pte2)) 5409 continue; 5410 if (!pte2_is_wired(pte2)) 5411 panic("%s: pte2 %#x is missing PTE2_W", 5412 __func__, pte2); 5413 5414 /* 5415 * PTE2_W must be cleared atomically. Although the pmap 5416 * lock synchronizes access to PTE2_W, another processor 5417 * could be changing PTE2_NM and/or PTE2_A concurrently. 5418 */ 5419 pte2_clear_bit(pte2p, PTE2_W); 5420 pmap->pm_stats.wired_count--; 5421 } 5422 } 5423 if (pv_lists_locked) { 5424 sched_unpin(); 5425 rw_wunlock(&pvh_global_lock); 5426 } 5427 PMAP_UNLOCK(pmap); 5428 } 5429 5430 /* 5431 * Clear the write and modified bits in each of the given page's mappings. 5432 */ 5433 void 5434 pmap_remove_write(vm_page_t m) 5435 { 5436 struct md_page *pvh; 5437 pv_entry_t next_pv, pv; 5438 pmap_t pmap; 5439 pt1_entry_t *pte1p; 5440 pt2_entry_t *pte2p, opte2; 5441 vm_offset_t va; 5442 5443 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5444 ("%s: page %p is not managed", __func__, m)); 5445 5446 /* 5447 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5448 * set by another thread while the object is locked. 
Thus, 5449 * if PGA_WRITEABLE is clear, no page table entries need updating. 5450 */ 5451 VM_OBJECT_ASSERT_WLOCKED(m->object); 5452 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5453 return; 5454 rw_wlock(&pvh_global_lock); 5455 sched_pin(); 5456 if ((m->flags & PG_FICTITIOUS) != 0) 5457 goto small_mappings; 5458 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5459 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5460 va = pv->pv_va; 5461 pmap = PV_PMAP(pv); 5462 PMAP_LOCK(pmap); 5463 pte1p = pmap_pte1(pmap, va); 5464 if (!(pte1_load(pte1p) & PTE1_RO)) 5465 (void)pmap_demote_pte1(pmap, pte1p, va); 5466 PMAP_UNLOCK(pmap); 5467 } 5468 small_mappings: 5469 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5470 pmap = PV_PMAP(pv); 5471 PMAP_LOCK(pmap); 5472 pte1p = pmap_pte1(pmap, pv->pv_va); 5473 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" 5474 " a section in page %p's pv list", __func__, m)); 5475 pte2p = pmap_pte2_quick(pmap, pv->pv_va); 5476 opte2 = pte2_load(pte2p); 5477 if (!(opte2 & PTE2_RO)) { 5478 pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); 5479 if (pte2_is_dirty(opte2)) 5480 vm_page_dirty(m); 5481 pmap_tlb_flush(pmap, pv->pv_va); 5482 } 5483 PMAP_UNLOCK(pmap); 5484 } 5485 vm_page_aflag_clear(m, PGA_WRITEABLE); 5486 sched_unpin(); 5487 rw_wunlock(&pvh_global_lock); 5488 } 5489 5490 /* 5491 * Apply the given advice to the specified range of addresses within the 5492 * given pmap. Depending on the advice, clear the referenced and/or 5493 * modified flags in each mapping and set the mapped page's dirty field. 5494 */ 5495 void 5496 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5497 { 5498 pt1_entry_t *pte1p, opte1; 5499 pt2_entry_t *pte2p, pte2; 5500 vm_offset_t pdnxt; 5501 vm_page_t m; 5502 boolean_t pv_lists_locked; 5503 5504 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5505 return; 5506 if (pmap_is_current(pmap)) 5507 pv_lists_locked = FALSE; 5508 else { 5509 pv_lists_locked = TRUE; 5510 resume: 5511 rw_wlock(&pvh_global_lock); 5512 sched_pin(); 5513 } 5514 PMAP_LOCK(pmap); 5515 for (; sva < eva; sva = pdnxt) { 5516 pdnxt = pte1_trunc(sva + PTE1_SIZE); 5517 if (pdnxt < sva) 5518 pdnxt = eva; 5519 pte1p = pmap_pte1(pmap, sva); 5520 opte1 = pte1_load(pte1p); 5521 if (!pte1_is_valid(opte1)) /* XXX */ 5522 continue; 5523 else if (pte1_is_section(opte1)) { 5524 if (!pte1_is_managed(opte1)) 5525 continue; 5526 if (!pv_lists_locked) { 5527 pv_lists_locked = TRUE; 5528 if (!rw_try_wlock(&pvh_global_lock)) { 5529 PMAP_UNLOCK(pmap); 5530 goto resume; 5531 } 5532 sched_pin(); 5533 } 5534 if (!pmap_demote_pte1(pmap, pte1p, sva)) { 5535 /* 5536 * The large page mapping was destroyed. 5537 */ 5538 continue; 5539 } 5540 5541 /* 5542 * Unless the page mappings are wired, remove the 5543 * mapping to a single page so that a subsequent 5544 * access may repromote. Since the underlying L2 page 5545 * table is fully populated, this removal never 5546 * frees a L2 page table page. 
5547 */
5548 if (!pte1_is_wired(opte1)) {
5549 pte2p = pmap_pte2_quick(pmap, sva);
5550 KASSERT(pte2_is_valid(pte2_load(pte2p)),
5551 ("%s: invalid PTE2", __func__));
5552 pmap_remove_pte2(pmap, pte2p, sva, NULL);
5553 }
5554 }
5555 if (pdnxt > eva)
5556 pdnxt = eva;
5557 for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++,
5558 sva += PAGE_SIZE) {
5559 pte2 = pte2_load(pte2p);
5560 if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2))
5561 continue;
5562 else if (pte2_is_dirty(pte2)) {
5563 if (advice == MADV_DONTNEED) {
5564 /*
5565 * Future calls to pmap_is_modified()
5566 * can be avoided by making the page
5567 * dirty now.
5568 */
5569 m = PHYS_TO_VM_PAGE(pte2_pa(pte2));
5570 vm_page_dirty(m);
5571 }
5572 pte2_set_bit(pte2p, PTE2_NM);
5573 pte2_clear_bit(pte2p, PTE2_A);
5574 } else if ((pte2 & PTE2_A) != 0)
5575 pte2_clear_bit(pte2p, PTE2_A);
5576 else
5577 continue;
5578 pmap_tlb_flush(pmap, sva);
5579 }
5580 }
5581 if (pv_lists_locked) {
5582 sched_unpin();
5583 rw_wunlock(&pvh_global_lock);
5584 }
5585 PMAP_UNLOCK(pmap);
5586 }
5587
5588 /*
5589 * Clear the modify bits on the specified physical page.
5590 */
5591 void
5592 pmap_clear_modify(vm_page_t m)
5593 {
5594 struct md_page *pvh;
5595 pv_entry_t next_pv, pv;
5596 pmap_t pmap;
5597 pt1_entry_t *pte1p, opte1;
5598 pt2_entry_t *pte2p, opte2;
5599 vm_offset_t va;
5600
5601 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5602 ("%s: page %p is not managed", __func__, m));
5603 VM_OBJECT_ASSERT_WLOCKED(m->object);
5604 KASSERT(!vm_page_xbusied(m),
5605 ("%s: page %p is exclusive busy", __func__, m));
5606
5607 /*
5608 * If the page is not PGA_WRITEABLE, then no PTE2s can have PTE2_NM
5609 * cleared. If the object containing the page is locked and the page
5610 * is not exclusive busied, then PGA_WRITEABLE cannot be concurrently
5611 * set.
5612 */
5613 if ((m->aflags & PGA_WRITEABLE) == 0)
5614 return;
5615 rw_wlock(&pvh_global_lock);
5616 sched_pin();
5617 if ((m->flags & PG_FICTITIOUS) != 0)
5618 goto small_mappings;
5619 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5620 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5621 va = pv->pv_va;
5622 pmap = PV_PMAP(pv);
5623 PMAP_LOCK(pmap);
5624 pte1p = pmap_pte1(pmap, va);
5625 opte1 = pte1_load(pte1p);
5626 if (!(opte1 & PTE1_RO)) {
5627 if (pmap_demote_pte1(pmap, pte1p, va) &&
5628 !pte1_is_wired(opte1)) {
5629 /*
5630 * Write protect the mapping to a
5631 * single page so that a subsequent
5632 * write access may repromote.
5633 */
5634 va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1);
5635 pte2p = pmap_pte2_quick(pmap, va);
5636 opte2 = pte2_load(pte2p);
5637 if ((opte2 & PTE2_V)) {
5638 pte2_set_bit(pte2p, PTE2_NM | PTE2_RO);
5639 vm_page_dirty(m);
5640 pmap_tlb_flush(pmap, va);
5641 }
5642 }
5643 }
5644 PMAP_UNLOCK(pmap);
5645 }
5646 small_mappings:
5647 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5648 pmap = PV_PMAP(pv);
5649 PMAP_LOCK(pmap);
5650 pte1p = pmap_pte1(pmap, pv->pv_va);
5651 KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found"
5652 " a section in page %p's pv list", __func__, m));
5653 pte2p = pmap_pte2_quick(pmap, pv->pv_va);
5654 if (pte2_is_dirty(pte2_load(pte2p))) {
5655 pte2_set_bit(pte2p, PTE2_NM);
5656 pmap_tlb_flush(pmap, pv->pv_va);
5657 }
5658 PMAP_UNLOCK(pmap);
5659 }
5660 sched_unpin();
5661 rw_wunlock(&pvh_global_lock);
5662 }
5663
5664
5665 /*
5666 * Sets the memory attribute for the specified page.
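 *
 * When the attribute really changes, the page is also flushed from
 * the data cache through a transient per-CPU (CMAP2) mapping. An
 * illustrative (hypothetical) call, with "ma" being whatever
 * vm_memattr_t the caller requires:
 *
 *	pmap_page_set_memattr(m, ma);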
5667 */ 5668 void 5669 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5670 { 5671 pt2_entry_t *cmap2_pte2p; 5672 vm_memattr_t oma; 5673 vm_paddr_t pa; 5674 struct pcpu *pc; 5675 5676 oma = m->md.pat_mode; 5677 m->md.pat_mode = ma; 5678 5679 CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, 5680 VM_PAGE_TO_PHYS(m), oma, ma); 5681 if ((m->flags & PG_FICTITIOUS) != 0) 5682 return; 5683 #if 0 5684 /* 5685 * If "m" is a normal page, flush it from the cache. 5686 * 5687 * First, try to find an existing mapping of the page by sf 5688 * buffer. sf_buf_invalidate_cache() modifies mapping and 5689 * flushes the cache. 5690 */ 5691 if (sf_buf_invalidate_cache(m, oma)) 5692 return; 5693 #endif 5694 /* 5695 * If the page is not mapped by an sf buffer, map the page 5696 * transiently and do the invalidation. 5697 */ 5698 if (ma != oma) { 5699 pa = VM_PAGE_TO_PHYS(m); 5700 sched_pin(); 5701 pc = get_pcpu(); 5702 cmap2_pte2p = pc->pc_cmap2_pte2p; 5703 mtx_lock(&pc->pc_cmap_lock); 5704 if (pte2_load(cmap2_pte2p) != 0) 5705 panic("%s: CMAP2 busy", __func__); 5706 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, 5707 vm_memattr_to_pte2(ma))); 5708 dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); 5709 pte2_clear(cmap2_pte2p); 5710 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5711 sched_unpin(); 5712 mtx_unlock(&pc->pc_cmap_lock); 5713 } 5714 } 5715 5716 /* 5717 * Miscellaneous support routines follow 5718 */ 5719 5720 /* 5721 * Returns TRUE if the given page is mapped individually or as part of 5722 * a 1mpage. Otherwise, returns FALSE. 5723 */ 5724 boolean_t 5725 pmap_page_is_mapped(vm_page_t m) 5726 { 5727 boolean_t rv; 5728 5729 if ((m->oflags & VPO_UNMANAGED) != 0) 5730 return (FALSE); 5731 rw_wlock(&pvh_global_lock); 5732 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5733 ((m->flags & PG_FICTITIOUS) == 0 && 5734 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5735 rw_wunlock(&pvh_global_lock); 5736 return (rv); 5737 } 5738 5739 /* 5740 * Returns true if the pmap's pv is one of the first 5741 * 16 pvs linked to from this page. This count may 5742 * be changed upwards or downwards in the future; it 5743 * is only necessary that true be returned for a small 5744 * subset of pmaps for proper page aging. 5745 */ 5746 boolean_t 5747 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5748 { 5749 struct md_page *pvh; 5750 pv_entry_t pv; 5751 int loops = 0; 5752 boolean_t rv; 5753 5754 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5755 ("%s: page %p is not managed", __func__, m)); 5756 rv = FALSE; 5757 rw_wlock(&pvh_global_lock); 5758 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5759 if (PV_PMAP(pv) == pmap) { 5760 rv = TRUE; 5761 break; 5762 } 5763 loops++; 5764 if (loops >= 16) 5765 break; 5766 } 5767 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5768 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5769 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5770 if (PV_PMAP(pv) == pmap) { 5771 rv = TRUE; 5772 break; 5773 } 5774 loops++; 5775 if (loops >= 16) 5776 break; 5777 } 5778 } 5779 rw_wunlock(&pvh_global_lock); 5780 return (rv); 5781 } 5782 5783 /* 5784 * pmap_zero_page zeros the specified hardware page by mapping 5785 * the page into KVM and using bzero to clear its contents.
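 * The temporary mapping uses the per-CPU CMAP2 slot, so the thread is pinned with sched_pin() while the mapping exists and the slot is protected by the per-CPU cmap lock.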
5786 */ 5787 void 5788 pmap_zero_page(vm_page_t m) 5789 { 5790 pt2_entry_t *cmap2_pte2p; 5791 struct pcpu *pc; 5792 5793 sched_pin(); 5794 pc = get_pcpu(); 5795 cmap2_pte2p = pc->pc_cmap2_pte2p; 5796 mtx_lock(&pc->pc_cmap_lock); 5797 if (pte2_load(cmap2_pte2p) != 0) 5798 panic("%s: CMAP2 busy", __func__); 5799 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5800 vm_page_pte2_attr(m))); 5801 pagezero(pc->pc_cmap2_addr); 5802 pte2_clear(cmap2_pte2p); 5803 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5804 sched_unpin(); 5805 mtx_unlock(&pc->pc_cmap_lock); 5806 } 5807 5808 /* 5809 * pmap_zero_page_area zeros the specified hardware page by mapping 5810 * the page into KVM and using bzero to clear its contents. 5811 * 5812 * off and size may not cover an area beyond a single hardware page. 5813 */ 5814 void 5815 pmap_zero_page_area(vm_page_t m, int off, int size) 5816 { 5817 pt2_entry_t *cmap2_pte2p; 5818 struct pcpu *pc; 5819 5820 sched_pin(); 5821 pc = get_pcpu(); 5822 cmap2_pte2p = pc->pc_cmap2_pte2p; 5823 mtx_lock(&pc->pc_cmap_lock); 5824 if (pte2_load(cmap2_pte2p) != 0) 5825 panic("%s: CMAP2 busy", __func__); 5826 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5827 vm_page_pte2_attr(m))); 5828 if (off == 0 && size == PAGE_SIZE) 5829 pagezero(pc->pc_cmap2_addr); 5830 else 5831 bzero(pc->pc_cmap2_addr + off, size); 5832 pte2_clear(cmap2_pte2p); 5833 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5834 sched_unpin(); 5835 mtx_unlock(&pc->pc_cmap_lock); 5836 } 5837 5838 /* 5839 * pmap_copy_page copies the specified (machine independent) 5840 * page by mapping the page into virtual memory and using 5841 * bcopy to copy the page, one machine dependent page at a 5842 * time. 5843 */ 5844 void 5845 pmap_copy_page(vm_page_t src, vm_page_t dst) 5846 { 5847 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5848 struct pcpu *pc; 5849 5850 sched_pin(); 5851 pc = get_pcpu(); 5852 cmap1_pte2p = pc->pc_cmap1_pte2p; 5853 cmap2_pte2p = pc->pc_cmap2_pte2p; 5854 mtx_lock(&pc->pc_cmap_lock); 5855 if (pte2_load(cmap1_pte2p) != 0) 5856 panic("%s: CMAP1 busy", __func__); 5857 if (pte2_load(cmap2_pte2p) != 0) 5858 panic("%s: CMAP2 busy", __func__); 5859 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), 5860 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); 5861 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), 5862 PTE2_AP_KRW, vm_page_pte2_attr(dst))); 5863 bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); 5864 pte2_clear(cmap1_pte2p); 5865 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5866 pte2_clear(cmap2_pte2p); 5867 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5868 sched_unpin(); 5869 mtx_unlock(&pc->pc_cmap_lock); 5870 } 5871 5872 int unmapped_buf_allowed = 1; 5873 5874 void 5875 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 5876 vm_offset_t b_offset, int xfersize) 5877 { 5878 pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; 5879 vm_page_t a_pg, b_pg; 5880 char *a_cp, *b_cp; 5881 vm_offset_t a_pg_offset, b_pg_offset; 5882 struct pcpu *pc; 5883 int cnt; 5884 5885 sched_pin(); 5886 pc = get_pcpu(); 5887 cmap1_pte2p = pc->pc_cmap1_pte2p; 5888 cmap2_pte2p = pc->pc_cmap2_pte2p; 5889 mtx_lock(&pc->pc_cmap_lock); 5890 if (pte2_load(cmap1_pte2p) != 0) 5891 panic("pmap_copy_pages: CMAP1 busy"); 5892 if (pte2_load(cmap2_pte2p) != 0) 5893 panic("pmap_copy_pages: CMAP2 busy"); 5894 while (xfersize > 0) { 5895 a_pg = ma[a_offset >> PAGE_SHIFT]; 5896 a_pg_offset = a_offset & PAGE_MASK; 5897 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 5898 b_pg = mb[b_offset >> 
PAGE_SHIFT]; 5899 b_pg_offset = b_offset & PAGE_MASK; 5900 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 5901 pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), 5902 PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); 5903 tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); 5904 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), 5905 PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); 5906 tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); 5907 a_cp = pc->pc_cmap1_addr + a_pg_offset; 5908 b_cp = pc->pc_cmap2_addr + b_pg_offset; 5909 bcopy(a_cp, b_cp, cnt); 5910 a_offset += cnt; 5911 b_offset += cnt; 5912 xfersize -= cnt; 5913 } 5914 pte2_clear(cmap1_pte2p); 5915 tlb_flush((vm_offset_t)pc->pc_cmap1_addr); 5916 pte2_clear(cmap2_pte2p); 5917 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 5918 sched_unpin(); 5919 mtx_unlock(&pc->pc_cmap_lock); 5920 } 5921 5922 vm_offset_t 5923 pmap_quick_enter_page(vm_page_t m) 5924 { 5925 struct pcpu *pc; 5926 pt2_entry_t *pte2p; 5927 5928 critical_enter(); 5929 pc = get_pcpu(); 5930 pte2p = pc->pc_qmap_pte2p; 5931 5932 KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); 5933 5934 pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 5935 vm_page_pte2_attr(m))); 5936 return (pc->pc_qmap_addr); 5937 } 5938 5939 void 5940 pmap_quick_remove_page(vm_offset_t addr) 5941 { 5942 struct pcpu *pc; 5943 pt2_entry_t *pte2p; 5944 5945 pc = get_pcpu(); 5946 pte2p = pc->pc_qmap_pte2p; 5947 5948 KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); 5949 KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); 5950 5951 pte2_clear(pte2p); 5952 tlb_flush(pc->pc_qmap_addr); 5953 critical_exit(); 5954 } 5955 5956 /* 5957 * Copy the range specified by src_addr/len 5958 * from the source map to the range dst_addr/len 5959 * in the destination map. 5960 * 5961 * This routine is only advisory and need not do anything. 
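 * In this implementation mappings are copied only when dst_addr equals src_addr and the source pmap is the current one. Only managed 4 KB mappings are copied, and they start out unwired, unreferenced, and not hardware writable (PTE2_NM set) in the destination pmap; 1 MB section mappings are copied with the wired bit cleared.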
5962 */ 5963 void 5964 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 5965 vm_offset_t src_addr) 5966 { 5967 struct spglist free; 5968 vm_offset_t addr; 5969 vm_offset_t end_addr = src_addr + len; 5970 vm_offset_t nextva; 5971 5972 if (dst_addr != src_addr) 5973 return; 5974 5975 if (!pmap_is_current(src_pmap)) 5976 return; 5977 5978 rw_wlock(&pvh_global_lock); 5979 if (dst_pmap < src_pmap) { 5980 PMAP_LOCK(dst_pmap); 5981 PMAP_LOCK(src_pmap); 5982 } else { 5983 PMAP_LOCK(src_pmap); 5984 PMAP_LOCK(dst_pmap); 5985 } 5986 sched_pin(); 5987 for (addr = src_addr; addr < end_addr; addr = nextva) { 5988 pt2_entry_t *src_pte2p, *dst_pte2p; 5989 vm_page_t dst_mpt2pg, src_mpt2pg; 5990 pt1_entry_t src_pte1; 5991 u_int pte1_idx; 5992 5993 KASSERT(addr < VM_MAXUSER_ADDRESS, 5994 ("%s: invalid to pmap_copy page tables", __func__)); 5995 5996 nextva = pte1_trunc(addr + PTE1_SIZE); 5997 if (nextva < addr) 5998 nextva = end_addr; 5999 6000 pte1_idx = pte1_index(addr); 6001 src_pte1 = src_pmap->pm_pt1[pte1_idx]; 6002 if (pte1_is_section(src_pte1)) { 6003 if ((addr & PTE1_OFFSET) != 0 || 6004 (addr + PTE1_SIZE) > end_addr) 6005 continue; 6006 if (dst_pmap->pm_pt1[pte1_idx] == 0 && 6007 (!pte1_is_managed(src_pte1) || 6008 pmap_pv_insert_pte1(dst_pmap, addr, 6009 pte1_pa(src_pte1)))) { 6010 dst_pmap->pm_pt1[pte1_idx] = src_pte1 & 6011 ~PTE1_W; 6012 dst_pmap->pm_stats.resident_count += 6013 PTE1_SIZE / PAGE_SIZE; 6014 pmap_pte1_mappings++; 6015 } 6016 continue; 6017 } else if (!pte1_is_link(src_pte1)) 6018 continue; 6019 6020 src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); 6021 6022 /* 6023 * We leave PT2s to be linked from PT1 even if they are not 6024 * referenced until all PT2s in a page are without reference. 6025 * 6026 * QQQ: It could be changed ... 6027 */ 6028 #if 0 /* single_pt2_link_is_cleared */ 6029 KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, 6030 ("%s: source page table page is unused", __func__)); 6031 #else 6032 if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) 6033 continue; 6034 #endif 6035 if (nextva > end_addr) 6036 nextva = end_addr; 6037 6038 src_pte2p = pt2map_entry(addr); 6039 while (addr < nextva) { 6040 pt2_entry_t temp_pte2; 6041 temp_pte2 = pte2_load(src_pte2p); 6042 /* 6043 * we only virtual copy managed pages 6044 */ 6045 if (pte2_is_managed(temp_pte2)) { 6046 dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, 6047 PMAP_ENTER_NOSLEEP); 6048 if (dst_mpt2pg == NULL) 6049 goto out; 6050 dst_pte2p = pmap_pte2_quick(dst_pmap, addr); 6051 if (!pte2_is_valid(pte2_load(dst_pte2p)) && 6052 pmap_try_insert_pv_entry(dst_pmap, addr, 6053 PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { 6054 /* 6055 * Clear the wired, modified, and 6056 * accessed (referenced) bits 6057 * during the copy. 
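 * This way the destination mapping begins life unwired and clean, and later references and writes through it are accounted for by the access and modify emulation in pmap_fault().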
6058 */ 6059 temp_pte2 &= ~(PTE2_W | PTE2_A); 6060 temp_pte2 |= PTE2_NM; 6061 pte2_store(dst_pte2p, temp_pte2); 6062 dst_pmap->pm_stats.resident_count++; 6063 } else { 6064 SLIST_INIT(&free); 6065 if (pmap_unwire_pt2(dst_pmap, addr, 6066 dst_mpt2pg, &free)) { 6067 pmap_tlb_flush(dst_pmap, addr); 6068 pmap_free_zero_pages(&free); 6069 } 6070 goto out; 6071 } 6072 if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= 6073 pt2_wirecount_get(src_mpt2pg, pte1_idx)) 6074 break; 6075 } 6076 addr += PAGE_SIZE; 6077 src_pte2p++; 6078 } 6079 } 6080 out: 6081 sched_unpin(); 6082 rw_wunlock(&pvh_global_lock); 6083 PMAP_UNLOCK(src_pmap); 6084 PMAP_UNLOCK(dst_pmap); 6085 } 6086 6087 /* 6088 * Increase the starting virtual address of the given mapping if a 6089 * different alignment might result in more section mappings. 6090 */ 6091 void 6092 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6093 vm_offset_t *addr, vm_size_t size) 6094 { 6095 vm_offset_t pte1_offset; 6096 6097 if (size < PTE1_SIZE) 6098 return; 6099 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6100 offset += ptoa(object->pg_color); 6101 pte1_offset = offset & PTE1_OFFSET; 6102 if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || 6103 (*addr & PTE1_OFFSET) == pte1_offset) 6104 return; 6105 if ((*addr & PTE1_OFFSET) < pte1_offset) 6106 *addr = pte1_trunc(*addr) + pte1_offset; 6107 else 6108 *addr = pte1_roundup(*addr) + pte1_offset; 6109 } 6110 6111 void 6112 pmap_activate(struct thread *td) 6113 { 6114 pmap_t pmap, oldpmap; 6115 u_int cpuid, ttb; 6116 6117 PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); 6118 6119 critical_enter(); 6120 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6121 oldpmap = PCPU_GET(curpmap); 6122 cpuid = PCPU_GET(cpuid); 6123 6124 #if defined(SMP) 6125 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6126 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6127 #else 6128 CPU_CLR(cpuid, &oldpmap->pm_active); 6129 CPU_SET(cpuid, &pmap->pm_active); 6130 #endif 6131 6132 ttb = pmap_ttb_get(pmap); 6133 6134 /* 6135 * pmap_activate is for the current thread on the current cpu 6136 */ 6137 td->td_pcb->pcb_pagedir = ttb; 6138 cp15_ttbr_set(ttb); 6139 PCPU_SET(curpmap, pmap); 6140 critical_exit(); 6141 } 6142 6143 /* 6144 * Perform the pmap work for mincore. 
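 * For example, a resident, referenced, and dirty 1 MB section mapping yields MINCORE_INCORE | MINCORE_SUPER | MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER, while an unmapped address yields 0.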
6145 */ 6146 int 6147 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 6148 { 6149 pt1_entry_t *pte1p, pte1; 6150 pt2_entry_t *pte2p, pte2; 6151 vm_paddr_t pa; 6152 bool managed; 6153 int val; 6154 6155 PMAP_LOCK(pmap); 6156 retry: 6157 pte1p = pmap_pte1(pmap, addr); 6158 pte1 = pte1_load(pte1p); 6159 if (pte1_is_section(pte1)) { 6160 pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); 6161 managed = pte1_is_managed(pte1); 6162 val = MINCORE_SUPER | MINCORE_INCORE; 6163 if (pte1_is_dirty(pte1)) 6164 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6165 if (pte1 & PTE1_A) 6166 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6167 } else if (pte1_is_link(pte1)) { 6168 pte2p = pmap_pte2(pmap, addr); 6169 pte2 = pte2_load(pte2p); 6170 pmap_pte2_release(pte2p); 6171 pa = pte2_pa(pte2); 6172 managed = pte2_is_managed(pte2); 6173 val = MINCORE_INCORE; 6174 if (pte2_is_dirty(pte2)) 6175 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6176 if (pte2 & PTE2_A) 6177 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6178 } else { 6179 managed = false; 6180 val = 0; 6181 } 6182 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6183 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 6184 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 6185 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 6186 goto retry; 6187 } else 6188 PA_UNLOCK_COND(*locked_pa); 6189 PMAP_UNLOCK(pmap); 6190 return (val); 6191 } 6192 6193 void 6194 pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) 6195 { 6196 vm_offset_t sva; 6197 uint32_t l2attr; 6198 6199 KASSERT((size & PAGE_MASK) == 0, 6200 ("%s: device mapping not page-sized", __func__)); 6201 6202 sva = va; 6203 l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); 6204 while (size != 0) { 6205 pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); 6206 va += PAGE_SIZE; 6207 pa += PAGE_SIZE; 6208 size -= PAGE_SIZE; 6209 } 6210 tlb_flush_range(sva, va - sva); 6211 } 6212 6213 void 6214 pmap_kremove_device(vm_offset_t va, vm_size_t size) 6215 { 6216 vm_offset_t sva; 6217 6218 KASSERT((size & PAGE_MASK) == 0, 6219 ("%s: device mapping not page-sized", __func__)); 6220 6221 sva = va; 6222 while (size != 0) { 6223 pmap_kremove(va); 6224 va += PAGE_SIZE; 6225 size -= PAGE_SIZE; 6226 } 6227 tlb_flush_range(sva, va - sva); 6228 } 6229 6230 void 6231 pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) 6232 { 6233 6234 pcb->pcb_pagedir = pmap_ttb_get(pmap); 6235 } 6236 6237 6238 /* 6239 * Clean L1 data cache range by physical address. 6240 * The range must be within a single page. 6241 */ 6242 static void 6243 pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) 6244 { 6245 pt2_entry_t *cmap2_pte2p; 6246 struct pcpu *pc; 6247 6248 KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, 6249 ("%s: not on single page", __func__)); 6250 6251 sched_pin(); 6252 pc = get_pcpu(); 6253 cmap2_pte2p = pc->pc_cmap2_pte2p; 6254 mtx_lock(&pc->pc_cmap_lock); 6255 if (pte2_load(cmap2_pte2p) != 0) 6256 panic("%s: CMAP2 busy", __func__); 6257 pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); 6258 dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size); 6259 pte2_clear(cmap2_pte2p); 6260 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6261 sched_unpin(); 6262 mtx_unlock(&pc->pc_cmap_lock); 6263 } 6264 6265 /* 6266 * Sync instruction cache range which is not mapped yet. 
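 * Since no valid mapping of the range exists yet, the data cache is written back page by page through a temporary mapping made by physical address, and the whole instruction cache is then invalidated.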
6267 */ 6268 void 6269 cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) 6270 { 6271 uint32_t len, offset; 6272 vm_page_t m; 6273 6274 /* Write back d-cache on the given address range. */ 6275 offset = pa & PAGE_MASK; 6276 for ( ; size != 0; size -= len, pa += len, offset = 0) { 6277 len = min(PAGE_SIZE - offset, size); 6278 m = PHYS_TO_VM_PAGE(pa); 6279 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", 6280 __func__, pa)); 6281 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); 6282 } 6283 /* 6284 * The I-cache is VIPT. The only way to flush all virtual mappings 6285 * of a given physical address is to invalidate the whole I-cache. 6286 */ 6287 icache_inv_all(); 6288 } 6289 6290 void 6291 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size) 6292 { 6293 6294 /* Write back d-cache on the given address range. */ 6295 if (va >= VM_MIN_KERNEL_ADDRESS) { 6296 dcache_wb_pou(va, size); 6297 } else { 6298 uint32_t len, offset; 6299 vm_paddr_t pa; 6300 vm_page_t m; 6301 6302 offset = va & PAGE_MASK; 6303 for ( ; size != 0; size -= len, va += len, offset = 0) { 6304 pa = pmap_extract(pmap, va); /* offset is preserved */ 6305 len = min(PAGE_SIZE - offset, size); 6306 m = PHYS_TO_VM_PAGE(pa); 6307 KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", 6308 __func__, pa)); 6309 pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); 6310 } 6311 } 6312 /* 6313 * The I-cache is VIPT. The only way to flush all virtual mappings 6314 * of a given physical address is to invalidate the whole I-cache. 6315 */ 6316 icache_inv_all(); 6317 } 6318 6319 /* 6320 * The implementation of pmap_fault() uses the IN_RANGE2() macro, which 6321 * depends on the fact that the given range size is a power of 2. 6322 */ 6323 CTASSERT(powerof2(NB_IN_PT1)); 6324 CTASSERT(powerof2(PT2MAP_SIZE)); 6325 6326 #define IN_RANGE2(addr, start, size) \ 6327 ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) 6328 6329 /* 6330 * Handle access and R/W emulation faults. 6331 */ 6332 int 6333 pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode) 6334 { 6335 pt1_entry_t *pte1p, pte1; 6336 pt2_entry_t *pte2p, pte2; 6337 6338 if (pmap == NULL) 6339 pmap = kernel_pmap; 6340 6341 /* 6342 * In the kernel, we should never get an abort with a FAR that lies in 6343 * the range of the pmap->pm_pt1 or PT2MAP address spaces. If it happens, 6344 * stop here, print a useful abort message, and even get to the debugger; 6345 * otherwise it likely ends in a never-ending loop of aborts. 6346 */ 6347 if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { 6348 /* 6349 * All L1 tables should always be mapped and present. 6350 * However, we check only the current one here. For user mode, 6351 * a permission abort from a malicious user is not fatal, and 6352 * neither is an alignment abort, as it may have higher priority. 6353 */ 6354 if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) { 6355 CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", 6356 __func__, pmap, pmap->pm_pt1, far); 6357 panic("%s: pm_pt1 abort", __func__); 6358 } 6359 return (KERN_INVALID_ADDRESS); 6360 } 6361 if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { 6362 /* 6363 * PT2MAP should always be mapped and present in the current 6364 * L1 table. However, only existing L2 tables are mapped 6365 * in PT2MAP. For user mode, an L2 translation abort and a 6366 * permission abort from a malicious user are not fatal, and 6367 * neither is an alignment abort, as it may have higher priority.
6368 */ 6369 if (!usermode || (idx != FAULT_ALIGN && 6370 idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { 6371 CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", 6372 __func__, pmap, PT2MAP, far); 6373 panic("%s: PT2MAP abort", __func__); 6374 } 6375 return (KERN_INVALID_ADDRESS); 6376 } 6377 6378 /* 6379 * A pmap lock is used below for the handling of access and R/W 6380 * emulation aborts. They were handled by atomic operations before, so 6381 * some analysis of the new situation is needed to answer the following 6382 * question: is it safe to use the lock even for these aborts? 6383 * 6384 * In general, two cases may happen: 6385 * 6386 * (1) Aborts while the pmap lock is already held - this should not 6387 * happen as the pmap lock is not recursive. However, under the pmap lock 6388 * only internal kernel data should be accessed, and such data should be 6389 * mapped with the A bit set and the NM bit cleared. If a double abort 6390 * happens, then the mapping of the data which caused it must be fixed. 6391 * Further, all new mappings are always made with the A bit set and the 6392 * bit can be cleared only on managed mappings. 6393 * 6394 * (2) Aborts while other locks are held - this can already happen. 6395 * However, it makes no difference here whether it is an access or 6396 * R/W emulation abort, or some other abort. 6397 */ 6398 6399 PMAP_LOCK(pmap); 6400 #ifdef SMP 6401 /* 6402 * Special treatment is needed due to the break-before-make approach 6403 * used when pte1 is updated for a userland mapping during section 6404 * promotion or demotion. If not caught here, pmap_enter() can find a 6405 * section mapping on the faulting address. That is not allowed. 6406 */ 6407 if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) { 6408 PMAP_UNLOCK(pmap); 6409 return (KERN_SUCCESS); 6410 } 6411 #endif 6412 /* 6413 * Access bits for page and section. Note that the entry 6414 * is not in the TLB yet, so a TLB flush is not necessary. 6415 * 6416 * QQQ: This is hardware emulation, we do not call userret() 6417 * for aborts from user mode. 6418 */ 6419 if (idx == FAULT_ACCESS_L2) { 6420 pte2p = pt2map_entry(far); 6421 pte2 = pte2_load(pte2p); 6422 if (pte2_is_valid(pte2)) { 6423 pte2_store(pte2p, pte2 | PTE2_A); 6424 PMAP_UNLOCK(pmap); 6425 return (KERN_SUCCESS); 6426 } 6427 } 6428 if (idx == FAULT_ACCESS_L1) { 6429 pte1p = pmap_pte1(pmap, far); 6430 pte1 = pte1_load(pte1p); 6431 if (pte1_is_section(pte1)) { 6432 pte1_store(pte1p, pte1 | PTE1_A); 6433 PMAP_UNLOCK(pmap); 6434 return (KERN_SUCCESS); 6435 } 6436 } 6437 6438 /* 6439 * Handle modify bits for page and section. Note that the modify 6440 * bit is emulated by software. So PTEx_RO is the software read-only 6441 * bit and the PTEx_NM flag is the real hardware read-only bit. 6442 * 6443 * QQQ: This is hardware emulation, we do not call userret() 6444 * for aborts from user mode.
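 * For example, when a writable managed mapping has PTE2_NM set (as done e.g. by pmap_clear_modify()), it is read-only for the hardware. The first store through it then raises a permission abort, the code below clears PTE2_NM and flushes the stale TLB entry, and the entry, now with both PTE2_RO and PTE2_NM clear, is again treated as dirty.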
6445 */ 6446 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { 6447 pte2p = pt2map_entry(far); 6448 pte2 = pte2_load(pte2p); 6449 if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && 6450 (pte2 & PTE2_NM)) { 6451 pte2_store(pte2p, pte2 & ~PTE2_NM); 6452 tlb_flush(trunc_page(far)); 6453 PMAP_UNLOCK(pmap); 6454 return (KERN_SUCCESS); 6455 } 6456 } 6457 if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { 6458 pte1p = pmap_pte1(pmap, far); 6459 pte1 = pte1_load(pte1p); 6460 if (pte1_is_section(pte1) && !(pte1 & PTE1_RO) && 6461 (pte1 & PTE1_NM)) { 6462 pte1_store(pte1p, pte1 & ~PTE1_NM); 6463 tlb_flush(pte1_trunc(far)); 6464 PMAP_UNLOCK(pmap); 6465 return (KERN_SUCCESS); 6466 } 6467 } 6468 6469 /* 6470 * QQQ: The previous code, mainly fast handling of access and 6471 * modify bits aborts, could be moved to ASM. Now we are 6472 * starting to deal with not fast aborts. 6473 */ 6474 6475 #ifdef INVARIANTS 6476 /* 6477 * Read an entry in PT2TAB associated with both pmap and far. 6478 * It's safe because PT2TAB is always mapped. 6479 */ 6480 pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); 6481 if (pte2_is_valid(pte2)) { 6482 /* 6483 * Now, when we know that L2 page table is allocated, 6484 * we can use PT2MAP to get L2 page table entry. 6485 */ 6486 pte2 = pte2_load(pt2map_entry(far)); 6487 if (pte2_is_valid(pte2)) { 6488 /* 6489 * If L2 page table entry is valid, make sure that 6490 * L1 page table entry is valid too. Note that we 6491 * leave L2 page entries untouched when promoted. 6492 */ 6493 pte1 = pte1_load(pmap_pte1(pmap, far)); 6494 if (!pte1_is_valid(pte1)) { 6495 panic("%s: missing L1 page entry (%p, %#x)", 6496 __func__, pmap, far); 6497 } 6498 } 6499 } 6500 #endif 6501 PMAP_UNLOCK(pmap); 6502 return (KERN_FAILURE); 6503 } 6504 6505 #if defined(PMAP_DEBUG) 6506 /* 6507 * Reusing of KVA used in pmap_zero_page function !!! 6508 */ 6509 static void 6510 pmap_zero_page_check(vm_page_t m) 6511 { 6512 pt2_entry_t *cmap2_pte2p; 6513 uint32_t *p, *end; 6514 struct pcpu *pc; 6515 6516 sched_pin(); 6517 pc = get_pcpu(); 6518 cmap2_pte2p = pc->pc_cmap2_pte2p; 6519 mtx_lock(&pc->pc_cmap_lock); 6520 if (pte2_load(cmap2_pte2p) != 0) 6521 panic("%s: CMAP2 busy", __func__); 6522 pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, 6523 vm_page_pte2_attr(m))); 6524 end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); 6525 for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) 6526 if (*p != 0) 6527 panic("%s: page %p not zero, va: %p", __func__, m, 6528 pc->pc_cmap2_addr); 6529 pte2_clear(cmap2_pte2p); 6530 tlb_flush((vm_offset_t)pc->pc_cmap2_addr); 6531 sched_unpin(); 6532 mtx_unlock(&pc->pc_cmap_lock); 6533 } 6534 6535 int 6536 pmap_pid_dump(int pid) 6537 { 6538 pmap_t pmap; 6539 struct proc *p; 6540 int npte2 = 0; 6541 int i, j, index; 6542 6543 sx_slock(&allproc_lock); 6544 FOREACH_PROC_IN_SYSTEM(p) { 6545 if (p->p_pid != pid || p->p_vmspace == NULL) 6546 continue; 6547 index = 0; 6548 pmap = vmspace_pmap(p->p_vmspace); 6549 for (i = 0; i < NPTE1_IN_PT1; i++) { 6550 pt1_entry_t pte1; 6551 pt2_entry_t *pte2p, pte2; 6552 vm_offset_t base, va; 6553 vm_paddr_t pa; 6554 vm_page_t m; 6555 6556 base = i << PTE1_SHIFT; 6557 pte1 = pte1_load(&pmap->pm_pt1[i]); 6558 6559 if (pte1_is_section(pte1)) { 6560 /* 6561 * QQQ: Do something here! 
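 * (1 MB section mappings are currently just skipped by this dump.)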
6562 */ 6563 } else if (pte1_is_link(pte1)) { 6564 for (j = 0; j < NPTE2_IN_PT2; j++) { 6565 va = base + (j << PAGE_SHIFT); 6566 if (va >= VM_MIN_KERNEL_ADDRESS) { 6567 if (index) { 6568 index = 0; 6569 printf("\n"); 6570 } 6571 sx_sunlock(&allproc_lock); 6572 return (npte2); 6573 } 6574 pte2p = pmap_pte2(pmap, va); 6575 pte2 = pte2_load(pte2p); 6576 pmap_pte2_release(pte2p); 6577 if (!pte2_is_valid(pte2)) 6578 continue; 6579 6580 pa = pte2_pa(pte2); 6581 m = PHYS_TO_VM_PAGE(pa); 6582 printf("va: 0x%x, pa: 0x%x, h: %d, w:" 6583 " %d, f: 0x%x", va, pa, 6584 m->hold_count, m->wire_count, 6585 m->flags); 6586 npte2++; 6587 index++; 6588 if (index >= 2) { 6589 index = 0; 6590 printf("\n"); 6591 } else { 6592 printf(" "); 6593 } 6594 } 6595 } 6596 } 6597 } 6598 sx_sunlock(&allproc_lock); 6599 return (npte2); 6600 } 6601 6602 #endif 6603 6604 #ifdef DDB 6605 static pt2_entry_t * 6606 pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) 6607 { 6608 pt1_entry_t pte1; 6609 vm_paddr_t pt2pg_pa; 6610 6611 pte1 = pte1_load(pmap_pte1(pmap, va)); 6612 if (!pte1_is_link(pte1)) 6613 return (NULL); 6614 6615 if (pmap_is_current(pmap)) 6616 return (pt2map_entry(va)); 6617 6618 /* Note that L2 page table size is not equal to PAGE_SIZE. */ 6619 pt2pg_pa = trunc_page(pte1_link_pa(pte1)); 6620 if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { 6621 pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); 6622 #ifdef SMP 6623 PMAP3cpu = PCPU_GET(cpuid); 6624 #endif 6625 tlb_flush_local((vm_offset_t)PADDR3); 6626 } 6627 #ifdef SMP 6628 else if (PMAP3cpu != PCPU_GET(cpuid)) { 6629 PMAP3cpu = PCPU_GET(cpuid); 6630 tlb_flush_local((vm_offset_t)PADDR3); 6631 } 6632 #endif 6633 return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); 6634 } 6635 6636 static void 6637 dump_pmap(pmap_t pmap) 6638 { 6639 6640 printf("pmap %p\n", pmap); 6641 printf(" pm_pt1: %p\n", pmap->pm_pt1); 6642 printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); 6643 printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); 6644 } 6645 6646 DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) 6647 { 6648 6649 pmap_t pmap; 6650 LIST_FOREACH(pmap, &allpmaps, pm_list) { 6651 dump_pmap(pmap); 6652 } 6653 } 6654 6655 static int 6656 pte2_class(pt2_entry_t pte2) 6657 { 6658 int cls; 6659 6660 cls = (pte2 >> 2) & 0x03; 6661 cls |= (pte2 >> 4) & 0x04; 6662 return (cls); 6663 } 6664 6665 static void 6666 dump_section(pmap_t pmap, uint32_t pte1_idx) 6667 { 6668 } 6669 6670 static void 6671 dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) 6672 { 6673 uint32_t i; 6674 vm_offset_t va; 6675 pt2_entry_t *pte2p, pte2; 6676 vm_page_t m; 6677 6678 va = pte1_idx << PTE1_SHIFT; 6679 pte2p = pmap_pte2_ddb(pmap, va); 6680 for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { 6681 pte2 = pte2_load(pte2p); 6682 if (pte2 == 0) 6683 continue; 6684 if (!pte2_is_valid(pte2)) { 6685 printf(" 0x%08X: 0x%08X", va, pte2); 6686 if (!invalid_ok) 6687 printf(" - not valid !!!"); 6688 printf("\n"); 6689 continue; 6690 } 6691 m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); 6692 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, 6693 pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); 6694 if (m != NULL) { 6695 printf(" v:%d h:%d w:%d f:0x%04X\n", m->valid, 6696 m->hold_count, m->wire_count, m->flags); 6697 } else { 6698 printf("\n"); 6699 } 6700 } 6701 } 6702 6703 static __inline boolean_t 6704 is_pv_chunk_space(vm_offset_t va) 6705 { 6706 6707 if ((((vm_offset_t)pv_chunkbase) <= va) && 6708 (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) 6709 return (TRUE); 6710 return (FALSE); 6711 } 6712 
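/*
 * Illustrative usage from the kernel debugger (the exact ddb syntax
 * may vary), with an example pmap pointer value:
 *
 *	db> show pmaps
 *	db> show pmap/u 0xc1234567
 *
 * The optional address is a pmap pointer taken from "show pmaps" (it
 * must be on the allpmaps list) and the 'u' modifier restricts the
 * dump to user addresses; without an address the current pmap is used.
 */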
6713 DB_SHOW_COMMAND(pmap, pmap_pmap_print) 6714 { 6715 /* XXX convert args. */ 6716 pmap_t pmap = (pmap_t)addr; 6717 pt1_entry_t pte1; 6718 pt2_entry_t pte2; 6719 vm_offset_t va, eva; 6720 vm_page_t m; 6721 uint32_t i; 6722 boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; 6723 6724 if (have_addr) { 6725 pmap_t pm; 6726 6727 LIST_FOREACH(pm, &allpmaps, pm_list) 6728 if (pm == pmap) break; 6729 if (pm == NULL) { 6730 printf("given pmap %p is not in allpmaps list\n", pmap); 6731 return; 6732 } 6733 } else 6734 pmap = PCPU_GET(curpmap); 6735 6736 eva = (modif[0] == 'u') ? VM_MAXUSER_ADDRESS : 0xFFFFFFFF; 6737 dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ 6738 6739 printf("pmap: 0x%08X\n", (uint32_t)pmap); 6740 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6741 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6742 6743 for(i = 0; i < NPTE1_IN_PT1; i++) { 6744 pte1 = pte1_load(&pmap->pm_pt1[i]); 6745 if (pte1 == 0) 6746 continue; 6747 va = i << PTE1_SHIFT; 6748 if (va >= eva) 6749 break; 6750 6751 if (pte1_is_section(pte1)) { 6752 printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, 6753 !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); 6754 dump_section(pmap, i); 6755 } else if (pte1_is_link(pte1)) { 6756 dump_link_ok = TRUE; 6757 invalid_ok = FALSE; 6758 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6759 m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); 6760 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", 6761 va, pte1, pte2, m); 6762 if (is_pv_chunk_space(va)) { 6763 printf(" - pv_chunk space"); 6764 if (dump_pv_chunk) 6765 invalid_ok = TRUE; 6766 else 6767 dump_link_ok = FALSE; 6768 } 6769 else if (m != NULL) 6770 printf(" w:%d w2:%u", m->wire_count, 6771 pt2_wirecount_get(m, pte1_index(va))); 6772 if (pte2 == 0) 6773 printf(" !!! pt2tab entry is ZERO"); 6774 else if (pte2_pa(pte1) != pte2_pa(pte2)) 6775 printf(" !!! pt2tab entry is DIFFERENT - m: %p", 6776 PHYS_TO_VM_PAGE(pte2_pa(pte2))); 6777 printf("\n"); 6778 if (dump_link_ok) 6779 dump_link(pmap, i, invalid_ok); 6780 } else 6781 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6782 } 6783 } 6784 6785 static void 6786 dump_pt2tab(pmap_t pmap) 6787 { 6788 uint32_t i; 6789 pt2_entry_t pte2; 6790 vm_offset_t va; 6791 vm_paddr_t pa; 6792 vm_page_t m; 6793 6794 printf("PT2TAB:\n"); 6795 for (i = 0; i < PT2TAB_ENTRIES; i++) { 6796 pte2 = pte2_load(&pmap->pm_pt2tab[i]); 6797 if (!pte2_is_valid(pte2)) 6798 continue; 6799 va = i << PT2TAB_SHIFT; 6800 pa = pte2_pa(pte2); 6801 m = PHYS_TO_VM_PAGE(pa); 6802 printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, 6803 pte2_class(pte2), !!(pte2 & PTE2_S), m); 6804 if (m != NULL) 6805 printf(" , h: %d, w: %d, f: 0x%04X pidx: %lld", 6806 m->hold_count, m->wire_count, m->flags, m->pindex); 6807 printf("\n"); 6808 } 6809 } 6810 6811 DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) 6812 { 6813 /* XXX convert args. 
*/ 6814 pmap_t pmap = (pmap_t)addr; 6815 pt1_entry_t pte1; 6816 pt2_entry_t pte2; 6817 vm_offset_t va; 6818 uint32_t i, start; 6819 6820 if (have_addr) { 6821 printf("supported only on current pmap\n"); 6822 return; 6823 } 6824 6825 pmap = PCPU_GET(curpmap); 6826 printf("curpmap: 0x%08X\n", (uint32_t)pmap); 6827 printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); 6828 printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); 6829 6830 start = pte1_index((vm_offset_t)PT2MAP); 6831 for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { 6832 pte1 = pte1_load(&pmap->pm_pt1[i]); 6833 if (pte1 == 0) 6834 continue; 6835 va = i << PTE1_SHIFT; 6836 if (pte1_is_section(pte1)) { 6837 printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, 6838 !!(pte1 & PTE1_S)); 6839 dump_section(pmap, i); 6840 } else if (pte1_is_link(pte1)) { 6841 pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); 6842 printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, 6843 pte1, pte2); 6844 if (pte2 == 0) 6845 printf(" !!! pt2tab entry is ZERO\n"); 6846 } else 6847 printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); 6848 } 6849 dump_pt2tab(pmap); 6850 } 6851 #endif 6852