1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2018 Matthew Macy 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include <sys/param.h> 32 #include <sys/kernel.h> 33 #include <sys/systm.h> 34 #include <sys/conf.h> 35 #include <sys/bitstring.h> 36 #include <sys/queue.h> 37 #include <sys/cpuset.h> 38 #include <sys/endian.h> 39 #include <sys/kerneldump.h> 40 #include <sys/ktr.h> 41 #include <sys/lock.h> 42 #include <sys/syslog.h> 43 #include <sys/msgbuf.h> 44 #include <sys/malloc.h> 45 #include <sys/mman.h> 46 #include <sys/mutex.h> 47 #include <sys/proc.h> 48 #include <sys/rwlock.h> 49 #include <sys/sched.h> 50 #include <sys/sysctl.h> 51 #include <sys/systm.h> 52 #include <sys/vmem.h> 53 #include <sys/vmmeter.h> 54 #include <sys/smp.h> 55 56 #include <sys/kdb.h> 57 58 #include <dev/ofw/openfirm.h> 59 60 #include <vm/vm.h> 61 #include <vm/pmap.h> 62 #include <vm/vm_param.h> 63 #include <vm/vm_kern.h> 64 #include <vm/vm_page.h> 65 #include <vm/vm_map.h> 66 #include <vm/vm_object.h> 67 #include <vm/vm_extern.h> 68 #include <vm/vm_pageout.h> 69 #include <vm/vm_phys.h> 70 #include <vm/vm_reserv.h> 71 #include <vm/uma.h> 72 73 #include <machine/_inttypes.h> 74 #include <machine/cpu.h> 75 #include <machine/platform.h> 76 #include <machine/frame.h> 77 #include <machine/md_var.h> 78 #include <machine/psl.h> 79 #include <machine/bat.h> 80 #include <machine/hid.h> 81 #include <machine/pte.h> 82 #include <machine/sr.h> 83 #include <machine/trap.h> 84 #include <machine/mmuvar.h> 85 86 #ifdef INVARIANTS 87 #include <vm/uma_dbg.h> 88 #endif 89 90 #define PPC_BITLSHIFT(bit) (sizeof(long)*NBBY - 1 - (bit)) 91 #define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) 92 #define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit)) 93 94 #include "opt_ddb.h" 95 #ifdef DDB 96 static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va); 97 #endif 98 99 #define PG_W RPTE_WIRED 100 #define PG_V RPTE_VALID 101 #define PG_MANAGED RPTE_MANAGED 102 #define PG_PROMOTED RPTE_PROMOTED 103 #define PG_M RPTE_C 104 #define PG_A RPTE_R 105 #define PG_X RPTE_EAA_X 106 #define PG_RW RPTE_EAA_W 107 #define PG_PTE_CACHE RPTE_ATTR_MASK 108 109 #define RPTE_SHIFT 9 110 #define NLS_MASK ((1UL<<5)-1) 111 #define RPTE_ENTRIES 
(1UL<<RPTE_SHIFT) 112 #define RPTE_MASK (RPTE_ENTRIES-1) 113 114 #define NLB_SHIFT 0 115 #define NLB_MASK (((1UL<<52)-1) << 8) 116 117 extern int nkpt; 118 extern caddr_t crashdumpmap; 119 120 #define RIC_FLUSH_TLB 0 121 #define RIC_FLUSH_PWC 1 122 #define RIC_FLUSH_ALL 2 123 124 #define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */ 125 126 #define PPC_INST_TLBIE 0x7c000264 127 #define PPC_INST_TLBIEL 0x7c000224 128 #define PPC_INST_SLBIA 0x7c0003e4 129 130 #define ___PPC_RA(a) (((a) & 0x1f) << 16) 131 #define ___PPC_RB(b) (((b) & 0x1f) << 11) 132 #define ___PPC_RS(s) (((s) & 0x1f) << 21) 133 #define ___PPC_RT(t) ___PPC_RS(t) 134 #define ___PPC_R(r) (((r) & 0x1) << 16) 135 #define ___PPC_PRS(prs) (((prs) & 0x1) << 17) 136 #define ___PPC_RIC(ric) (((ric) & 0x3) << 18) 137 138 #define PPC_SLBIA(IH) __XSTRING(.long PPC_INST_SLBIA | \ 139 ((IH & 0x7) << 21)) 140 #define PPC_TLBIE_5(rb,rs,ric,prs,r) \ 141 __XSTRING(.long PPC_INST_TLBIE | \ 142 ___PPC_RB(rb) | ___PPC_RS(rs) | \ 143 ___PPC_RIC(ric) | ___PPC_PRS(prs) | \ 144 ___PPC_R(r)) 145 146 #define PPC_TLBIEL(rb,rs,ric,prs,r) \ 147 __XSTRING(.long PPC_INST_TLBIEL | \ 148 ___PPC_RB(rb) | ___PPC_RS(rs) | \ 149 ___PPC_RIC(ric) | ___PPC_PRS(prs) | \ 150 ___PPC_R(r)) 151 152 #define PPC_INVALIDATE_ERAT PPC_SLBIA(7) 153 154 static __inline void 155 ttusync(void) 156 { 157 __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); 158 } 159 160 #define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */ 161 #define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */ 162 #define TLBIEL_INVAL_SET_PID 0x400 /* invalidate a set for the current PID */ 163 #define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */ 164 #define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */ 165 166 #define TLBIE_ACTUAL_PAGE_MASK 0xe0 167 #define TLBIE_ACTUAL_PAGE_4K 0x00 168 #define TLBIE_ACTUAL_PAGE_64K 0xa0 169 #define TLBIE_ACTUAL_PAGE_2M 0x20 170 #define TLBIE_ACTUAL_PAGE_1G 0x40 171 172 #define TLBIE_PRS_PARTITION_SCOPE 0x0 173 #define TLBIE_PRS_PROCESS_SCOPE 0x1 174 175 #define TLBIE_RIC_INVALIDATE_TLB 0x0 /* Invalidate just TLB */ 176 #define TLBIE_RIC_INVALIDATE_PWC 0x1 /* Invalidate just PWC */ 177 #define TLBIE_RIC_INVALIDATE_ALL 0x2 /* Invalidate TLB, PWC, 178 * cached {proc, part}tab entries 179 */ 180 #define TLBIE_RIC_INVALIDATE_SEQ 0x3 /* HPT - only: 181 * Invalidate a range of translations 182 */ 183 184 static __always_inline void 185 radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid, 186 vm_offset_t va, uint16_t ap) 187 { 188 uint64_t rb, rs; 189 190 MPASS((va & PAGE_MASK) == 0); 191 192 rs = ((uint64_t)pid << 32) | lpid; 193 rb = va | is | ap; 194 __asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : : 195 "r" (rb), "r" (rs), "i" (ric), "i" (prs)); 196 } 197 198 static __inline void 199 radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va) 200 { 201 202 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 203 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K); 204 } 205 206 static __inline void 207 radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va) 208 { 209 210 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 211 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M); 212 } 213 214 static __inline void 215 radix_tlbie_invlpwc_user(uint32_t pid) 216 { 217 218 radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE, 219 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0); 220 } 221 222 static __inline void 223 radix_tlbie_flush_user(uint32_t pid) 224 { 225 
226 radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE, 227 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0); 228 } 229 230 static __inline void 231 radix_tlbie_invlpg_kernel_4k(vm_offset_t va) 232 { 233 234 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 235 TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K); 236 } 237 238 static __inline void 239 radix_tlbie_invlpg_kernel_2m(vm_offset_t va) 240 { 241 242 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 243 TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M); 244 } 245 246 /* 1GB pages aren't currently supported. */ 247 static __inline __unused void 248 radix_tlbie_invlpg_kernel_1g(vm_offset_t va) 249 { 250 251 radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, 252 TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G); 253 } 254 255 static __inline void 256 radix_tlbie_invlpwc_kernel(void) 257 { 258 259 radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE, 260 TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0); 261 } 262 263 static __inline void 264 radix_tlbie_flush_kernel(void) 265 { 266 267 radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE, 268 TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0); 269 } 270 271 static __inline vm_pindex_t 272 pmap_l3e_pindex(vm_offset_t va) 273 { 274 return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT); 275 } 276 277 static __inline vm_pindex_t 278 pmap_pml3e_index(vm_offset_t va) 279 { 280 281 return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK); 282 } 283 284 static __inline vm_pindex_t 285 pmap_pml2e_index(vm_offset_t va) 286 { 287 return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK); 288 } 289 290 static __inline vm_pindex_t 291 pmap_pml1e_index(vm_offset_t va) 292 { 293 return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT); 294 } 295 296 /* Return various clipped indexes for a given VA */ 297 static __inline vm_pindex_t 298 pmap_pte_index(vm_offset_t va) 299 { 300 301 return ((va >> PAGE_SHIFT) & RPTE_MASK); 302 } 303 304 /* Return a pointer to the PT slot that corresponds to a VA */ 305 static __inline pt_entry_t * 306 pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va) 307 { 308 pt_entry_t *pte; 309 vm_paddr_t ptepa; 310 311 ptepa = (*l3e & NLB_MASK); 312 pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa); 313 return (&pte[pmap_pte_index(va)]); 314 } 315 316 /* Return a pointer to the PD slot that corresponds to a VA */ 317 static __inline pt_entry_t * 318 pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va) 319 { 320 pt_entry_t *l3e; 321 vm_paddr_t l3pa; 322 323 l3pa = (*l2e & NLB_MASK); 324 l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa); 325 return (&l3e[pmap_pml3e_index(va)]); 326 } 327 328 /* Return a pointer to the PD slot that corresponds to a VA */ 329 static __inline pt_entry_t * 330 pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va) 331 { 332 pt_entry_t *l2e; 333 vm_paddr_t l2pa; 334 335 l2pa = (*l1e & NLB_MASK); 336 337 l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa); 338 return (&l2e[pmap_pml2e_index(va)]); 339 } 340 341 static __inline pml1_entry_t * 342 pmap_pml1e(pmap_t pmap, vm_offset_t va) 343 { 344 345 return (&pmap->pm_pml1[pmap_pml1e_index(va)]); 346 } 347 348 static pt_entry_t * 349 pmap_pml2e(pmap_t pmap, vm_offset_t va) 350 { 351 pt_entry_t *l1e; 352 353 l1e = pmap_pml1e(pmap, va); 354 if (l1e == NULL || (*l1e & RPTE_VALID) == 0) 355 return (NULL); 356 return (pmap_l1e_to_l2e(l1e, va)); 357 } 358 359 static __inline pt_entry_t * 360 pmap_pml3e(pmap_t pmap, vm_offset_t va) 361 { 362 pt_entry_t *l2e; 363 364 l2e = pmap_pml2e(pmap, va); 365 if (l2e == NULL || (*l2e & RPTE_VALID) == 0) 366 return (NULL); 367 
return (pmap_l2e_to_l3e(l2e, va)); 368 } 369 370 static __inline pt_entry_t * 371 pmap_pte(pmap_t pmap, vm_offset_t va) 372 { 373 pt_entry_t *l3e; 374 375 l3e = pmap_pml3e(pmap, va); 376 if (l3e == NULL || (*l3e & RPTE_VALID) == 0) 377 return (NULL); 378 return (pmap_l3e_to_pte(l3e, va)); 379 } 380 381 int nkpt = 64; 382 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 383 "Number of kernel page table pages allocated on bootup"); 384 385 vm_paddr_t dmaplimit; 386 387 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 388 389 static int pg_ps_enabled = 1; 390 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 391 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 392 #ifdef INVARIANTS 393 #define VERBOSE_PMAP 0 394 #define VERBOSE_PROTECT 0 395 static int pmap_logging; 396 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN, 397 &pmap_logging, 0, "verbose debug logging"); 398 #endif 399 400 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 401 402 //static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ 403 404 static vm_offset_t qframe = 0; 405 static struct mtx qframe_mtx; 406 407 void mmu_radix_activate(struct thread *); 408 void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int); 409 void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *, 410 vm_size_t); 411 void mmu_radix_clear_modify(vm_page_t); 412 void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); 413 int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *); 414 int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); 415 void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, 416 vm_prot_t); 417 void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); 418 vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va); 419 vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); 420 void mmu_radix_kenter(vm_offset_t, vm_paddr_t); 421 vm_paddr_t mmu_radix_kextract(vm_offset_t); 422 void mmu_radix_kremove(vm_offset_t); 423 boolean_t mmu_radix_is_modified(vm_page_t); 424 boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t); 425 boolean_t mmu_radix_is_referenced(vm_page_t); 426 void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t, 427 vm_pindex_t, vm_size_t); 428 boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t); 429 void mmu_radix_page_init(vm_page_t); 430 boolean_t mmu_radix_page_is_mapped(vm_page_t m); 431 void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t); 432 int mmu_radix_page_wired_mappings(vm_page_t); 433 int mmu_radix_pinit(pmap_t); 434 void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); 435 bool mmu_radix_ps_enabled(pmap_t); 436 void mmu_radix_qenter(vm_offset_t, vm_page_t *, int); 437 void mmu_radix_qremove(vm_offset_t, int); 438 vm_offset_t mmu_radix_quick_enter_page(vm_page_t); 439 void mmu_radix_quick_remove_page(vm_offset_t); 440 boolean_t mmu_radix_ts_referenced(vm_page_t); 441 void mmu_radix_release(pmap_t); 442 void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t); 443 void mmu_radix_remove_all(vm_page_t); 444 void mmu_radix_remove_pages(pmap_t); 445 void mmu_radix_remove_write(vm_page_t); 446 void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t); 447 void mmu_radix_zero_page(vm_page_t); 448 void mmu_radix_zero_page_area(vm_page_t, int, int); 449 int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t); 450 void mmu_radix_page_array_startup(long pages); 451 
452 #include "mmu_oea64.h" 453 454 /* 455 * Kernel MMU interface 456 */ 457 458 static void mmu_radix_bootstrap(vm_offset_t, vm_offset_t); 459 460 static void mmu_radix_copy_page(vm_page_t, vm_page_t); 461 static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset, 462 vm_page_t *mb, vm_offset_t b_offset, int xfersize); 463 static void mmu_radix_growkernel(vm_offset_t); 464 static void mmu_radix_init(void); 465 static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *); 466 static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); 467 static void mmu_radix_pinit0(pmap_t); 468 469 static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t); 470 static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); 471 static void mmu_radix_unmapdev(vm_offset_t, vm_size_t); 472 static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma); 473 static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t); 474 static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va); 475 static void mmu_radix_scan_init(void); 476 static void mmu_radix_cpu_bootstrap(int ap); 477 static void mmu_radix_tlbie_all(void); 478 479 static struct pmap_funcs mmu_radix_methods = { 480 .bootstrap = mmu_radix_bootstrap, 481 .copy_page = mmu_radix_copy_page, 482 .copy_pages = mmu_radix_copy_pages, 483 .cpu_bootstrap = mmu_radix_cpu_bootstrap, 484 .growkernel = mmu_radix_growkernel, 485 .init = mmu_radix_init, 486 .map = mmu_radix_map, 487 .mincore = mmu_radix_mincore, 488 .pinit = mmu_radix_pinit, 489 .pinit0 = mmu_radix_pinit0, 490 491 .mapdev = mmu_radix_mapdev, 492 .mapdev_attr = mmu_radix_mapdev_attr, 493 .unmapdev = mmu_radix_unmapdev, 494 .kenter_attr = mmu_radix_kenter_attr, 495 .dev_direct_mapped = mmu_radix_dev_direct_mapped, 496 .dumpsys_pa_init = mmu_radix_scan_init, 497 .dumpsys_map_chunk = mmu_radix_dumpsys_map, 498 .page_is_mapped = mmu_radix_page_is_mapped, 499 .ps_enabled = mmu_radix_ps_enabled, 500 .object_init_pt = mmu_radix_object_init_pt, 501 .protect = mmu_radix_protect, 502 /* pmap dispatcher interface */ 503 .clear_modify = mmu_radix_clear_modify, 504 .copy = mmu_radix_copy, 505 .enter = mmu_radix_enter, 506 .enter_object = mmu_radix_enter_object, 507 .enter_quick = mmu_radix_enter_quick, 508 .extract = mmu_radix_extract, 509 .extract_and_hold = mmu_radix_extract_and_hold, 510 .is_modified = mmu_radix_is_modified, 511 .is_prefaultable = mmu_radix_is_prefaultable, 512 .is_referenced = mmu_radix_is_referenced, 513 .ts_referenced = mmu_radix_ts_referenced, 514 .page_exists_quick = mmu_radix_page_exists_quick, 515 .page_init = mmu_radix_page_init, 516 .page_wired_mappings = mmu_radix_page_wired_mappings, 517 .qenter = mmu_radix_qenter, 518 .qremove = mmu_radix_qremove, 519 .release = mmu_radix_release, 520 .remove = mmu_radix_remove, 521 .remove_all = mmu_radix_remove_all, 522 .remove_write = mmu_radix_remove_write, 523 .unwire = mmu_radix_unwire, 524 .zero_page = mmu_radix_zero_page, 525 .zero_page_area = mmu_radix_zero_page_area, 526 .activate = mmu_radix_activate, 527 .quick_enter_page = mmu_radix_quick_enter_page, 528 .quick_remove_page = mmu_radix_quick_remove_page, 529 .page_set_memattr = mmu_radix_page_set_memattr, 530 .page_array_startup = mmu_radix_page_array_startup, 531 532 /* Internal interfaces */ 533 .kenter = mmu_radix_kenter, 534 .kextract = mmu_radix_kextract, 535 .kremove = mmu_radix_kremove, 536 .change_attr = mmu_radix_change_attr, 537 .decode_kernel_ptr = mmu_radix_decode_kernel_ptr, 538 539 .tlbie_all = 
mmu_radix_tlbie_all, 540 }; 541 542 MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods); 543 544 static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, 545 struct rwlock **lockp); 546 static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va); 547 static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *); 548 static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, 549 struct spglist *free, struct rwlock **lockp); 550 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 551 pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 552 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 553 static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde, 554 struct spglist *free); 555 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 556 pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp); 557 558 static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e, 559 u_int flags, struct rwlock **lockp); 560 #if VM_NRESERVLEVEL > 0 561 static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 562 struct rwlock **lockp); 563 #endif 564 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 565 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 566 static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 567 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate); 568 569 static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 570 vm_prot_t prot, struct rwlock **lockp); 571 static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, 572 u_int flags, vm_page_t m, struct rwlock **lockp); 573 574 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 575 static void free_pv_chunk(struct pv_chunk *pc); 576 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); 577 static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va, 578 struct rwlock **lockp); 579 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 580 struct rwlock **lockp); 581 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 582 struct spglist *free); 583 static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); 584 585 static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start); 586 static void pmap_invalidate_all(pmap_t pmap); 587 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush); 588 589 /* 590 * Internal flags for pmap_enter()'s helper functions. 591 */ 592 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 593 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 594 595 #define UNIMPLEMENTED() panic("%s not implemented", __func__) 596 #define UNTESTED() panic("%s not yet tested", __func__) 597 598 /* Number of supported PID bits */ 599 static unsigned int isa3_pid_bits; 600 601 /* PID to start allocating from */ 602 static unsigned int isa3_base_pid; 603 604 #define PROCTAB_SIZE_SHIFT (isa3_pid_bits + 4) 605 #define PROCTAB_ENTRIES (1ul << isa3_pid_bits) 606 607 /* 608 * Map of physical memory regions. 
609 */ 610 static struct mem_region *regions, *pregions; 611 static struct numa_mem_region *numa_pregions; 612 static u_int phys_avail_count; 613 static int regions_sz, pregions_sz, numa_pregions_sz; 614 static struct pate *isa3_parttab; 615 static struct prte *isa3_proctab; 616 static vmem_t *asid_arena; 617 618 extern void bs_remap_earlyboot(void); 619 620 #define RADIX_PGD_SIZE_SHIFT 16 621 #define RADIX_PGD_SIZE (1UL << RADIX_PGD_SIZE_SHIFT) 622 623 #define RADIX_PGD_INDEX_SHIFT (RADIX_PGD_SIZE_SHIFT-3) 624 #define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t)) 625 #define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t)) 626 627 #define NUPML1E (RADIX_PGD_SIZE/sizeof(uint64_t)) /* number of userland PML1 pages */ 628 #define NUPDPE (NUPML1E * NL2EPG)/* number of userland PDP pages */ 629 #define NUPDE (NUPDPE * NL3EPG) /* number of userland PD entries */ 630 631 /* POWER9 only permits a 64k partition table size. */ 632 #define PARTTAB_SIZE_SHIFT 16 633 #define PARTTAB_SIZE (1UL << PARTTAB_SIZE_SHIFT) 634 635 #define PARTTAB_HR (1UL << 63) /* host uses radix */ 636 #define PARTTAB_GR (1UL << 63) /* guest uses radix must match host */ 637 638 /* TLB flush actions. Used as argument to tlbiel_all() */ 639 enum { 640 TLB_INVAL_SCOPE_LPID = 0, /* invalidate TLBs for current LPID */ 641 TLB_INVAL_SCOPE_GLOBAL = 1, /* invalidate all TLBs */ 642 }; 643 644 #define NPV_LIST_LOCKS MAXCPU 645 static int pmap_initialized; 646 static vm_paddr_t proctab0pa; 647 static vm_paddr_t parttab_phys; 648 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 649 650 /* 651 * Data for the pv entry allocation mechanism. 652 * Updates to pv_invl_gen are protected by the pv_list_locks[] 653 * elements, but reads are not. 654 */ 655 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 656 static struct mtx __exclusive_cache_line pv_chunks_mutex; 657 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 658 static struct md_page *pv_table; 659 static struct md_page pv_dummy; 660 661 #ifdef PV_STATS 662 #define PV_STAT(x) do { x ; } while (0) 663 #else 664 #define PV_STAT(x) do { } while (0) 665 #endif 666 667 #define pa_radix_index(pa) ((pa) >> L3_PAGE_SIZE_SHIFT) 668 #define pa_to_pvh(pa) (&pv_table[pa_radix_index(pa)]) 669 670 #define PHYS_TO_PV_LIST_LOCK(pa) \ 671 (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS]) 672 673 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 674 struct rwlock **_lockp = (lockp); \ 675 struct rwlock *_new_lock; \ 676 \ 677 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 678 if (_new_lock != *_lockp) { \ 679 if (*_lockp != NULL) \ 680 rw_wunlock(*_lockp); \ 681 *_lockp = _new_lock; \ 682 rw_wlock(*_lockp); \ 683 } \ 684 } while (0) 685 686 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 687 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 688 689 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 690 struct rwlock **_lockp = (lockp); \ 691 \ 692 if (*_lockp != NULL) { \ 693 rw_wunlock(*_lockp); \ 694 *_lockp = NULL; \ 695 } \ 696 } while (0) 697 698 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 699 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 700 701 /* 702 * We support 52 bits, hence: 703 * bits 52 - 31 = 21, 0b10101 704 * RTS encoding details 705 * bits 0 - 3 of rts -> bits 6 - 8 unsigned long 706 * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long 707 */ 708 #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5)) 709 710 static int powernv_enabled = 1; 711 712 static __always_inline void 713 tlbiel_radix_set_isa300(uint32_t set, uint32_t is, 714 uint32_t pid, uint32_t ric, 
uint32_t prs) 715 { 716 uint64_t rb; 717 uint64_t rs; 718 719 rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53); 720 rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31); 721 722 __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) 723 : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) 724 : "memory"); 725 } 726 727 static void 728 tlbiel_flush_isa3(uint32_t num_sets, uint32_t is) 729 { 730 uint32_t set; 731 732 __asm __volatile("ptesync": : :"memory"); 733 734 /* 735 * Flush the first set of the TLB, and the entire Page Walk Cache 736 * and partition table entries. Then flush the remaining sets of the 737 * TLB. 738 */ 739 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); 740 for (set = 1; set < num_sets; set++) 741 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); 742 743 /* Do the same for process scoped entries. */ 744 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); 745 for (set = 1; set < num_sets; set++) 746 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); 747 748 __asm __volatile("ptesync": : :"memory"); 749 } 750 751 static void 752 mmu_radix_tlbiel_flush(int scope) 753 { 754 int is; 755 756 MPASS(scope == TLB_INVAL_SCOPE_LPID || 757 scope == TLB_INVAL_SCOPE_GLOBAL); 758 is = scope + 2; 759 760 tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, is); 761 __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); 762 } 763 764 static void 765 mmu_radix_tlbie_all() 766 { 767 /* TODO: LPID invalidate */ 768 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 769 } 770 771 static void 772 mmu_radix_init_amor(void) 773 { 774 /* 775 * In HV mode, we init AMOR (Authority Mask Override Register) so that 776 * the hypervisor and guest can setup IAMR (Instruction Authority Mask 777 * Register), enable key 0 and set it to 1. 778 * 779 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) 780 */ 781 mtspr(SPR_AMOR, (3ul << 62)); 782 } 783 784 static void 785 mmu_radix_init_iamr(void) 786 { 787 /* 788 * Radix always uses key0 of the IAMR to determine if an access is 789 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction 790 * fetch. 791 */ 792 mtspr(SPR_IAMR, (1ul << 62)); 793 } 794 795 static void 796 mmu_radix_pid_set(pmap_t pmap) 797 { 798 799 mtspr(SPR_PID, pmap->pm_pid); 800 isync(); 801 } 802 803 /* Quick sort callout for comparing physical addresses. 
*/ 804 static int 805 pa_cmp(const void *a, const void *b) 806 { 807 const vm_paddr_t *pa = a, *pb = b; 808 809 if (*pa < *pb) 810 return (-1); 811 else if (*pa > *pb) 812 return (1); 813 else 814 return (0); 815 } 816 817 #define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) 818 #define pte_load_clear(ptep) atomic_swap_long(ptep, 0) 819 #define pte_store(ptep, pte) do { \ 820 MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X)); \ 821 *(u_long *)(ptep) = (u_long)((pte) | PG_V | RPTE_LEAF); \ 822 } while (0) 823 /* 824 * NB: should only be used for adding directories - not for direct mappings 825 */ 826 #define pde_store(ptep, pa) do { \ 827 *(u_long *)(ptep) = (u_long)(pa|RPTE_VALID|RPTE_SHIFT); \ 828 } while (0) 829 830 #define pte_clear(ptep) do { \ 831 *(u_long *)(ptep) = (u_long)(0); \ 832 } while (0) 833 834 #define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ 835 836 /* 837 * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB 838 * (PTE) page mappings have identical settings for the following fields: 839 */ 840 #define PG_PTE_PROMOTE (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \ 841 PG_M | PG_A | RPTE_EAA_MASK | PG_V) 842 843 static __inline void 844 pmap_resident_count_inc(pmap_t pmap, int count) 845 { 846 847 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 848 pmap->pm_stats.resident_count += count; 849 } 850 851 static __inline void 852 pmap_resident_count_dec(pmap_t pmap, int count) 853 { 854 855 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 856 KASSERT(pmap->pm_stats.resident_count >= count, 857 ("pmap %p resident count underflow %ld %d", pmap, 858 pmap->pm_stats.resident_count, count)); 859 pmap->pm_stats.resident_count -= count; 860 } 861 862 static void 863 pagezero(vm_offset_t va) 864 { 865 va = trunc_page(va); 866 867 bzero((void *)va, PAGE_SIZE); 868 } 869 870 static uint64_t 871 allocpages(int n) 872 { 873 u_int64_t ret; 874 875 ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE); 876 for (int i = 0; i < n; i++) 877 pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE)); 878 return (ret); 879 } 880 881 static pt_entry_t * 882 kvtopte(vm_offset_t va) 883 { 884 pt_entry_t *l3e; 885 886 l3e = pmap_pml3e(kernel_pmap, va); 887 if ((*l3e & RPTE_VALID) == 0) 888 return (NULL); 889 return (pmap_l3e_to_pte(l3e, va)); 890 } 891 892 void 893 mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa) 894 { 895 pt_entry_t *pte; 896 897 pte = kvtopte(va); 898 MPASS(pte != NULL); 899 *pte = pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | RPTE_EAA_W | \ 900 RPTE_EAA_P | PG_M | PG_A; 901 } 902 903 bool 904 mmu_radix_ps_enabled(pmap_t pmap) 905 { 906 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 907 } 908 909 static pt_entry_t * 910 pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e) 911 { 912 pml3_entry_t *l3e; 913 pt_entry_t *pte; 914 915 va &= PG_PS_FRAME; 916 l3e = pmap_pml3e(pmap, va); 917 if (l3e == NULL || (*l3e & PG_V) == 0) 918 return (NULL); 919 920 if (*l3e & RPTE_LEAF) { 921 *is_l3e = 1; 922 return (l3e); 923 } 924 *is_l3e = 0; 925 va &= PG_FRAME; 926 pte = pmap_l3e_to_pte(l3e, va); 927 if (pte == NULL || (*pte & PG_V) == 0) 928 return (NULL); 929 return (pte); 930 } 931 932 int 933 pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags) 934 { 935 pt_entry_t *pte; 936 pt_entry_t startpte, origpte, newpte; 937 vm_page_t m; 938 int is_l3e; 939 940 startpte = 0; 941 retry: 942 if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL) 943 return (KERN_INVALID_ADDRESS); 944 origpte = newpte = *pte; 945 if (startpte == 0) { 946 startpte = origpte; 947 if 
(((flags & VM_PROT_WRITE) && (startpte & PG_M)) || 948 ((flags & VM_PROT_READ) && (startpte & PG_A))) { 949 pmap_invalidate_all(pmap); 950 #ifdef INVARIANTS 951 if (VERBOSE_PMAP || pmap_logging) 952 printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n", 953 __func__, pmap, va, flags, origpte); 954 #endif 955 return (KERN_FAILURE); 956 } 957 } 958 #ifdef INVARIANTS 959 if (VERBOSE_PMAP || pmap_logging) 960 printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va, 961 flags, origpte); 962 #endif 963 PMAP_LOCK(pmap); 964 if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL || 965 *pte != origpte) { 966 PMAP_UNLOCK(pmap); 967 return (KERN_FAILURE); 968 } 969 m = PHYS_TO_VM_PAGE(newpte & PG_FRAME); 970 MPASS(m != NULL); 971 switch (flags) { 972 case VM_PROT_READ: 973 if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0) 974 goto protfail; 975 newpte |= PG_A; 976 vm_page_aflag_set(m, PGA_REFERENCED); 977 break; 978 case VM_PROT_WRITE: 979 if ((newpte & RPTE_EAA_W) == 0) 980 goto protfail; 981 if (is_l3e) 982 goto protfail; 983 newpte |= PG_M; 984 vm_page_dirty(m); 985 break; 986 case VM_PROT_EXECUTE: 987 if ((newpte & RPTE_EAA_X) == 0) 988 goto protfail; 989 newpte |= PG_A; 990 vm_page_aflag_set(m, PGA_REFERENCED); 991 break; 992 } 993 994 if (!atomic_cmpset_long(pte, origpte, newpte)) 995 goto retry; 996 ptesync(); 997 PMAP_UNLOCK(pmap); 998 if (startpte == newpte) 999 return (KERN_FAILURE); 1000 return (0); 1001 protfail: 1002 PMAP_UNLOCK(pmap); 1003 return (KERN_PROTECTION_FAILURE); 1004 } 1005 1006 /* 1007 * Returns TRUE if the given page is mapped individually or as part of 1008 * a 2mpage. Otherwise, returns FALSE. 1009 */ 1010 boolean_t 1011 mmu_radix_page_is_mapped(vm_page_t m) 1012 { 1013 struct rwlock *lock; 1014 boolean_t rv; 1015 1016 if ((m->oflags & VPO_UNMANAGED) != 0) 1017 return (FALSE); 1018 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 1019 rw_rlock(lock); 1020 rv = !TAILQ_EMPTY(&m->md.pv_list) || 1021 ((m->flags & PG_FICTITIOUS) == 0 && 1022 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 1023 rw_runlock(lock); 1024 return (rv); 1025 } 1026 1027 /* 1028 * Determine the appropriate bits to set in a PTE or PDE for a specified 1029 * caching mode. 
1030 */ 1031 static int 1032 pmap_cache_bits(vm_memattr_t ma) 1033 { 1034 if (ma != VM_MEMATTR_DEFAULT) { 1035 switch (ma) { 1036 case VM_MEMATTR_UNCACHEABLE: 1037 return (RPTE_ATTR_GUARDEDIO); 1038 case VM_MEMATTR_CACHEABLE: 1039 return (RPTE_ATTR_MEM); 1040 case VM_MEMATTR_WRITE_BACK: 1041 case VM_MEMATTR_PREFETCHABLE: 1042 case VM_MEMATTR_WRITE_COMBINING: 1043 return (RPTE_ATTR_UNGUARDEDIO); 1044 } 1045 } 1046 return (0); 1047 } 1048 1049 static void 1050 pmap_invalidate_page(pmap_t pmap, vm_offset_t start) 1051 { 1052 ptesync(); 1053 if (pmap == kernel_pmap) 1054 radix_tlbie_invlpg_kernel_4k(start); 1055 else 1056 radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); 1057 ttusync(); 1058 } 1059 1060 static void 1061 pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start) 1062 { 1063 ptesync(); 1064 if (pmap == kernel_pmap) 1065 radix_tlbie_invlpg_kernel_2m(start); 1066 else 1067 radix_tlbie_invlpg_user_2m(pmap->pm_pid, start); 1068 ttusync(); 1069 } 1070 1071 static void 1072 pmap_invalidate_pwc(pmap_t pmap) 1073 { 1074 ptesync(); 1075 if (pmap == kernel_pmap) 1076 radix_tlbie_invlpwc_kernel(); 1077 else 1078 radix_tlbie_invlpwc_user(pmap->pm_pid); 1079 ttusync(); 1080 } 1081 1082 static void 1083 pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end) 1084 { 1085 if (((start - end) >> PAGE_SHIFT) > 8) { 1086 pmap_invalidate_all(pmap); 1087 return; 1088 } 1089 ptesync(); 1090 if (pmap == kernel_pmap) { 1091 while (start < end) { 1092 radix_tlbie_invlpg_kernel_4k(start); 1093 start += PAGE_SIZE; 1094 } 1095 } else { 1096 while (start < end) { 1097 radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); 1098 start += PAGE_SIZE; 1099 } 1100 } 1101 ttusync(); 1102 } 1103 1104 static void 1105 pmap_invalidate_all(pmap_t pmap) 1106 { 1107 ptesync(); 1108 if (pmap == kernel_pmap) 1109 radix_tlbie_flush_kernel(); 1110 else 1111 radix_tlbie_flush_user(pmap->pm_pid); 1112 ttusync(); 1113 } 1114 1115 static void 1116 pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e) 1117 { 1118 1119 /* 1120 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 1121 * by a promotion that did not invalidate the 512 4KB page mappings 1122 * that might exist in the TLB. Consequently, at this point, the TLB 1123 * may hold both 4KB and 2MB page mappings for the address range [va, 1124 * va + L3_PAGE_SIZE). Therefore, the entire range must be invalidated here. 1125 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 1126 * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a 1127 * single INVLPG suffices to invalidate the 2MB page mapping from the 1128 * TLB. 1129 */ 1130 ptesync(); 1131 if ((l3e & PG_PROMOTED) != 0) 1132 pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1); 1133 else 1134 pmap_invalidate_page_2m(pmap, va); 1135 1136 pmap_invalidate_pwc(pmap); 1137 } 1138 1139 static __inline struct pv_chunk * 1140 pv_to_chunk(pv_entry_t pv) 1141 { 1142 1143 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 1144 } 1145 1146 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1147 1148 #define PC_FREE0 0xfffffffffffffffful 1149 #define PC_FREE1 0x3ffffffffffffffful 1150 1151 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 }; 1152 1153 /* 1154 * Ensure that the number of spare PV entries in the specified pmap meets or 1155 * exceeds the given count, "needed". 1156 * 1157 * The given PV list lock may be released. 
1158 */ 1159 static void 1160 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 1161 { 1162 struct pch new_tail; 1163 struct pv_chunk *pc; 1164 vm_page_t m; 1165 int avail, free; 1166 bool reclaimed; 1167 1168 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1169 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 1170 1171 /* 1172 * Newly allocated PV chunks must be stored in a private list until 1173 * the required number of PV chunks have been allocated. Otherwise, 1174 * reclaim_pv_chunk() could recycle one of these chunks. In 1175 * contrast, these chunks must be added to the pmap upon allocation. 1176 */ 1177 TAILQ_INIT(&new_tail); 1178 retry: 1179 avail = 0; 1180 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1181 // if ((cpu_feature2 & CPUID2_POPCNT) == 0) 1182 bit_count((bitstr_t *)pc->pc_map, 0, 1183 sizeof(pc->pc_map) * NBBY, &free); 1184 #if 0 1185 free = popcnt_pc_map_pq(pc->pc_map); 1186 #endif 1187 if (free == 0) 1188 break; 1189 avail += free; 1190 if (avail >= needed) 1191 break; 1192 } 1193 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1194 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1195 VM_ALLOC_WIRED); 1196 if (m == NULL) { 1197 m = reclaim_pv_chunk(pmap, lockp); 1198 if (m == NULL) 1199 goto retry; 1200 reclaimed = true; 1201 } 1202 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1203 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1204 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1205 pc->pc_pmap = pmap; 1206 pc->pc_map[0] = PC_FREE0; 1207 pc->pc_map[1] = PC_FREE1; 1208 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1209 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1210 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 1211 1212 /* 1213 * The reclaim might have freed a chunk from the current pmap. 1214 * If that chunk contained available entries, we need to 1215 * re-count the number of available entries. 1216 */ 1217 if (reclaimed) 1218 goto retry; 1219 } 1220 if (!TAILQ_EMPTY(&new_tail)) { 1221 mtx_lock(&pv_chunks_mutex); 1222 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1223 mtx_unlock(&pv_chunks_mutex); 1224 } 1225 } 1226 1227 /* 1228 * First find and then remove the pv entry for the specified pmap and virtual 1229 * address from the specified pv list. Returns the pv entry if found and NULL 1230 * otherwise. This operation can be performed on pv lists for either 4KB or 1231 * 2MB page mappings. 1232 */ 1233 static __inline pv_entry_t 1234 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1235 { 1236 pv_entry_t pv; 1237 1238 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 1239 #ifdef INVARIANTS 1240 if (PV_PMAP(pv) == NULL) { 1241 printf("corrupted pv_chunk/pv %p\n", pv); 1242 printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":"); 1243 } 1244 MPASS(PV_PMAP(pv) != NULL); 1245 MPASS(pv->pv_va != 0); 1246 #endif 1247 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1248 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 1249 pvh->pv_gen++; 1250 break; 1251 } 1252 } 1253 return (pv); 1254 } 1255 1256 /* 1257 * After demotion from a 2MB page mapping to 512 4KB page mappings, 1258 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 1259 * entries for each of the 4KB page mappings. 
1260 */ 1261 static void 1262 pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1263 struct rwlock **lockp) 1264 { 1265 struct md_page *pvh; 1266 struct pv_chunk *pc; 1267 pv_entry_t pv; 1268 vm_offset_t va_last; 1269 vm_page_t m; 1270 int bit, field; 1271 1272 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1273 KASSERT((pa & L3_PAGE_MASK) == 0, 1274 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 1275 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1276 1277 /* 1278 * Transfer the 2mpage's pv entry for this mapping to the first 1279 * page's pv list. Once this transfer begins, the pv list lock 1280 * must not be released until the last pv entry is reinstantiated. 1281 */ 1282 pvh = pa_to_pvh(pa); 1283 va = trunc_2mpage(va); 1284 pv = pmap_pvh_remove(pvh, pmap, va); 1285 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 1286 m = PHYS_TO_VM_PAGE(pa); 1287 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 1288 1289 m->md.pv_gen++; 1290 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 1291 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 1292 va_last = va + L3_PAGE_SIZE - PAGE_SIZE; 1293 for (;;) { 1294 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1295 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 1296 , ("pmap_pv_demote_pde: missing spare")); 1297 for (field = 0; field < _NPCM; field++) { 1298 while (pc->pc_map[field]) { 1299 bit = cnttzd(pc->pc_map[field]); 1300 pc->pc_map[field] &= ~(1ul << bit); 1301 pv = &pc->pc_pventry[field * 64 + bit]; 1302 va += PAGE_SIZE; 1303 pv->pv_va = va; 1304 m++; 1305 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1306 ("pmap_pv_demote_pde: page %p is not managed", m)); 1307 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 1308 1309 m->md.pv_gen++; 1310 if (va == va_last) 1311 goto out; 1312 } 1313 } 1314 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1315 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1316 } 1317 out: 1318 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { 1319 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1320 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1321 } 1322 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 1323 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 1324 } 1325 1326 static void 1327 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap) 1328 { 1329 1330 if (pmap == NULL) 1331 return; 1332 pmap_invalidate_all(pmap); 1333 if (pmap != locked_pmap) 1334 PMAP_UNLOCK(pmap); 1335 } 1336 1337 /* 1338 * We are in a serious low memory condition. Resort to 1339 * drastic measures to free some pages so we can allocate 1340 * another pv entry chunk. 1341 * 1342 * Returns NULL if PV entries were reclaimed from the specified pmap. 1343 * 1344 * We do not, however, unmap 2mpages because subsequent accesses will 1345 * allocate per-page pv entries until repromotion occurs, thereby 1346 * exacerbating the shortage of free pv entries. 
1347 */ 1348 static int active_reclaims = 0; 1349 static vm_page_t 1350 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 1351 { 1352 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 1353 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 1354 struct md_page *pvh; 1355 pml3_entry_t *l3e; 1356 pmap_t next_pmap, pmap; 1357 pt_entry_t *pte, tpte; 1358 pv_entry_t pv; 1359 vm_offset_t va; 1360 vm_page_t m, m_pc; 1361 struct spglist free; 1362 uint64_t inuse; 1363 int bit, field, freed; 1364 1365 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 1366 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 1367 pmap = NULL; 1368 m_pc = NULL; 1369 SLIST_INIT(&free); 1370 bzero(&pc_marker_b, sizeof(pc_marker_b)); 1371 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 1372 pc_marker = (struct pv_chunk *)&pc_marker_b; 1373 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 1374 1375 mtx_lock(&pv_chunks_mutex); 1376 active_reclaims++; 1377 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); 1378 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); 1379 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 1380 SLIST_EMPTY(&free)) { 1381 next_pmap = pc->pc_pmap; 1382 if (next_pmap == NULL) { 1383 /* 1384 * The next chunk is a marker. However, it is 1385 * not our marker, so active_reclaims must be 1386 * > 1. Consequently, the next_chunk code 1387 * will not rotate the pv_chunks list. 1388 */ 1389 goto next_chunk; 1390 } 1391 mtx_unlock(&pv_chunks_mutex); 1392 1393 /* 1394 * A pv_chunk can only be removed from the pc_lru list 1395 * when both pc_chunks_mutex is owned and the 1396 * corresponding pmap is locked. 1397 */ 1398 if (pmap != next_pmap) { 1399 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); 1400 pmap = next_pmap; 1401 /* Avoid deadlock and lock recursion. */ 1402 if (pmap > locked_pmap) { 1403 RELEASE_PV_LIST_LOCK(lockp); 1404 PMAP_LOCK(pmap); 1405 mtx_lock(&pv_chunks_mutex); 1406 continue; 1407 } else if (pmap != locked_pmap) { 1408 if (PMAP_TRYLOCK(pmap)) { 1409 mtx_lock(&pv_chunks_mutex); 1410 continue; 1411 } else { 1412 pmap = NULL; /* pmap is not locked */ 1413 mtx_lock(&pv_chunks_mutex); 1414 pc = TAILQ_NEXT(pc_marker, pc_lru); 1415 if (pc == NULL || 1416 pc->pc_pmap != next_pmap) 1417 continue; 1418 goto next_chunk; 1419 } 1420 } 1421 } 1422 1423 /* 1424 * Destroy every non-wired, 4 KB page mapping in the chunk. 
1425 */ 1426 freed = 0; 1427 for (field = 0; field < _NPCM; field++) { 1428 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 1429 inuse != 0; inuse &= ~(1UL << bit)) { 1430 bit = cnttzd(inuse); 1431 pv = &pc->pc_pventry[field * 64 + bit]; 1432 va = pv->pv_va; 1433 l3e = pmap_pml3e(pmap, va); 1434 if ((*l3e & RPTE_LEAF) != 0) 1435 continue; 1436 pte = pmap_l3e_to_pte(l3e, va); 1437 if ((*pte & PG_W) != 0) 1438 continue; 1439 tpte = pte_load_clear(pte); 1440 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 1441 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 1442 vm_page_dirty(m); 1443 if ((tpte & PG_A) != 0) 1444 vm_page_aflag_set(m, PGA_REFERENCED); 1445 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 1446 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 1447 1448 m->md.pv_gen++; 1449 if (TAILQ_EMPTY(&m->md.pv_list) && 1450 (m->flags & PG_FICTITIOUS) == 0) { 1451 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 1452 if (TAILQ_EMPTY(&pvh->pv_list)) { 1453 vm_page_aflag_clear(m, 1454 PGA_WRITEABLE); 1455 } 1456 } 1457 pc->pc_map[field] |= 1UL << bit; 1458 pmap_unuse_pt(pmap, va, *l3e, &free); 1459 freed++; 1460 } 1461 } 1462 if (freed == 0) { 1463 mtx_lock(&pv_chunks_mutex); 1464 goto next_chunk; 1465 } 1466 /* Every freed mapping is for a 4 KB page. */ 1467 pmap_resident_count_dec(pmap, freed); 1468 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 1469 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 1470 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 1471 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1472 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) { 1473 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1474 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1475 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1476 /* Entire chunk is free; return it. */ 1477 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1478 mtx_lock(&pv_chunks_mutex); 1479 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1480 break; 1481 } 1482 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1483 mtx_lock(&pv_chunks_mutex); 1484 /* One freed pv entry in locked_pmap is sufficient. */ 1485 if (pmap == locked_pmap) 1486 break; 1487 next_chunk: 1488 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 1489 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); 1490 if (active_reclaims == 1 && pmap != NULL) { 1491 /* 1492 * Rotate the pv chunks list so that we do not 1493 * scan the same pv chunks that could not be 1494 * freed (because they contained a wired 1495 * and/or superpage mapping) on every 1496 * invocation of reclaim_pv_chunk(). 1497 */ 1498 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { 1499 MPASS(pc->pc_pmap != NULL); 1500 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1501 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1502 } 1503 } 1504 } 1505 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); 1506 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); 1507 active_reclaims--; 1508 mtx_unlock(&pv_chunks_mutex); 1509 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); 1510 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 1511 m_pc = SLIST_FIRST(&free); 1512 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 1513 /* Recycle a freed page table page. 
*/ 1514 m_pc->ref_count = 1; 1515 } 1516 vm_page_free_pages_toq(&free, true); 1517 return (m_pc); 1518 } 1519 1520 /* 1521 * free the pv_entry back to the free list 1522 */ 1523 static void 1524 free_pv_entry(pmap_t pmap, pv_entry_t pv) 1525 { 1526 struct pv_chunk *pc; 1527 int idx, field, bit; 1528 1529 #ifdef VERBOSE_PV 1530 if (pmap != kernel_pmap) 1531 printf("%s(%p, %p)\n", __func__, pmap, pv); 1532 #endif 1533 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1534 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 1535 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 1536 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 1537 pc = pv_to_chunk(pv); 1538 idx = pv - &pc->pc_pventry[0]; 1539 field = idx / 64; 1540 bit = idx % 64; 1541 pc->pc_map[field] |= 1ul << bit; 1542 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) { 1543 /* 98% of the time, pc is already at the head of the list. */ 1544 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 1545 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1546 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1547 } 1548 return; 1549 } 1550 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1551 free_pv_chunk(pc); 1552 } 1553 1554 static void 1555 free_pv_chunk(struct pv_chunk *pc) 1556 { 1557 vm_page_t m; 1558 1559 mtx_lock(&pv_chunks_mutex); 1560 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1561 mtx_unlock(&pv_chunks_mutex); 1562 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1563 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1564 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1565 /* entire chunk is free, return it */ 1566 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1567 vm_page_unwire_noq(m); 1568 vm_page_free(m); 1569 } 1570 1571 /* 1572 * Returns a new PV entry, allocating a new PV chunk from the system when 1573 * needed. If this PV chunk allocation fails and a PV list lock pointer was 1574 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 1575 * returned. 1576 * 1577 * The given PV list lock may be released. 
1578 */ 1579 static pv_entry_t 1580 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 1581 { 1582 int bit, field; 1583 pv_entry_t pv; 1584 struct pv_chunk *pc; 1585 vm_page_t m; 1586 1587 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1588 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 1589 retry: 1590 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1591 if (pc != NULL) { 1592 for (field = 0; field < _NPCM; field++) { 1593 if (pc->pc_map[field]) { 1594 bit = cnttzd(pc->pc_map[field]); 1595 break; 1596 } 1597 } 1598 if (field < _NPCM) { 1599 pv = &pc->pc_pventry[field * 64 + bit]; 1600 pc->pc_map[field] &= ~(1ul << bit); 1601 /* If this was the last item, move it to tail */ 1602 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { 1603 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1604 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 1605 pc_list); 1606 } 1607 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1608 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 1609 MPASS(PV_PMAP(pv) != NULL); 1610 return (pv); 1611 } 1612 } 1613 /* No free items, allocate another chunk */ 1614 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1615 VM_ALLOC_WIRED); 1616 if (m == NULL) { 1617 if (lockp == NULL) { 1618 PV_STAT(pc_chunk_tryfail++); 1619 return (NULL); 1620 } 1621 m = reclaim_pv_chunk(pmap, lockp); 1622 if (m == NULL) 1623 goto retry; 1624 } 1625 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1626 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1627 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1628 pc->pc_pmap = pmap; 1629 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 1630 pc->pc_map[1] = PC_FREE1; 1631 mtx_lock(&pv_chunks_mutex); 1632 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1633 mtx_unlock(&pv_chunks_mutex); 1634 pv = &pc->pc_pventry[0]; 1635 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1636 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1637 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 1638 MPASS(PV_PMAP(pv) != NULL); 1639 return (pv); 1640 } 1641 1642 #if VM_NRESERVLEVEL > 0 1643 /* 1644 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 1645 * replace the many pv entries for the 4KB page mappings by a single pv entry 1646 * for the 2MB page mapping. 1647 */ 1648 static void 1649 pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1650 struct rwlock **lockp) 1651 { 1652 struct md_page *pvh; 1653 pv_entry_t pv; 1654 vm_offset_t va_last; 1655 vm_page_t m; 1656 1657 KASSERT((pa & L3_PAGE_MASK) == 0, 1658 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 1659 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1660 1661 /* 1662 * Transfer the first page's pv entry for this mapping to the 2mpage's 1663 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 1664 * a transfer avoids the possibility that get_pv_entry() calls 1665 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 1666 * mappings that is being promoted. 1667 */ 1668 m = PHYS_TO_VM_PAGE(pa); 1669 va = trunc_2mpage(va); 1670 pv = pmap_pvh_remove(&m->md, pmap, va); 1671 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 1672 pvh = pa_to_pvh(pa); 1673 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 1674 pvh->pv_gen++; 1675 /* Free the remaining NPTEPG - 1 pv entries. */ 1676 va_last = va + L3_PAGE_SIZE - PAGE_SIZE; 1677 do { 1678 m++; 1679 va += PAGE_SIZE; 1680 pmap_pvh_free(&m->md, pmap, va); 1681 } while (va < va_last); 1682 } 1683 #endif /* VM_NRESERVLEVEL > 0 */ 1684 1685 /* 1686 * First find and then destroy the pv entry for the specified pmap and virtual 1687 * address. 
This operation can be performed on pv lists for either 4KB or 2MB 1688 * page mappings. 1689 */ 1690 static void 1691 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1692 { 1693 pv_entry_t pv; 1694 1695 pv = pmap_pvh_remove(pvh, pmap, va); 1696 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 1697 free_pv_entry(pmap, pv); 1698 } 1699 1700 /* 1701 * Conditionally create the PV entry for a 4KB page mapping if the required 1702 * memory can be allocated without resorting to reclamation. 1703 */ 1704 static boolean_t 1705 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 1706 struct rwlock **lockp) 1707 { 1708 pv_entry_t pv; 1709 1710 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1711 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1712 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 1713 pv->pv_va = va; 1714 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 1715 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 1716 m->md.pv_gen++; 1717 return (TRUE); 1718 } else 1719 return (FALSE); 1720 } 1721 1722 vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX]; 1723 #ifdef INVARIANTS 1724 static void 1725 validate_addr(vm_paddr_t addr, vm_size_t size) 1726 { 1727 vm_paddr_t end = addr + size; 1728 bool found = false; 1729 1730 for (int i = 0; i < 2 * phys_avail_count; i += 2) { 1731 if (addr >= phys_avail_debug[i] && 1732 end <= phys_avail_debug[i + 1]) { 1733 found = true; 1734 break; 1735 } 1736 } 1737 KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array", 1738 addr, end)); 1739 } 1740 #else 1741 static void validate_addr(vm_paddr_t addr, vm_size_t size) {} 1742 #endif 1743 #define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A) 1744 1745 static vm_paddr_t 1746 alloc_pt_page(void) 1747 { 1748 vm_paddr_t page; 1749 1750 page = allocpages(1); 1751 pagezero(PHYS_TO_DMAP(page)); 1752 return (page); 1753 } 1754 1755 static void 1756 mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end) 1757 { 1758 pt_entry_t *pte, pteval; 1759 vm_paddr_t page; 1760 1761 if (bootverbose) 1762 printf("%s %lx -> %lx\n", __func__, start, end); 1763 while (start < end) { 1764 pteval = start | DMAP_PAGE_BITS; 1765 pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start)); 1766 if ((*pte & RPTE_VALID) == 0) { 1767 page = alloc_pt_page(); 1768 pde_store(pte, page); 1769 } 1770 pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start)); 1771 if ((start & L2_PAGE_MASK) == 0 && 1772 end - start >= L2_PAGE_SIZE) { 1773 start += L2_PAGE_SIZE; 1774 goto done; 1775 } else if ((*pte & RPTE_VALID) == 0) { 1776 page = alloc_pt_page(); 1777 pde_store(pte, page); 1778 } 1779 1780 pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start)); 1781 if ((start & L3_PAGE_MASK) == 0 && 1782 end - start >= L3_PAGE_SIZE) { 1783 start += L3_PAGE_SIZE; 1784 goto done; 1785 } else if ((*pte & RPTE_VALID) == 0) { 1786 page = alloc_pt_page(); 1787 pde_store(pte, page); 1788 } 1789 pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start)); 1790 start += PAGE_SIZE; 1791 done: 1792 pte_store(pte, pteval); 1793 } 1794 } 1795 1796 static void 1797 mmu_radix_dmap_populate(vm_size_t hwphyssz) 1798 { 1799 vm_paddr_t start, end; 1800 1801 for (int i = 0; i < pregions_sz; i++) { 1802 start = pregions[i].mr_start; 1803 end = start + pregions[i].mr_size; 1804 if (hwphyssz && start >= hwphyssz) 1805 break; 1806 if (hwphyssz && hwphyssz < end) 1807 end = hwphyssz; 1808 mmu_radix_dmap_range(start, end); 1809 } 1810 } 1811 1812 static void 1813 mmu_radix_setup_pagetables(vm_size_t hwphyssz) 1814 { 1815 vm_paddr_t ptpages, pages; 1816 pt_entry_t 
*pte; 1817 vm_paddr_t l1phys; 1818 1819 bzero(kernel_pmap, sizeof(struct pmap)); 1820 PMAP_LOCK_INIT(kernel_pmap); 1821 1822 ptpages = allocpages(2); 1823 l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE); 1824 validate_addr(l1phys, RADIX_PGD_SIZE); 1825 if (bootverbose) 1826 printf("l1phys=%lx\n", l1phys); 1827 MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0); 1828 for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++) 1829 pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE)); 1830 kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys); 1831 1832 mmu_radix_dmap_populate(hwphyssz); 1833 1834 /* 1835 * Create page tables for first 128MB of KVA 1836 */ 1837 pages = ptpages; 1838 pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS); 1839 *pte = (pages | RPTE_VALID | RPTE_SHIFT); 1840 pages += PAGE_SIZE; 1841 pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS); 1842 *pte = (pages | RPTE_VALID | RPTE_SHIFT); 1843 pages += PAGE_SIZE; 1844 pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS); 1845 /* 1846 * the kernel page table pages need to be preserved in 1847 * phys_avail and not overlap with previous allocations 1848 */ 1849 pages = allocpages(nkpt); 1850 if (bootverbose) { 1851 printf("phys_avail after dmap populate and nkpt allocation\n"); 1852 for (int j = 0; j < 2 * phys_avail_count; j+=2) 1853 printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", 1854 j, phys_avail[j], j + 1, phys_avail[j + 1]); 1855 } 1856 KPTphys = pages; 1857 for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE) 1858 *pte = (pages | RPTE_VALID | RPTE_SHIFT); 1859 kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE; 1860 if (bootverbose) 1861 printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1); 1862 /* 1863 * Add a physical memory segment (vm_phys_seg) corresponding to the 1864 * preallocated kernel page table pages so that vm_page structures 1865 * representing these pages will be created. The vm_page structures 1866 * are required for promotion of the corresponding kernel virtual 1867 * addresses to superpage mappings. 
1868  */
1869     vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1870 }
1871 
1872 static void
1873 mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end)
1874 {
1875     vm_paddr_t kpstart, kpend;
1876     vm_size_t physsz, hwphyssz;
1877     //uint64_t l2virt;
1878     int rm_pavail, proctab_size;
1879     int i, j;
1880 
1881     kpstart = start & ~DMAP_BASE_ADDRESS;
1882     kpend = end & ~DMAP_BASE_ADDRESS;
1883 
1884     /* Get physical memory regions from firmware */
1885     mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
1886     CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory");
1887 
1888     if (2 * VM_PHYSSEG_MAX < regions_sz)
1889         panic("mmu_radix_early_bootstrap: phys_avail too small");
1890 
1891     if (bootverbose)
1892         for (int i = 0; i < regions_sz; i++)
1893             printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n",
1894                 i, regions[i].mr_start, i, regions[i].mr_size);
1895     /*
1896      * XXX workaround a simulator bug
1897      */
1898     for (int i = 0; i < regions_sz; i++)
1899         if (regions[i].mr_start & PAGE_MASK) {
1900             regions[i].mr_start += PAGE_MASK;
1901             regions[i].mr_start &= ~PAGE_MASK;
1902             regions[i].mr_size &= ~PAGE_MASK;
1903         }
1904     if (bootverbose)
1905         for (int i = 0; i < pregions_sz; i++)
1906             printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n",
1907                 i, pregions[i].mr_start, i, pregions[i].mr_size);
1908 
1909     phys_avail_count = 0;
1910     physsz = 0;
1911     hwphyssz = 0;
1912     TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
1913     for (i = 0, j = 0; i < regions_sz; i++) {
1914         if (bootverbose)
1915             printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n",
1916                 i, regions[i].mr_start, i, regions[i].mr_size);
1917 
1918         if (regions[i].mr_size < PAGE_SIZE)
1919             continue;
1920 
1921         if (hwphyssz != 0 &&
1922             (physsz + regions[i].mr_size) >= hwphyssz) {
1923             if (physsz < hwphyssz) {
1924                 phys_avail[j] = regions[i].mr_start;
1925                 phys_avail[j + 1] = regions[i].mr_start +
1926                     (hwphyssz - physsz);
1927                 physsz = hwphyssz;
1928                 phys_avail_count++;
1929                 dump_avail[j] = phys_avail[j];
1930                 dump_avail[j + 1] = phys_avail[j + 1];
1931             }
1932             break;
1933         }
1934         phys_avail[j] = regions[i].mr_start;
1935         phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
1936         dump_avail[j] = phys_avail[j];
1937         dump_avail[j + 1] = phys_avail[j + 1];
1938 
1939         phys_avail_count++;
1940         physsz += regions[i].mr_size;
1941         j += 2;
1942     }
1943 
1944     /* Check for overlap with the kernel and exception vectors */
1945     rm_pavail = 0;
1946     for (j = 0; j < 2 * phys_avail_count; j+=2) {
1947         if (phys_avail[j] < EXC_LAST)
1948             phys_avail[j] += EXC_LAST;
1949 
1950         if (phys_avail[j] >= kpstart &&
1951             phys_avail[j + 1] <= kpend) {
1952             phys_avail[j] = phys_avail[j + 1] = ~0;
1953             rm_pavail++;
1954             continue;
1955         }
1956 
1957         if (kpstart >= phys_avail[j] &&
1958             kpstart < phys_avail[j + 1]) {
1959             if (kpend < phys_avail[j + 1]) {
1960                 phys_avail[2 * phys_avail_count] =
1961                     (kpend & ~PAGE_MASK) + PAGE_SIZE;
1962                 phys_avail[2 * phys_avail_count + 1] =
1963                     phys_avail[j + 1];
1964                 phys_avail_count++;
1965             }
1966 
1967             phys_avail[j + 1] = kpstart & ~PAGE_MASK;
1968         }
1969 
1970         if (kpend >= phys_avail[j] &&
1971             kpend < phys_avail[j + 1]) {
1972             if (kpstart > phys_avail[j]) {
1973                 phys_avail[2 * phys_avail_count] = phys_avail[j];
1974                 phys_avail[2 * phys_avail_count + 1] =
1975                     kpstart & ~PAGE_MASK;
1976                 phys_avail_count++;
1977             }
1978 
1979             phys_avail[j] = (kpend & ~PAGE_MASK) +
1980                 PAGE_SIZE;
1981         }
1982     }
1983     qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp);
1984     for (i = 0; i < 2 *
phys_avail_count; i++) 1985 phys_avail_debug[i] = phys_avail[i]; 1986 1987 /* Remove physical available regions marked for removal (~0) */ 1988 if (rm_pavail) { 1989 phys_avail_count -= rm_pavail; 1990 for (i = 2 * phys_avail_count; 1991 i < 2*(phys_avail_count + rm_pavail); i+=2) 1992 phys_avail[i] = phys_avail[i + 1] = 0; 1993 } 1994 if (bootverbose) { 1995 printf("phys_avail ranges after filtering:\n"); 1996 for (j = 0; j < 2 * phys_avail_count; j+=2) 1997 printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", 1998 j, phys_avail[j], j + 1, phys_avail[j + 1]); 1999 } 2000 physmem = btoc(physsz); 2001 2002 /* XXX assume we're running non-virtualized and 2003 * we don't support BHYVE 2004 */ 2005 if (isa3_pid_bits == 0) 2006 isa3_pid_bits = 20; 2007 parttab_phys = moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE); 2008 validate_addr(parttab_phys, PARTTAB_SIZE); 2009 for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++) 2010 pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE)); 2011 2012 proctab_size = 1UL << PROCTAB_SIZE_SHIFT; 2013 proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size); 2014 validate_addr(proctab0pa, proctab_size); 2015 for (int i = 0; i < proctab_size/PAGE_SIZE; i++) 2016 pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE)); 2017 2018 mmu_radix_setup_pagetables(hwphyssz); 2019 } 2020 2021 static void 2022 mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end) 2023 { 2024 int i; 2025 vm_paddr_t pa; 2026 void *dpcpu; 2027 vm_offset_t va; 2028 2029 /* 2030 * Set up the Open Firmware pmap and add its mappings if not in real 2031 * mode. 2032 */ 2033 if (bootverbose) 2034 printf("%s enter\n", __func__); 2035 2036 /* 2037 * Calculate the last available physical address, and reserve the 2038 * vm_page_array (upper bound). 2039 */ 2040 Maxmem = 0; 2041 for (i = 0; phys_avail[i + 2] != 0; i += 2) 2042 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); 2043 2044 /* 2045 * Set the start and end of kva. 2046 */ 2047 virtual_avail = VM_MIN_KERNEL_ADDRESS; 2048 virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; 2049 2050 /* 2051 * Remap any early IO mappings (console framebuffer, etc.) 2052 */ 2053 bs_remap_earlyboot(); 2054 2055 /* 2056 * Allocate a kernel stack with a guard page for thread0 and map it 2057 * into the kernel page map. 2058 */ 2059 pa = allocpages(kstack_pages); 2060 va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; 2061 virtual_avail = va + kstack_pages * PAGE_SIZE; 2062 CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); 2063 thread0.td_kstack = va; 2064 for (i = 0; i < kstack_pages; i++) { 2065 mmu_radix_kenter(va, pa); 2066 pa += PAGE_SIZE; 2067 va += PAGE_SIZE; 2068 } 2069 thread0.td_kstack_pages = kstack_pages; 2070 2071 /* 2072 * Allocate virtual address space for the message buffer. 2073 */ 2074 pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT); 2075 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa); 2076 2077 /* 2078 * Allocate virtual address space for the dynamic percpu area. 2079 */ 2080 pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT); 2081 dpcpu = (void *)PHYS_TO_DMAP(pa); 2082 dpcpu_init(dpcpu, curcpu); 2083 /* 2084 * Reserve some special page table entries/VA space for temporary 2085 * mapping of pages. 
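 * (No reservation is actually made at this point; the single-page
 * qframe window and its spin mutex are set up later, in
 * mmu_radix_init(), from the kernel arena.)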
2086 */ 2087 } 2088 2089 static void 2090 mmu_parttab_init(void) 2091 { 2092 uint64_t ptcr; 2093 2094 isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys); 2095 2096 if (bootverbose) 2097 printf("%s parttab: %p\n", __func__, isa3_parttab); 2098 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); 2099 if (bootverbose) 2100 printf("setting ptcr %lx\n", ptcr); 2101 mtspr(SPR_PTCR, ptcr); 2102 } 2103 2104 static void 2105 mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab) 2106 { 2107 uint64_t prev; 2108 2109 if (bootverbose) 2110 printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab, 2111 lpid, pagetab, proctab); 2112 prev = be64toh(isa3_parttab[lpid].pagetab); 2113 isa3_parttab[lpid].pagetab = htobe64(pagetab); 2114 isa3_parttab[lpid].proctab = htobe64(proctab); 2115 2116 if (prev & PARTTAB_HR) { 2117 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : 2118 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2119 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 2120 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2121 } else { 2122 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : 2123 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 2124 } 2125 ttusync(); 2126 } 2127 2128 static void 2129 mmu_radix_parttab_init(void) 2130 { 2131 uint64_t pagetab; 2132 2133 mmu_parttab_init(); 2134 pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \ 2135 RADIX_PGD_INDEX_SHIFT | PARTTAB_HR; 2136 mmu_parttab_update(0, pagetab, 0); 2137 } 2138 2139 static void 2140 mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size) 2141 { 2142 uint64_t pagetab, proctab; 2143 2144 pagetab = be64toh(isa3_parttab[0].pagetab); 2145 proctab = proctabpa | table_size | PARTTAB_GR; 2146 mmu_parttab_update(0, pagetab, proctab); 2147 } 2148 2149 static void 2150 mmu_radix_proctab_init(void) 2151 { 2152 2153 isa3_base_pid = 1; 2154 2155 isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa); 2156 isa3_proctab->proctab0 = 2157 htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | 2158 RADIX_PGD_INDEX_SHIFT); 2159 2160 mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12); 2161 2162 __asm __volatile("ptesync" : : : "memory"); 2163 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 2164 "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); 2165 __asm __volatile("eieio; tlbsync; ptesync" : : : "memory"); 2166 if (bootverbose) 2167 printf("process table %p and kernel radix PDE: %p\n", 2168 isa3_proctab, kernel_pmap->pm_pml1); 2169 mtmsr(mfmsr() | PSL_DR ); 2170 mtmsr(mfmsr() & ~PSL_DR); 2171 kernel_pmap->pm_pid = isa3_base_pid; 2172 isa3_base_pid++; 2173 } 2174 2175 void 2176 mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 2177 int advice) 2178 { 2179 struct rwlock *lock; 2180 pml1_entry_t *l1e; 2181 pml2_entry_t *l2e; 2182 pml3_entry_t oldl3e, *l3e; 2183 pt_entry_t *pte; 2184 vm_offset_t va, va_next; 2185 vm_page_t m; 2186 boolean_t anychanged; 2187 2188 if (advice != MADV_DONTNEED && advice != MADV_FREE) 2189 return; 2190 anychanged = FALSE; 2191 PMAP_LOCK(pmap); 2192 for (; sva < eva; sva = va_next) { 2193 l1e = pmap_pml1e(pmap, sva); 2194 if ((*l1e & PG_V) == 0) { 2195 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 2196 if (va_next < sva) 2197 va_next = eva; 2198 continue; 2199 } 2200 l2e = pmap_l1e_to_l2e(l1e, sva); 2201 if ((*l2e & PG_V) == 0) { 2202 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 2203 if (va_next < sva) 2204 va_next = eva; 2205 continue; 2206 } 2207 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 2208 if (va_next < sva) 2209 va_next = eva; 2210 l3e = 
pmap_l2e_to_l3e(l2e, sva); 2211 oldl3e = *l3e; 2212 if ((oldl3e & PG_V) == 0) 2213 continue; 2214 else if ((oldl3e & RPTE_LEAF) != 0) { 2215 if ((oldl3e & PG_MANAGED) == 0) 2216 continue; 2217 lock = NULL; 2218 if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { 2219 if (lock != NULL) 2220 rw_wunlock(lock); 2221 2222 /* 2223 * The large page mapping was destroyed. 2224 */ 2225 continue; 2226 } 2227 2228 /* 2229 * Unless the page mappings are wired, remove the 2230 * mapping to a single page so that a subsequent 2231 * access may repromote. Since the underlying page 2232 * table page is fully populated, this removal never 2233 * frees a page table page. 2234 */ 2235 if ((oldl3e & PG_W) == 0) { 2236 pte = pmap_l3e_to_pte(l3e, sva); 2237 KASSERT((*pte & PG_V) != 0, 2238 ("pmap_advise: invalid PTE")); 2239 pmap_remove_pte(pmap, pte, sva, *l3e, NULL, 2240 &lock); 2241 anychanged = TRUE; 2242 } 2243 if (lock != NULL) 2244 rw_wunlock(lock); 2245 } 2246 if (va_next > eva) 2247 va_next = eva; 2248 va = va_next; 2249 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; 2250 pte++, sva += PAGE_SIZE) { 2251 MPASS(pte == pmap_pte(pmap, sva)); 2252 2253 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 2254 goto maybe_invlrng; 2255 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2256 if (advice == MADV_DONTNEED) { 2257 /* 2258 * Future calls to pmap_is_modified() 2259 * can be avoided by making the page 2260 * dirty now. 2261 */ 2262 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 2263 vm_page_dirty(m); 2264 } 2265 atomic_clear_long(pte, PG_M | PG_A); 2266 } else if ((*pte & PG_A) != 0) 2267 atomic_clear_long(pte, PG_A); 2268 else 2269 goto maybe_invlrng; 2270 anychanged = TRUE; 2271 continue; 2272 maybe_invlrng: 2273 if (va != va_next) { 2274 anychanged = true; 2275 va = va_next; 2276 } 2277 } 2278 if (va != va_next) 2279 anychanged = true; 2280 } 2281 if (anychanged) 2282 pmap_invalidate_all(pmap); 2283 PMAP_UNLOCK(pmap); 2284 } 2285 2286 /* 2287 * Routines used in machine-dependent code 2288 */ 2289 static void 2290 mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end) 2291 { 2292 uint64_t lpcr; 2293 2294 if (bootverbose) 2295 printf("%s\n", __func__); 2296 hw_direct_map = 1; 2297 mmu_radix_early_bootstrap(start, end); 2298 if (bootverbose) 2299 printf("early bootstrap complete\n"); 2300 if (powernv_enabled) { 2301 lpcr = mfspr(SPR_LPCR); 2302 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 2303 mmu_radix_parttab_init(); 2304 mmu_radix_init_amor(); 2305 if (bootverbose) 2306 printf("powernv init complete\n"); 2307 } 2308 mmu_radix_init_iamr(); 2309 mmu_radix_proctab_init(); 2310 mmu_radix_pid_set(kernel_pmap); 2311 /* XXX assume CPU_FTR_HVMODE */ 2312 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 2313 2314 mmu_radix_late_bootstrap(start, end); 2315 numa_mem_regions(&numa_pregions, &numa_pregions_sz); 2316 if (bootverbose) 2317 printf("%s done\n", __func__); 2318 pmap_bootstrapped = 1; 2319 dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE); 2320 PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS); 2321 } 2322 2323 static void 2324 mmu_radix_cpu_bootstrap(int ap) 2325 { 2326 uint64_t lpcr; 2327 uint64_t ptcr; 2328 2329 if (powernv_enabled) { 2330 lpcr = mfspr(SPR_LPCR); 2331 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 2332 2333 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); 2334 mtspr(SPR_PTCR, ptcr); 2335 mmu_radix_init_amor(); 2336 } 2337 mmu_radix_init_iamr(); 2338 mmu_radix_pid_set(kernel_pmap); 2339 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); 2340 } 2341 2342 static 
SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0, 2343 "2MB page mapping counters"); 2344 2345 static u_long pmap_l3e_demotions; 2346 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD, 2347 &pmap_l3e_demotions, 0, "2MB page demotions"); 2348 2349 static u_long pmap_l3e_mappings; 2350 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD, 2351 &pmap_l3e_mappings, 0, "2MB page mappings"); 2352 2353 static u_long pmap_l3e_p_failures; 2354 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD, 2355 &pmap_l3e_p_failures, 0, "2MB page promotion failures"); 2356 2357 static u_long pmap_l3e_promotions; 2358 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD, 2359 &pmap_l3e_promotions, 0, "2MB page promotions"); 2360 2361 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0, 2362 "1GB page mapping counters"); 2363 2364 static u_long pmap_l2e_demotions; 2365 SYSCTL_ULONG(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD, 2366 &pmap_l2e_demotions, 0, "1GB page demotions"); 2367 2368 void 2369 mmu_radix_clear_modify(vm_page_t m) 2370 { 2371 struct md_page *pvh; 2372 pmap_t pmap; 2373 pv_entry_t next_pv, pv; 2374 pml3_entry_t oldl3e, *l3e; 2375 pt_entry_t oldpte, *pte; 2376 struct rwlock *lock; 2377 vm_offset_t va; 2378 int md_gen, pvh_gen; 2379 2380 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2381 ("pmap_clear_modify: page %p is not managed", m)); 2382 vm_page_assert_busied(m); 2383 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 2384 2385 /* 2386 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 2387 * If the object containing the page is locked and the page is not 2388 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 2389 */ 2390 if ((m->a.flags & PGA_WRITEABLE) == 0) 2391 return; 2392 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2393 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2394 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 2395 rw_wlock(lock); 2396 restart: 2397 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { 2398 pmap = PV_PMAP(pv); 2399 if (!PMAP_TRYLOCK(pmap)) { 2400 pvh_gen = pvh->pv_gen; 2401 rw_wunlock(lock); 2402 PMAP_LOCK(pmap); 2403 rw_wlock(lock); 2404 if (pvh_gen != pvh->pv_gen) { 2405 PMAP_UNLOCK(pmap); 2406 goto restart; 2407 } 2408 } 2409 va = pv->pv_va; 2410 l3e = pmap_pml3e(pmap, va); 2411 oldl3e = *l3e; 2412 if ((oldl3e & PG_RW) != 0) { 2413 if (pmap_demote_l3e_locked(pmap, l3e, va, &lock)) { 2414 if ((oldl3e & PG_W) == 0) { 2415 /* 2416 * Write protect the mapping to a 2417 * single page so that a subsequent 2418 * write access may repromote. 
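 * The demoted 2MB range covers 512 4KB pages; "va" is first
 * advanced from the superpage base to the page "m" under test,
 * and only that page's PTE has PG_M and PG_RW cleared.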
2419 */ 2420 va += VM_PAGE_TO_PHYS(m) - (oldl3e & 2421 PG_PS_FRAME); 2422 pte = pmap_l3e_to_pte(l3e, va); 2423 oldpte = *pte; 2424 if ((oldpte & PG_V) != 0) { 2425 while (!atomic_cmpset_long(pte, 2426 oldpte, 2427 (oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))) 2428 oldpte = *pte; 2429 vm_page_dirty(m); 2430 pmap_invalidate_page(pmap, va); 2431 } 2432 } 2433 } 2434 } 2435 PMAP_UNLOCK(pmap); 2436 } 2437 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 2438 pmap = PV_PMAP(pv); 2439 if (!PMAP_TRYLOCK(pmap)) { 2440 md_gen = m->md.pv_gen; 2441 pvh_gen = pvh->pv_gen; 2442 rw_wunlock(lock); 2443 PMAP_LOCK(pmap); 2444 rw_wlock(lock); 2445 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 2446 PMAP_UNLOCK(pmap); 2447 goto restart; 2448 } 2449 } 2450 l3e = pmap_pml3e(pmap, pv->pv_va); 2451 KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_clear_modify: found" 2452 " a 2mpage in page %p's pv list", m)); 2453 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 2454 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2455 atomic_clear_long(pte, PG_M); 2456 pmap_invalidate_page(pmap, pv->pv_va); 2457 } 2458 PMAP_UNLOCK(pmap); 2459 } 2460 rw_wunlock(lock); 2461 } 2462 2463 void 2464 mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 2465 vm_size_t len, vm_offset_t src_addr) 2466 { 2467 struct rwlock *lock; 2468 struct spglist free; 2469 vm_offset_t addr; 2470 vm_offset_t end_addr = src_addr + len; 2471 vm_offset_t va_next; 2472 vm_page_t dst_pdpg, dstmpte, srcmpte; 2473 bool invalidate_all; 2474 2475 CTR6(KTR_PMAP, 2476 "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n", 2477 __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); 2478 2479 if (dst_addr != src_addr) 2480 return; 2481 lock = NULL; 2482 invalidate_all = false; 2483 if (dst_pmap < src_pmap) { 2484 PMAP_LOCK(dst_pmap); 2485 PMAP_LOCK(src_pmap); 2486 } else { 2487 PMAP_LOCK(src_pmap); 2488 PMAP_LOCK(dst_pmap); 2489 } 2490 2491 for (addr = src_addr; addr < end_addr; addr = va_next) { 2492 pml1_entry_t *l1e; 2493 pml2_entry_t *l2e; 2494 pml3_entry_t srcptepaddr, *l3e; 2495 pt_entry_t *src_pte, *dst_pte; 2496 2497 l1e = pmap_pml1e(src_pmap, addr); 2498 if ((*l1e & PG_V) == 0) { 2499 va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 2500 if (va_next < addr) 2501 va_next = end_addr; 2502 continue; 2503 } 2504 2505 l2e = pmap_l1e_to_l2e(l1e, addr); 2506 if ((*l2e & PG_V) == 0) { 2507 va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 2508 if (va_next < addr) 2509 va_next = end_addr; 2510 continue; 2511 } 2512 2513 va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 2514 if (va_next < addr) 2515 va_next = end_addr; 2516 2517 l3e = pmap_l2e_to_l3e(l2e, addr); 2518 srcptepaddr = *l3e; 2519 if (srcptepaddr == 0) 2520 continue; 2521 2522 if (srcptepaddr & RPTE_LEAF) { 2523 if ((addr & L3_PAGE_MASK) != 0 || 2524 addr + L3_PAGE_SIZE > end_addr) 2525 continue; 2526 dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL); 2527 if (dst_pdpg == NULL) 2528 break; 2529 l3e = (pml3_entry_t *) 2530 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); 2531 l3e = &l3e[pmap_pml3e_index(addr)]; 2532 if (*l3e == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 2533 pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr, 2534 PMAP_ENTER_NORECLAIM, &lock))) { 2535 *l3e = srcptepaddr & ~PG_W; 2536 pmap_resident_count_inc(dst_pmap, 2537 L3_PAGE_SIZE / PAGE_SIZE); 2538 atomic_add_long(&pmap_l3e_mappings, 1); 2539 } else 2540 dst_pdpg->ref_count--; 2541 continue; 2542 } 2543 2544 srcptepaddr &= PG_FRAME; 2545 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 2546 KASSERT(srcmpte->ref_count > 0, 2547 ("pmap_copy: 
source page table page is unused")); 2548 2549 if (va_next > end_addr) 2550 va_next = end_addr; 2551 2552 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 2553 src_pte = &src_pte[pmap_pte_index(addr)]; 2554 dstmpte = NULL; 2555 while (addr < va_next) { 2556 pt_entry_t ptetemp; 2557 ptetemp = *src_pte; 2558 /* 2559 * we only virtual copy managed pages 2560 */ 2561 if ((ptetemp & PG_MANAGED) != 0) { 2562 if (dstmpte != NULL && 2563 dstmpte->pindex == pmap_l3e_pindex(addr)) 2564 dstmpte->ref_count++; 2565 else if ((dstmpte = pmap_allocpte(dst_pmap, 2566 addr, NULL)) == NULL) 2567 goto out; 2568 dst_pte = (pt_entry_t *) 2569 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 2570 dst_pte = &dst_pte[pmap_pte_index(addr)]; 2571 if (*dst_pte == 0 && 2572 pmap_try_insert_pv_entry(dst_pmap, addr, 2573 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 2574 &lock)) { 2575 /* 2576 * Clear the wired, modified, and 2577 * accessed (referenced) bits 2578 * during the copy. 2579 */ 2580 *dst_pte = ptetemp & ~(PG_W | PG_M | 2581 PG_A); 2582 pmap_resident_count_inc(dst_pmap, 1); 2583 } else { 2584 SLIST_INIT(&free); 2585 if (pmap_unwire_ptp(dst_pmap, addr, 2586 dstmpte, &free)) { 2587 /* 2588 * Although "addr" is not 2589 * mapped, paging-structure 2590 * caches could nonetheless 2591 * have entries that refer to 2592 * the freed page table pages. 2593 * Invalidate those entries. 2594 */ 2595 invalidate_all = true; 2596 vm_page_free_pages_toq(&free, 2597 true); 2598 } 2599 goto out; 2600 } 2601 if (dstmpte->ref_count >= srcmpte->ref_count) 2602 break; 2603 } 2604 addr += PAGE_SIZE; 2605 if (__predict_false((addr & L3_PAGE_MASK) == 0)) 2606 src_pte = pmap_pte(src_pmap, addr); 2607 else 2608 src_pte++; 2609 } 2610 } 2611 out: 2612 if (invalidate_all) 2613 pmap_invalidate_all(dst_pmap); 2614 if (lock != NULL) 2615 rw_wunlock(lock); 2616 PMAP_UNLOCK(src_pmap); 2617 PMAP_UNLOCK(dst_pmap); 2618 } 2619 2620 static void 2621 mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst) 2622 { 2623 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 2624 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 2625 2626 CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); 2627 /* 2628 * XXX slow 2629 */ 2630 bcopy((void *)src, (void *)dst, PAGE_SIZE); 2631 } 2632 2633 static void 2634 mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 2635 vm_offset_t b_offset, int xfersize) 2636 { 2637 2638 CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, 2639 a_offset, mb, b_offset, xfersize); 2640 UNIMPLEMENTED(); 2641 } 2642 2643 #if VM_NRESERVLEVEL > 0 2644 /* 2645 * Tries to promote the 512, contiguous 4KB page mappings that are within a 2646 * single page table page (PTP) to a single 2MB page mapping. For promotion 2647 * to occur, two conditions must be met: (1) the 4KB page mappings must map 2648 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 2649 * identical characteristics. 2650 */ 2651 static int 2652 pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va, 2653 struct rwlock **lockp) 2654 { 2655 pml3_entry_t newpde; 2656 pt_entry_t *firstpte, oldpte, pa, *pte; 2657 vm_page_t mpte; 2658 2659 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2660 2661 /* 2662 * Examine the first PTE in the specified PTP. Abort if this PTE is 2663 * either invalid, unused, or does not map the first 4KB physical page 2664 * within a 2MB page. 
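 * "Maps the first 4KB physical page" means that the physical-frame
 * bits of the PTE falling within L3_PAGE_MASK must all be zero,
 * i.e. the backing physical address is 2MB-aligned; this is what
 * the (PG_FRAME & L3_PAGE_MASK) test below verifies.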
2665 */ 2666 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 2667 setpde: 2668 newpde = *firstpte; 2669 if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 2670 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2671 " in pmap %p", va, pmap); 2672 goto fail; 2673 } 2674 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 2675 /* 2676 * When PG_M is already clear, PG_RW can be cleared without 2677 * a TLB invalidation. 2678 */ 2679 if (!atomic_cmpset_long(firstpte, newpde, (newpde | RPTE_EAA_R) & ~RPTE_EAA_W)) 2680 goto setpde; 2681 newpde &= ~RPTE_EAA_W; 2682 } 2683 2684 /* 2685 * Examine each of the other PTEs in the specified PTP. Abort if this 2686 * PTE maps an unexpected 4KB physical page or does not have identical 2687 * characteristics to the first PTE. 2688 */ 2689 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE; 2690 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 2691 setpte: 2692 oldpte = *pte; 2693 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 2694 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2695 " in pmap %p", va, pmap); 2696 goto fail; 2697 } 2698 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 2699 /* 2700 * When PG_M is already clear, PG_RW can be cleared 2701 * without a TLB invalidation. 2702 */ 2703 if (!atomic_cmpset_long(pte, oldpte, (oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)) 2704 goto setpte; 2705 oldpte &= ~RPTE_EAA_W; 2706 CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx" 2707 " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) | 2708 (va & ~L3_PAGE_MASK), pmap); 2709 } 2710 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 2711 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" 2712 " in pmap %p", va, pmap); 2713 goto fail; 2714 } 2715 pa -= PAGE_SIZE; 2716 } 2717 2718 /* 2719 * Save the page table page in its current state until the PDE 2720 * mapping the superpage is demoted by pmap_demote_pde() or 2721 * destroyed by pmap_remove_pde(). 2722 */ 2723 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 2724 KASSERT(mpte >= vm_page_array && 2725 mpte < &vm_page_array[vm_page_array_size], 2726 ("pmap_promote_l3e: page table page is out of range")); 2727 KASSERT(mpte->pindex == pmap_l3e_pindex(va), 2728 ("pmap_promote_l3e: page table page's pindex is wrong")); 2729 if (pmap_insert_pt_page(pmap, mpte)) { 2730 CTR2(KTR_PMAP, 2731 "pmap_promote_l3e: failure for va %#lx in pmap %p", va, 2732 pmap); 2733 goto fail; 2734 } 2735 2736 /* 2737 * Promote the pv entries. 
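 * (pmap_pv_promote_l3e() is expected to collapse the 512 per-page
 * pv entries into a single entry on the superpage's pv list; only
 * managed mappings carry pv entries, hence the PG_MANAGED check.)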
2738 */ 2739 if ((newpde & PG_MANAGED) != 0) 2740 pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp); 2741 2742 pte_store(pde, PG_PROMOTED | newpde); 2743 atomic_add_long(&pmap_l3e_promotions, 1); 2744 CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx" 2745 " in pmap %p", va, pmap); 2746 return (0); 2747 fail: 2748 atomic_add_long(&pmap_l3e_p_failures, 1); 2749 return (KERN_FAILURE); 2750 } 2751 #endif /* VM_NRESERVLEVEL > 0 */ 2752 2753 int 2754 mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, 2755 vm_prot_t prot, u_int flags, int8_t psind) 2756 { 2757 struct rwlock *lock; 2758 pml3_entry_t *l3e; 2759 pt_entry_t *pte; 2760 pt_entry_t newpte, origpte; 2761 pv_entry_t pv; 2762 vm_paddr_t opa, pa; 2763 vm_page_t mpte, om; 2764 int rv, retrycount; 2765 boolean_t nosleep, invalidate_all, invalidate_page; 2766 2767 va = trunc_page(va); 2768 retrycount = 0; 2769 invalidate_page = invalidate_all = false; 2770 CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va, 2771 m, prot, flags, psind); 2772 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 2773 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 2774 va >= kmi.clean_eva, 2775 ("pmap_enter: managed mapping within the clean submap")); 2776 if ((m->oflags & VPO_UNMANAGED) == 0) 2777 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2778 2779 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 2780 ("pmap_enter: flags %u has reserved bits set", flags)); 2781 pa = VM_PAGE_TO_PHYS(m); 2782 newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF); 2783 if ((flags & VM_PROT_WRITE) != 0) 2784 newpte |= PG_M; 2785 if ((flags & VM_PROT_READ) != 0) 2786 newpte |= PG_A; 2787 if (prot & VM_PROT_READ) 2788 newpte |= RPTE_EAA_R; 2789 if ((prot & VM_PROT_WRITE) != 0) 2790 newpte |= RPTE_EAA_W; 2791 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 2792 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 2793 2794 if (prot & VM_PROT_EXECUTE) 2795 newpte |= PG_X; 2796 if ((flags & PMAP_ENTER_WIRED) != 0) 2797 newpte |= PG_W; 2798 if (va >= DMAP_MIN_ADDRESS) 2799 newpte |= RPTE_EAA_P; 2800 newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs); 2801 /* 2802 * Set modified bit gratuitously for writeable mappings if 2803 * the page is unmanaged. We do not want to take a fault 2804 * to do the dirty bit accounting for these mappings. 2805 */ 2806 if ((m->oflags & VPO_UNMANAGED) != 0) { 2807 if ((newpte & PG_RW) != 0) 2808 newpte |= PG_M; 2809 } else 2810 newpte |= PG_MANAGED; 2811 2812 lock = NULL; 2813 PMAP_LOCK(pmap); 2814 if (psind == 1) { 2815 /* Assert the required virtual and physical alignment. */ 2816 KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned")); 2817 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2818 rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock); 2819 goto out; 2820 } 2821 mpte = NULL; 2822 2823 /* 2824 * In the case that a page table page is not 2825 * resident, we are creating it here. 2826 */ 2827 retry: 2828 l3e = pmap_pml3e(pmap, va); 2829 if (l3e != NULL && (*l3e & PG_V) != 0 && ((*l3e & RPTE_LEAF) == 0 || 2830 pmap_demote_l3e_locked(pmap, l3e, va, &lock))) { 2831 pte = pmap_l3e_to_pte(l3e, va); 2832 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 2833 mpte = PHYS_TO_VM_PAGE(*l3e & PG_FRAME); 2834 mpte->ref_count++; 2835 } 2836 } else if (va < VM_MAXUSER_ADDRESS) { 2837 /* 2838 * Here if the pte page isn't mapped, or if it has been 2839 * deallocated. 
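 * Allocating the page table page may sleep and drop the pmap
 * lock, so the lookup is redone from "retry:" above; the retry
 * counter bounds this loop and panics rather than spinning
 * forever.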
2840 */ 2841 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2842 mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va), 2843 nosleep ? NULL : &lock); 2844 if (mpte == NULL && nosleep) { 2845 rv = KERN_RESOURCE_SHORTAGE; 2846 goto out; 2847 } 2848 if (__predict_false(retrycount++ == 6)) 2849 panic("too many retries"); 2850 invalidate_all = true; 2851 goto retry; 2852 } else 2853 panic("pmap_enter: invalid page directory va=%#lx", va); 2854 2855 origpte = *pte; 2856 pv = NULL; 2857 2858 /* 2859 * Is the specified virtual address already mapped? 2860 */ 2861 if ((origpte & PG_V) != 0) { 2862 #ifdef INVARIANTS 2863 if (VERBOSE_PMAP || pmap_logging) { 2864 printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --" 2865 " asid=%lu curpid=%d name=%s origpte0x%lx\n", 2866 pmap, va, m, prot, flags, psind, pmap->pm_pid, 2867 curproc->p_pid, curproc->p_comm, origpte); 2868 pmap_pte_walk(pmap->pm_pml1, va); 2869 } 2870 #endif 2871 /* 2872 * Wiring change, just update stats. We don't worry about 2873 * wiring PT pages as they remain resident as long as there 2874 * are valid mappings in them. Hence, if a user page is wired, 2875 * the PT page will be also. 2876 */ 2877 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 2878 pmap->pm_stats.wired_count++; 2879 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 2880 pmap->pm_stats.wired_count--; 2881 2882 /* 2883 * Remove the extra PT page reference. 2884 */ 2885 if (mpte != NULL) { 2886 mpte->ref_count--; 2887 KASSERT(mpte->ref_count > 0, 2888 ("pmap_enter: missing reference to page table page," 2889 " va: 0x%lx", va)); 2890 } 2891 2892 /* 2893 * Has the physical page changed? 2894 */ 2895 opa = origpte & PG_FRAME; 2896 if (opa == pa) { 2897 /* 2898 * No, might be a protection or wiring change. 2899 */ 2900 if ((origpte & PG_MANAGED) != 0 && 2901 (newpte & PG_RW) != 0) 2902 vm_page_aflag_set(m, PGA_WRITEABLE); 2903 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) { 2904 if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) { 2905 if (!atomic_cmpset_long(pte, origpte, newpte)) 2906 goto retry; 2907 if ((newpte & PG_M) != (origpte & PG_M)) 2908 vm_page_dirty(m); 2909 if ((newpte & PG_A) != (origpte & PG_A)) 2910 vm_page_aflag_set(m, PGA_REFERENCED); 2911 ptesync(); 2912 } else 2913 invalidate_all = true; 2914 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 2915 goto unchanged; 2916 } 2917 goto validate; 2918 } 2919 2920 /* 2921 * The physical page has changed. Temporarily invalidate 2922 * the mapping. This ensures that all threads sharing the 2923 * pmap keep a consistent view of the mapping, which is 2924 * necessary for the correct handling of COW faults. It 2925 * also permits reuse of the old mapping's PV entry, 2926 * avoiding an allocation. 2927 * 2928 * For consistency, handle unmanaged mappings the same way. 2929 */ 2930 origpte = pte_load_clear(pte); 2931 KASSERT((origpte & PG_FRAME) == opa, 2932 ("pmap_enter: unexpected pa update for %#lx", va)); 2933 if ((origpte & PG_MANAGED) != 0) { 2934 om = PHYS_TO_VM_PAGE(opa); 2935 2936 /* 2937 * The pmap lock is sufficient to synchronize with 2938 * concurrent calls to pmap_page_test_mappings() and 2939 * pmap_ts_referenced(). 
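 * Before the old mapping's pv entry is released, its modified and
 * referenced state is transferred to the old page so that the
 * information is not lost.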
2940 */ 2941 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2942 vm_page_dirty(om); 2943 if ((origpte & PG_A) != 0) 2944 vm_page_aflag_set(om, PGA_REFERENCED); 2945 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2946 pv = pmap_pvh_remove(&om->md, pmap, va); 2947 if ((newpte & PG_MANAGED) == 0) 2948 free_pv_entry(pmap, pv); 2949 #ifdef INVARIANTS 2950 else if (origpte & PG_MANAGED) { 2951 if (pv == NULL) { 2952 pmap_page_print_mappings(om); 2953 MPASS(pv != NULL); 2954 } 2955 } 2956 #endif 2957 if ((om->a.flags & PGA_WRITEABLE) != 0 && 2958 TAILQ_EMPTY(&om->md.pv_list) && 2959 ((om->flags & PG_FICTITIOUS) != 0 || 2960 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 2961 vm_page_aflag_clear(om, PGA_WRITEABLE); 2962 } 2963 if ((origpte & PG_A) != 0) 2964 invalidate_page = true; 2965 origpte = 0; 2966 } else { 2967 if (pmap != kernel_pmap) { 2968 #ifdef INVARIANTS 2969 if (VERBOSE_PMAP || pmap_logging) 2970 printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n", 2971 pmap, va, m, prot, flags, psind, 2972 pmap->pm_pid, curproc->p_pid, 2973 curproc->p_comm); 2974 #endif 2975 } 2976 2977 /* 2978 * Increment the counters. 2979 */ 2980 if ((newpte & PG_W) != 0) 2981 pmap->pm_stats.wired_count++; 2982 pmap_resident_count_inc(pmap, 1); 2983 } 2984 2985 /* 2986 * Enter on the PV list if part of our managed memory. 2987 */ 2988 if ((newpte & PG_MANAGED) != 0) { 2989 if (pv == NULL) { 2990 pv = get_pv_entry(pmap, &lock); 2991 pv->pv_va = va; 2992 } 2993 #ifdef VERBOSE_PV 2994 else 2995 printf("reassigning pv: %p to pmap: %p\n", 2996 pv, pmap); 2997 #endif 2998 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 2999 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 3000 m->md.pv_gen++; 3001 if ((newpte & PG_RW) != 0) 3002 vm_page_aflag_set(m, PGA_WRITEABLE); 3003 } 3004 3005 /* 3006 * Update the PTE. 3007 */ 3008 if ((origpte & PG_V) != 0) { 3009 validate: 3010 origpte = pte_load_store(pte, newpte); 3011 KASSERT((origpte & PG_FRAME) == pa, 3012 ("pmap_enter: unexpected pa update for %#lx", va)); 3013 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 3014 (PG_M | PG_RW)) { 3015 if ((origpte & PG_MANAGED) != 0) 3016 vm_page_dirty(m); 3017 invalidate_page = true; 3018 3019 /* 3020 * Although the PTE may still have PG_RW set, TLB 3021 * invalidation may nonetheless be required because 3022 * the PTE no longer has PG_M set. 3023 */ 3024 } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) { 3025 /* 3026 * Removing capabilities requires invalidation on POWER 3027 */ 3028 invalidate_page = true; 3029 goto unchanged; 3030 } 3031 if ((origpte & PG_A) != 0) 3032 invalidate_page = true; 3033 } else { 3034 pte_store(pte, newpte); 3035 ptesync(); 3036 } 3037 unchanged: 3038 3039 #if VM_NRESERVLEVEL > 0 3040 /* 3041 * If both the page table page and the reservation are fully 3042 * populated, then attempt promotion. 3043 */ 3044 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 3045 mmu_radix_ps_enabled(pmap) && 3046 (m->flags & PG_FICTITIOUS) == 0 && 3047 vm_reserv_level_iffullpop(m) == 0 && 3048 pmap_promote_l3e(pmap, l3e, va, &lock) == 0) 3049 invalidate_all = true; 3050 #endif 3051 if (invalidate_all) 3052 pmap_invalidate_all(pmap); 3053 else if (invalidate_page) 3054 pmap_invalidate_page(pmap, va); 3055 3056 rv = KERN_SUCCESS; 3057 out: 3058 if (lock != NULL) 3059 rw_wunlock(lock); 3060 PMAP_UNLOCK(pmap); 3061 3062 return (rv); 3063 } 3064 3065 /* 3066 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 3067 * if successful. 
Returns false if (1) a page table page cannot be allocated 3068 * without sleeping, (2) a mapping already exists at the specified virtual 3069 * address, or (3) a PV entry cannot be allocated without reclaiming another 3070 * PV entry. 3071 */ 3072 static bool 3073 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3074 struct rwlock **lockp) 3075 { 3076 pml3_entry_t newpde; 3077 3078 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3079 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) | 3080 RPTE_LEAF | PG_V; 3081 if ((m->oflags & VPO_UNMANAGED) == 0) 3082 newpde |= PG_MANAGED; 3083 if (prot & VM_PROT_EXECUTE) 3084 newpde |= PG_X; 3085 if (prot & VM_PROT_READ) 3086 newpde |= RPTE_EAA_R; 3087 if (va >= DMAP_MIN_ADDRESS) 3088 newpde |= RPTE_EAA_P; 3089 return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 3090 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 3091 KERN_SUCCESS); 3092 } 3093 3094 /* 3095 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3096 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 3097 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 3098 * a mapping already exists at the specified virtual address. Returns 3099 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 3100 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 3101 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 3102 * 3103 * The parameter "m" is only used when creating a managed, writeable mapping. 3104 */ 3105 static int 3106 pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, 3107 vm_page_t m, struct rwlock **lockp) 3108 { 3109 struct spglist free; 3110 pml3_entry_t oldl3e, *l3e; 3111 vm_page_t mt, pdpg; 3112 3113 KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, 3114 ("pmap_enter_pde: newpde is missing PG_M")); 3115 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3116 3117 if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 3118 NULL : lockp)) == NULL) { 3119 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3120 " in pmap %p", va, pmap); 3121 return (KERN_RESOURCE_SHORTAGE); 3122 } 3123 l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 3124 l3e = &l3e[pmap_pml3e_index(va)]; 3125 oldl3e = *l3e; 3126 if ((oldl3e & PG_V) != 0) { 3127 KASSERT(pdpg->ref_count > 1, 3128 ("pmap_enter_pde: pdpg's wire count is too low")); 3129 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3130 pdpg->ref_count--; 3131 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3132 " in pmap %p", va, pmap); 3133 return (KERN_FAILURE); 3134 } 3135 /* Break the existing mapping(s). */ 3136 SLIST_INIT(&free); 3137 if ((oldl3e & RPTE_LEAF) != 0) { 3138 /* 3139 * The reference to the PD page that was acquired by 3140 * pmap_allocl3e() ensures that it won't be freed. 3141 * However, if the PDE resulted from a promotion, then 3142 * a reserved PT page could be freed. 3143 */ 3144 (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp); 3145 } else { 3146 if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e, 3147 &free, lockp)) 3148 pmap_invalidate_all(pmap); 3149 } 3150 vm_page_free_pages_toq(&free, true); 3151 if (va >= VM_MAXUSER_ADDRESS) { 3152 mt = PHYS_TO_VM_PAGE(*l3e & PG_FRAME); 3153 if (pmap_insert_pt_page(pmap, mt)) { 3154 /* 3155 * XXX Currently, this can't happen because 3156 * we do not perform pmap_enter(psind == 1) 3157 * on the kernel pmap. 
3158 */ 3159 panic("pmap_enter_pde: trie insert failed"); 3160 } 3161 } else 3162 KASSERT(*l3e == 0, ("pmap_enter_pde: non-zero pde %p", 3163 l3e)); 3164 } 3165 if ((newpde & PG_MANAGED) != 0) { 3166 /* 3167 * Abort this mapping if its PV entry could not be created. 3168 */ 3169 if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) { 3170 SLIST_INIT(&free); 3171 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { 3172 /* 3173 * Although "va" is not mapped, paging- 3174 * structure caches could nonetheless have 3175 * entries that refer to the freed page table 3176 * pages. Invalidate those entries. 3177 */ 3178 pmap_invalidate_page(pmap, va); 3179 vm_page_free_pages_toq(&free, true); 3180 } 3181 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3182 " in pmap %p", va, pmap); 3183 return (KERN_RESOURCE_SHORTAGE); 3184 } 3185 if ((newpde & PG_RW) != 0) { 3186 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 3187 vm_page_aflag_set(mt, PGA_WRITEABLE); 3188 } 3189 } 3190 3191 /* 3192 * Increment counters. 3193 */ 3194 if ((newpde & PG_W) != 0) 3195 pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE; 3196 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); 3197 3198 /* 3199 * Map the superpage. (This is not a promoted mapping; there will not 3200 * be any lingering 4KB page mappings in the TLB.) 3201 */ 3202 pte_store(l3e, newpde); 3203 3204 atomic_add_long(&pmap_l3e_mappings, 1); 3205 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3206 " in pmap %p", va, pmap); 3207 return (KERN_SUCCESS); 3208 } 3209 3210 void 3211 mmu_radix_enter_object(pmap_t pmap, vm_offset_t start, 3212 vm_offset_t end, vm_page_t m_start, vm_prot_t prot) 3213 { 3214 3215 struct rwlock *lock; 3216 vm_offset_t va; 3217 vm_page_t m, mpte; 3218 vm_pindex_t diff, psize; 3219 bool invalidate; 3220 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3221 3222 CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, 3223 end, m_start, prot); 3224 3225 invalidate = false; 3226 psize = atop(end - start); 3227 mpte = NULL; 3228 m = m_start; 3229 lock = NULL; 3230 PMAP_LOCK(pmap); 3231 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3232 va = start + ptoa(diff); 3233 if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end && 3234 m->psind == 1 && mmu_radix_ps_enabled(pmap) && 3235 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3236 m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1]; 3237 else 3238 mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot, 3239 mpte, &lock, &invalidate); 3240 m = TAILQ_NEXT(m, listq); 3241 } 3242 ptesync(); 3243 if (lock != NULL) 3244 rw_wunlock(lock); 3245 if (invalidate) 3246 pmap_invalidate_all(pmap); 3247 PMAP_UNLOCK(pmap); 3248 } 3249 3250 static vm_page_t 3251 mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3252 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate) 3253 { 3254 struct spglist free; 3255 pt_entry_t *pte; 3256 vm_paddr_t pa; 3257 3258 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3259 (m->oflags & VPO_UNMANAGED) != 0, 3260 ("mmu_radix_enter_quick_locked: managed mapping within the clean submap")); 3261 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3262 3263 /* 3264 * In the case that a page table page is not 3265 * resident, we are creating it here. 
3266 */ 3267 if (va < VM_MAXUSER_ADDRESS) { 3268 vm_pindex_t ptepindex; 3269 pml3_entry_t *ptepa; 3270 3271 /* 3272 * Calculate pagetable page index 3273 */ 3274 ptepindex = pmap_l3e_pindex(va); 3275 if (mpte && (mpte->pindex == ptepindex)) { 3276 mpte->ref_count++; 3277 } else { 3278 /* 3279 * Get the page directory entry 3280 */ 3281 ptepa = pmap_pml3e(pmap, va); 3282 3283 /* 3284 * If the page table page is mapped, we just increment 3285 * the hold count, and activate it. Otherwise, we 3286 * attempt to allocate a page table page. If this 3287 * attempt fails, we don't retry. Instead, we give up. 3288 */ 3289 if (ptepa && (*ptepa & PG_V) != 0) { 3290 if (*ptepa & RPTE_LEAF) 3291 return (NULL); 3292 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 3293 mpte->ref_count++; 3294 } else { 3295 /* 3296 * Pass NULL instead of the PV list lock 3297 * pointer, because we don't intend to sleep. 3298 */ 3299 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 3300 if (mpte == NULL) 3301 return (mpte); 3302 } 3303 } 3304 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3305 pte = &pte[pmap_pte_index(va)]; 3306 } else { 3307 mpte = NULL; 3308 pte = pmap_pte(pmap, va); 3309 } 3310 if (*pte) { 3311 if (mpte != NULL) { 3312 mpte->ref_count--; 3313 mpte = NULL; 3314 } 3315 return (mpte); 3316 } 3317 3318 /* 3319 * Enter on the PV list if part of our managed memory. 3320 */ 3321 if ((m->oflags & VPO_UNMANAGED) == 0 && 3322 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3323 if (mpte != NULL) { 3324 SLIST_INIT(&free); 3325 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3326 /* 3327 * Although "va" is not mapped, paging- 3328 * structure caches could nonetheless have 3329 * entries that refer to the freed page table 3330 * pages. Invalidate those entries. 3331 */ 3332 *invalidate = true; 3333 vm_page_free_pages_toq(&free, true); 3334 } 3335 mpte = NULL; 3336 } 3337 return (mpte); 3338 } 3339 3340 /* 3341 * Increment counters 3342 */ 3343 pmap_resident_count_inc(pmap, 1); 3344 3345 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs); 3346 if (prot & VM_PROT_EXECUTE) 3347 pa |= PG_X; 3348 else 3349 pa |= RPTE_EAA_R; 3350 if ((m->oflags & VPO_UNMANAGED) == 0) 3351 pa |= PG_MANAGED; 3352 3353 pte_store(pte, pa); 3354 return (mpte); 3355 } 3356 3357 void 3358 mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, 3359 vm_prot_t prot) 3360 { 3361 struct rwlock *lock; 3362 bool invalidate; 3363 3364 lock = NULL; 3365 invalidate = false; 3366 PMAP_LOCK(pmap); 3367 mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock, 3368 &invalidate); 3369 ptesync(); 3370 if (lock != NULL) 3371 rw_wunlock(lock); 3372 if (invalidate) 3373 pmap_invalidate_all(pmap); 3374 PMAP_UNLOCK(pmap); 3375 } 3376 3377 vm_paddr_t 3378 mmu_radix_extract(pmap_t pmap, vm_offset_t va) 3379 { 3380 pml3_entry_t *l3e; 3381 pt_entry_t *pte; 3382 vm_paddr_t pa; 3383 3384 l3e = pmap_pml3e(pmap, va); 3385 if (__predict_false(l3e == NULL)) 3386 return (0); 3387 if (*l3e & RPTE_LEAF) { 3388 pa = (*l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK); 3389 pa |= (va & L3_PAGE_MASK); 3390 } else { 3391 /* 3392 * Beware of a concurrent promotion that changes the 3393 * PDE at this point! For example, vtopte() must not 3394 * be used to access the PTE because it would use the 3395 * new PDE. It is, however, safe to use the old PDE 3396 * because the page table page is preserved by the 3397 * promotion. 
3398 */ 3399 pte = pmap_l3e_to_pte(l3e, va); 3400 if (__predict_false(pte == NULL)) 3401 return (0); 3402 pa = *pte; 3403 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3404 pa |= (va & PAGE_MASK); 3405 } 3406 return (pa); 3407 } 3408 3409 vm_page_t 3410 mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3411 { 3412 pml3_entry_t l3e, *l3ep; 3413 pt_entry_t pte; 3414 vm_paddr_t pa; 3415 vm_page_t m; 3416 3417 pa = 0; 3418 m = NULL; 3419 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); 3420 PMAP_LOCK(pmap); 3421 l3ep = pmap_pml3e(pmap, va); 3422 if (l3ep != NULL && (l3e = *l3ep)) { 3423 if (l3e & RPTE_LEAF) { 3424 if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0) 3425 m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) | 3426 (va & L3_PAGE_MASK)); 3427 } else { 3428 pte = *pmap_l3e_to_pte(l3ep, va); 3429 if ((pte & PG_V) && 3430 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) 3431 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3432 } 3433 if (m != NULL && !vm_page_wire_mapped(m)) 3434 m = NULL; 3435 } 3436 PMAP_UNLOCK(pmap); 3437 return (m); 3438 } 3439 3440 static void 3441 mmu_radix_growkernel(vm_offset_t addr) 3442 { 3443 vm_paddr_t paddr; 3444 vm_page_t nkpg; 3445 pml3_entry_t *l3e; 3446 pml2_entry_t *l2e; 3447 3448 CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); 3449 if (VM_MIN_KERNEL_ADDRESS < addr && 3450 addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE)) 3451 return; 3452 3453 addr = roundup2(addr, L3_PAGE_SIZE); 3454 if (addr - 1 >= vm_map_max(kernel_map)) 3455 addr = vm_map_max(kernel_map); 3456 while (kernel_vm_end < addr) { 3457 l2e = pmap_pml2e(kernel_pmap, kernel_vm_end); 3458 if ((*l2e & PG_V) == 0) { 3459 /* We need a new PDP entry */ 3460 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT, 3461 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 3462 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3463 if (nkpg == NULL) 3464 panic("pmap_growkernel: no memory to grow kernel"); 3465 if ((nkpg->flags & PG_ZERO) == 0) 3466 mmu_radix_zero_page(nkpg); 3467 paddr = VM_PAGE_TO_PHYS(nkpg); 3468 pde_store(l2e, paddr); 3469 continue; /* try again */ 3470 } 3471 l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end); 3472 if ((*l3e & PG_V) != 0) { 3473 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 3474 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3475 kernel_vm_end = vm_map_max(kernel_map); 3476 break; 3477 } 3478 continue; 3479 } 3480 3481 nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end), 3482 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3483 VM_ALLOC_ZERO); 3484 if (nkpg == NULL) 3485 panic("pmap_growkernel: no memory to grow kernel"); 3486 if ((nkpg->flags & PG_ZERO) == 0) 3487 mmu_radix_zero_page(nkpg); 3488 paddr = VM_PAGE_TO_PHYS(nkpg); 3489 pde_store(l3e, paddr); 3490 3491 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 3492 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3493 kernel_vm_end = vm_map_max(kernel_map); 3494 break; 3495 } 3496 } 3497 ptesync(); 3498 } 3499 3500 static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory"); 3501 static uma_zone_t zone_radix_pgd; 3502 3503 static int 3504 radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused, 3505 int flags) 3506 { 3507 3508 for (int i = 0; i < count; i++) { 3509 vm_page_t m = vm_page_alloc_contig(NULL, 0, 3510 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 3511 VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE, 3512 0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE, 3513 VM_MEMATTR_DEFAULT); 3514 /* XXX zero on alloc here so 
we don't have to later */ 3515 store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3516 } 3517 return (count); 3518 } 3519 3520 static void 3521 radix_pgd_release(void *arg __unused, void **store, int count) 3522 { 3523 vm_page_t m; 3524 struct spglist free; 3525 int page_count; 3526 3527 SLIST_INIT(&free); 3528 page_count = RADIX_PGD_SIZE/PAGE_SIZE; 3529 3530 for (int i = 0; i < count; i++) { 3531 /* 3532 * XXX selectively remove dmap and KVA entries so we don't 3533 * need to bzero 3534 */ 3535 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); 3536 for (int j = page_count-1; j >= 0; j--) { 3537 vm_page_unwire_noq(&m[j]); 3538 SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss); 3539 } 3540 vm_page_free_pages_toq(&free, false); 3541 } 3542 } 3543 3544 static void 3545 mmu_radix_init() 3546 { 3547 vm_page_t mpte; 3548 vm_size_t s; 3549 int error, i, pv_npg; 3550 3551 /* L1TF, reserve page @0 unconditionally */ 3552 vm_page_blacklist_add(0, bootverbose); 3553 3554 zone_radix_pgd = uma_zcache_create("radix_pgd_cache", 3555 RADIX_PGD_SIZE, NULL, NULL, 3556 #ifdef INVARIANTS 3557 trash_init, trash_fini, 3558 #else 3559 NULL, NULL, 3560 #endif 3561 radix_pgd_import, radix_pgd_release, 3562 NULL, UMA_ZONE_NOBUCKET); 3563 3564 /* 3565 * Initialize the vm page array entries for the kernel pmap's 3566 * page table pages. 3567 */ 3568 PMAP_LOCK(kernel_pmap); 3569 for (i = 0; i < nkpt; i++) { 3570 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 3571 KASSERT(mpte >= vm_page_array && 3572 mpte < &vm_page_array[vm_page_array_size], 3573 ("pmap_init: page table page is out of range size: %lu", 3574 vm_page_array_size)); 3575 mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i; 3576 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 3577 MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte); 3578 //pmap_insert_pt_page(kernel_pmap, mpte); 3579 mpte->ref_count = 1; 3580 } 3581 PMAP_UNLOCK(kernel_pmap); 3582 vm_wire_add(nkpt); 3583 3584 CTR1(KTR_PMAP, "%s()", __func__); 3585 TAILQ_INIT(&pv_dummy.pv_list); 3586 3587 /* 3588 * Are large page mappings enabled? 3589 */ 3590 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 3591 if (pg_ps_enabled) { 3592 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 3593 ("pmap_init: can't assign to pagesizes[1]")); 3594 pagesizes[1] = L3_PAGE_SIZE; 3595 } 3596 3597 /* 3598 * Initialize the pv chunk list mutex. 3599 */ 3600 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 3601 3602 /* 3603 * Initialize the pool of pv list locks. 3604 */ 3605 for (i = 0; i < NPV_LIST_LOCKS; i++) 3606 rw_init(&pv_list_locks[i], "pmap pv list"); 3607 3608 /* 3609 * Calculate the size of the pv head table for superpages. 3610 */ 3611 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE); 3612 3613 /* 3614 * Allocate memory for the pv head table for superpages. 
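 * One struct md_page is needed for every potential 2MB superpage
 * up to the end of the highest vm_phys segment, which is how
 * pv_npg was computed above.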
3615 */ 3616 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 3617 s = round_page(s); 3618 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 3619 for (i = 0; i < pv_npg; i++) 3620 TAILQ_INIT(&pv_table[i].pv_list); 3621 TAILQ_INIT(&pv_dummy.pv_list); 3622 3623 pmap_initialized = 1; 3624 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 3625 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3626 (vmem_addr_t *)&qframe); 3627 3628 if (error != 0) 3629 panic("qframe allocation failed"); 3630 asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits), 3631 1, 1, M_WAITOK); 3632 } 3633 3634 static boolean_t 3635 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3636 { 3637 struct rwlock *lock; 3638 pv_entry_t pv; 3639 struct md_page *pvh; 3640 pt_entry_t *pte, mask; 3641 pmap_t pmap; 3642 int md_gen, pvh_gen; 3643 boolean_t rv; 3644 3645 rv = FALSE; 3646 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3647 rw_rlock(lock); 3648 restart: 3649 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 3650 pmap = PV_PMAP(pv); 3651 if (!PMAP_TRYLOCK(pmap)) { 3652 md_gen = m->md.pv_gen; 3653 rw_runlock(lock); 3654 PMAP_LOCK(pmap); 3655 rw_rlock(lock); 3656 if (md_gen != m->md.pv_gen) { 3657 PMAP_UNLOCK(pmap); 3658 goto restart; 3659 } 3660 } 3661 pte = pmap_pte(pmap, pv->pv_va); 3662 mask = 0; 3663 if (modified) 3664 mask |= PG_RW | PG_M; 3665 if (accessed) 3666 mask |= PG_V | PG_A; 3667 rv = (*pte & mask) == mask; 3668 PMAP_UNLOCK(pmap); 3669 if (rv) 3670 goto out; 3671 } 3672 if ((m->flags & PG_FICTITIOUS) == 0) { 3673 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3674 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 3675 pmap = PV_PMAP(pv); 3676 if (!PMAP_TRYLOCK(pmap)) { 3677 md_gen = m->md.pv_gen; 3678 pvh_gen = pvh->pv_gen; 3679 rw_runlock(lock); 3680 PMAP_LOCK(pmap); 3681 rw_rlock(lock); 3682 if (md_gen != m->md.pv_gen || 3683 pvh_gen != pvh->pv_gen) { 3684 PMAP_UNLOCK(pmap); 3685 goto restart; 3686 } 3687 } 3688 pte = pmap_pml3e(pmap, pv->pv_va); 3689 mask = 0; 3690 if (modified) 3691 mask |= PG_RW | PG_M; 3692 if (accessed) 3693 mask |= PG_V | PG_A; 3694 rv = (*pte & mask) == mask; 3695 PMAP_UNLOCK(pmap); 3696 if (rv) 3697 goto out; 3698 } 3699 } 3700 out: 3701 rw_runlock(lock); 3702 return (rv); 3703 } 3704 3705 /* 3706 * pmap_is_modified: 3707 * 3708 * Return whether or not the specified physical page was modified 3709 * in any physical maps. 3710 */ 3711 boolean_t 3712 mmu_radix_is_modified(vm_page_t m) 3713 { 3714 3715 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3716 ("pmap_is_modified: page %p is not managed", m)); 3717 3718 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3719 /* 3720 * If the page is not busied then this check is racy. 
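 * A page can only be dirtied through a writeable mapping, so when
 * PGA_WRITEABLE is clear no PTE can have PG_M set and the pv-list
 * walk can be skipped entirely.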
3721 */ 3722 if (!pmap_page_is_write_mapped(m)) 3723 return (FALSE); 3724 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3725 } 3726 3727 boolean_t 3728 mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3729 { 3730 pml3_entry_t *l3e; 3731 pt_entry_t *pte; 3732 boolean_t rv; 3733 3734 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); 3735 rv = FALSE; 3736 PMAP_LOCK(pmap); 3737 l3e = pmap_pml3e(pmap, addr); 3738 if (l3e != NULL && (*l3e & (RPTE_LEAF | PG_V)) == PG_V) { 3739 pte = pmap_l3e_to_pte(l3e, addr); 3740 rv = (*pte & PG_V) == 0; 3741 } 3742 PMAP_UNLOCK(pmap); 3743 return (rv); 3744 } 3745 3746 boolean_t 3747 mmu_radix_is_referenced(vm_page_t m) 3748 { 3749 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3750 ("pmap_is_referenced: page %p is not managed", m)); 3751 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3752 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3753 } 3754 3755 /* 3756 * pmap_ts_referenced: 3757 * 3758 * Return a count of reference bits for a page, clearing those bits. 3759 * It is not necessary for every reference bit to be cleared, but it 3760 * is necessary that 0 only be returned when there are truly no 3761 * reference bits set. 3762 * 3763 * As an optimization, update the page's dirty field if a modified bit is 3764 * found while counting reference bits. This opportunistic update can be 3765 * performed at low cost and can eliminate the need for some future calls 3766 * to pmap_is_modified(). However, since this function stops after 3767 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3768 * dirty pages. Those dirty pages will only be detected by a future call 3769 * to pmap_is_modified(). 3770 * 3771 * A DI block is not needed within this function, because 3772 * invalidations are performed before the PV list lock is 3773 * released. 3774 */ 3775 boolean_t 3776 mmu_radix_ts_referenced(vm_page_t m) 3777 { 3778 struct md_page *pvh; 3779 pv_entry_t pv, pvf; 3780 pmap_t pmap; 3781 struct rwlock *lock; 3782 pml3_entry_t oldl3e, *l3e; 3783 pt_entry_t *pte; 3784 vm_paddr_t pa; 3785 int cleared, md_gen, not_cleared, pvh_gen; 3786 struct spglist free; 3787 3788 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 3789 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3790 ("pmap_ts_referenced: page %p is not managed", m)); 3791 SLIST_INIT(&free); 3792 cleared = 0; 3793 pa = VM_PAGE_TO_PHYS(m); 3794 lock = PHYS_TO_PV_LIST_LOCK(pa); 3795 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 3796 rw_wlock(lock); 3797 retry: 3798 not_cleared = 0; 3799 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 3800 goto small_mappings; 3801 pv = pvf; 3802 do { 3803 if (pvf == NULL) 3804 pvf = pv; 3805 pmap = PV_PMAP(pv); 3806 if (!PMAP_TRYLOCK(pmap)) { 3807 pvh_gen = pvh->pv_gen; 3808 rw_wunlock(lock); 3809 PMAP_LOCK(pmap); 3810 rw_wlock(lock); 3811 if (pvh_gen != pvh->pv_gen) { 3812 PMAP_UNLOCK(pmap); 3813 goto retry; 3814 } 3815 } 3816 l3e = pmap_pml3e(pmap, pv->pv_va); 3817 oldl3e = *l3e; 3818 if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3819 /* 3820 * Although "oldpde" is mapping a 2MB page, because 3821 * this function is called at a 4KB page granularity, 3822 * we only update the 4KB page under test. 3823 */ 3824 vm_page_dirty(m); 3825 } 3826 if ((oldl3e & PG_A) != 0) { 3827 /* 3828 * Since this reference bit is shared by 512 4KB 3829 * pages, it should not be cleared every time it is 3830 * tested. 
Apply a simple "hash" function on the 3831 * physical page number, the virtual superpage number, 3832 * and the pmap address to select one 4KB page out of 3833 * the 512 on which testing the reference bit will 3834 * result in clearing that reference bit. This 3835 * function is designed to avoid the selection of the 3836 * same 4KB page for every 2MB page mapping. 3837 * 3838 * On demotion, a mapping that hasn't been referenced 3839 * is simply destroyed. To avoid the possibility of a 3840 * subsequent page fault on a demoted wired mapping, 3841 * always leave its reference bit set. Moreover, 3842 * since the superpage is wired, the current state of 3843 * its reference bit won't affect page replacement. 3844 */ 3845 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ 3846 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 3847 (oldl3e & PG_W) == 0) { 3848 atomic_clear_long(l3e, PG_A); 3849 pmap_invalidate_page(pmap, pv->pv_va); 3850 cleared++; 3851 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3852 ("inconsistent pv lock %p %p for page %p", 3853 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3854 } else 3855 not_cleared++; 3856 } 3857 PMAP_UNLOCK(pmap); 3858 /* Rotate the PV list if it has more than one entry. */ 3859 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { 3860 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 3861 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 3862 pvh->pv_gen++; 3863 } 3864 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 3865 goto out; 3866 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 3867 small_mappings: 3868 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 3869 goto out; 3870 pv = pvf; 3871 do { 3872 if (pvf == NULL) 3873 pvf = pv; 3874 pmap = PV_PMAP(pv); 3875 if (!PMAP_TRYLOCK(pmap)) { 3876 pvh_gen = pvh->pv_gen; 3877 md_gen = m->md.pv_gen; 3878 rw_wunlock(lock); 3879 PMAP_LOCK(pmap); 3880 rw_wlock(lock); 3881 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3882 PMAP_UNLOCK(pmap); 3883 goto retry; 3884 } 3885 } 3886 l3e = pmap_pml3e(pmap, pv->pv_va); 3887 KASSERT((*l3e & RPTE_LEAF) == 0, 3888 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 3889 m)); 3890 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 3891 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3892 vm_page_dirty(m); 3893 if ((*pte & PG_A) != 0) { 3894 atomic_clear_long(pte, PG_A); 3895 pmap_invalidate_page(pmap, pv->pv_va); 3896 cleared++; 3897 } 3898 PMAP_UNLOCK(pmap); 3899 /* Rotate the PV list if it has more than one entry. 
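 * Rotating shifts the starting point for the next pass, so successive
 * calls do not keep examining the same mapping first.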
*/ 3900 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { 3901 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 3902 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); 3903 m->md.pv_gen++; 3904 } 3905 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 3906 not_cleared < PMAP_TS_REFERENCED_MAX); 3907 out: 3908 rw_wunlock(lock); 3909 vm_page_free_pages_toq(&free, true); 3910 return (cleared + not_cleared); 3911 } 3912 3913 static vm_offset_t 3914 mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start, 3915 vm_paddr_t end, int prot __unused) 3916 { 3917 3918 CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, 3919 prot); 3920 return (PHYS_TO_DMAP(start)); 3921 } 3922 3923 void 3924 mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr, 3925 vm_object_t object, vm_pindex_t pindex, vm_size_t size) 3926 { 3927 pml3_entry_t *l3e; 3928 vm_paddr_t pa, ptepa; 3929 vm_page_t p, pdpg; 3930 vm_memattr_t ma; 3931 3932 CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, 3933 object, pindex, size); 3934 VM_OBJECT_ASSERT_WLOCKED(object); 3935 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3936 ("pmap_object_init_pt: non-device object")); 3937 /* NB: size can be logically ored with addr here */ 3938 if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) { 3939 if (!mmu_radix_ps_enabled(pmap)) 3940 return; 3941 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3942 return; 3943 p = vm_page_lookup(object, pindex); 3944 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3945 ("pmap_object_init_pt: invalid page %p", p)); 3946 ma = p->md.mdpg_cache_attrs; 3947 3948 /* 3949 * Abort the mapping if the first page is not physically 3950 * aligned to a 2MB page boundary. 3951 */ 3952 ptepa = VM_PAGE_TO_PHYS(p); 3953 if (ptepa & L3_PAGE_MASK) 3954 return; 3955 3956 /* 3957 * Skip the first page. Abort the mapping if the rest of 3958 * the pages are not physically contiguous or have differing 3959 * memory attributes. 3960 */ 3961 p = TAILQ_NEXT(p, listq); 3962 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3963 pa += PAGE_SIZE) { 3964 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3965 ("pmap_object_init_pt: invalid page %p", p)); 3966 if (pa != VM_PAGE_TO_PHYS(p) || 3967 ma != p->md.mdpg_cache_attrs) 3968 return; 3969 p = TAILQ_NEXT(p, listq); 3970 } 3971 3972 PMAP_LOCK(pmap); 3973 for (pa = ptepa | pmap_cache_bits(ma); 3974 pa < ptepa + size; pa += L3_PAGE_SIZE) { 3975 pdpg = pmap_allocl3e(pmap, addr, NULL); 3976 if (pdpg == NULL) { 3977 /* 3978 * The creation of mappings below is only an 3979 * optimization. If a page directory page 3980 * cannot be allocated without blocking, 3981 * continue on to the next mapping rather than 3982 * blocking. 3983 */ 3984 addr += L3_PAGE_SIZE; 3985 continue; 3986 } 3987 l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 3988 l3e = &l3e[pmap_pml3e_index(addr)]; 3989 if ((*l3e & PG_V) == 0) { 3990 pa |= PG_M | PG_A | PG_RW; 3991 pte_store(l3e, pa); 3992 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); 3993 atomic_add_long(&pmap_l3e_mappings, 1); 3994 } else { 3995 /* Continue on if the PDE is already valid. 
*/ 3996 pdpg->ref_count--; 3997 KASSERT(pdpg->ref_count > 0, 3998 ("pmap_object_init_pt: missing reference " 3999 "to page directory page, va: 0x%lx", addr)); 4000 } 4001 addr += L3_PAGE_SIZE; 4002 } 4003 ptesync(); 4004 PMAP_UNLOCK(pmap); 4005 } 4006 } 4007 4008 boolean_t 4009 mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m) 4010 { 4011 struct md_page *pvh; 4012 struct rwlock *lock; 4013 pv_entry_t pv; 4014 int loops = 0; 4015 boolean_t rv; 4016 4017 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4018 ("pmap_page_exists_quick: page %p is not managed", m)); 4019 CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); 4020 rv = FALSE; 4021 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4022 rw_rlock(lock); 4023 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 4024 if (PV_PMAP(pv) == pmap) { 4025 rv = TRUE; 4026 break; 4027 } 4028 loops++; 4029 if (loops >= 16) 4030 break; 4031 } 4032 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4033 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4034 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 4035 if (PV_PMAP(pv) == pmap) { 4036 rv = TRUE; 4037 break; 4038 } 4039 loops++; 4040 if (loops >= 16) 4041 break; 4042 } 4043 } 4044 rw_runlock(lock); 4045 return (rv); 4046 } 4047 4048 void 4049 mmu_radix_page_init(vm_page_t m) 4050 { 4051 4052 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 4053 TAILQ_INIT(&m->md.pv_list); 4054 m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; 4055 } 4056 4057 int 4058 mmu_radix_page_wired_mappings(vm_page_t m) 4059 { 4060 struct rwlock *lock; 4061 struct md_page *pvh; 4062 pmap_t pmap; 4063 pt_entry_t *pte; 4064 pv_entry_t pv; 4065 int count, md_gen, pvh_gen; 4066 4067 if ((m->oflags & VPO_UNMANAGED) != 0) 4068 return (0); 4069 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 4070 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4071 rw_rlock(lock); 4072 restart: 4073 count = 0; 4074 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 4075 pmap = PV_PMAP(pv); 4076 if (!PMAP_TRYLOCK(pmap)) { 4077 md_gen = m->md.pv_gen; 4078 rw_runlock(lock); 4079 PMAP_LOCK(pmap); 4080 rw_rlock(lock); 4081 if (md_gen != m->md.pv_gen) { 4082 PMAP_UNLOCK(pmap); 4083 goto restart; 4084 } 4085 } 4086 pte = pmap_pte(pmap, pv->pv_va); 4087 if ((*pte & PG_W) != 0) 4088 count++; 4089 PMAP_UNLOCK(pmap); 4090 } 4091 if ((m->flags & PG_FICTITIOUS) == 0) { 4092 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4093 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { 4094 pmap = PV_PMAP(pv); 4095 if (!PMAP_TRYLOCK(pmap)) { 4096 md_gen = m->md.pv_gen; 4097 pvh_gen = pvh->pv_gen; 4098 rw_runlock(lock); 4099 PMAP_LOCK(pmap); 4100 rw_rlock(lock); 4101 if (md_gen != m->md.pv_gen || 4102 pvh_gen != pvh->pv_gen) { 4103 PMAP_UNLOCK(pmap); 4104 goto restart; 4105 } 4106 } 4107 pte = pmap_pml3e(pmap, pv->pv_va); 4108 if ((*pte & PG_W) != 0) 4109 count++; 4110 PMAP_UNLOCK(pmap); 4111 } 4112 } 4113 rw_runlock(lock); 4114 return (count); 4115 } 4116 4117 static void 4118 mmu_radix_update_proctab(int pid, pml1_entry_t l1pa) 4119 { 4120 isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT); 4121 } 4122 4123 int 4124 mmu_radix_pinit(pmap_t pmap) 4125 { 4126 vmem_addr_t pid; 4127 vm_paddr_t l1pa; 4128 4129 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4130 4131 /* 4132 * allocate the page directory page 4133 */ 4134 pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK); 4135 4136 for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++) 4137 pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE); 4138 pmap->pm_radix.rt_root = 0; 4139 TAILQ_INIT(&pmap->pm_pvchunk); 4140 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4141 pmap->pm_flags = 
PMAP_PDE_SUPERPAGE; 4142 vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid); 4143 4144 pmap->pm_pid = pid; 4145 l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1); 4146 mmu_radix_update_proctab(pid, l1pa); 4147 __asm __volatile("ptesync;isync" : : : "memory"); 4148 4149 return (1); 4150 } 4151 4152 /* 4153 * This routine is called if the desired page table page does not exist. 4154 * 4155 * If page table page allocation fails, this routine may sleep before 4156 * returning NULL. It sleeps only if a lock pointer was given. 4157 * 4158 * Note: If a page allocation fails at page table level two or three, 4159 * one or two pages may be held during the wait, only to be released 4160 * afterwards. This conservative approach is easily argued to avoid 4161 * race conditions. 4162 */ 4163 static vm_page_t 4164 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 4165 { 4166 vm_page_t m, pdppg, pdpg; 4167 4168 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4169 4170 /* 4171 * Allocate a page table page. 4172 */ 4173 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 4174 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 4175 if (lockp != NULL) { 4176 RELEASE_PV_LIST_LOCK(lockp); 4177 PMAP_UNLOCK(pmap); 4178 vm_wait(NULL); 4179 PMAP_LOCK(pmap); 4180 } 4181 /* 4182 * Indicate the need to retry. While waiting, the page table 4183 * page may have been allocated. 4184 */ 4185 return (NULL); 4186 } 4187 if ((m->flags & PG_ZERO) == 0) 4188 mmu_radix_zero_page(m); 4189 4190 /* 4191 * Map the pagetable page into the process address space, if 4192 * it isn't already there. 4193 */ 4194 4195 if (ptepindex >= (NUPDE + NUPDPE)) { 4196 pml1_entry_t *l1e; 4197 vm_pindex_t pml1index; 4198 4199 /* Wire up a new PDPE page */ 4200 pml1index = ptepindex - (NUPDE + NUPDPE); 4201 l1e = &pmap->pm_pml1[pml1index]; 4202 pde_store(l1e, VM_PAGE_TO_PHYS(m)); 4203 4204 } else if (ptepindex >= NUPDE) { 4205 vm_pindex_t pml1index; 4206 vm_pindex_t pdpindex; 4207 pml1_entry_t *l1e; 4208 pml2_entry_t *l2e; 4209 4210 /* Wire up a new l2e page */ 4211 pdpindex = ptepindex - NUPDE; 4212 pml1index = pdpindex >> RPTE_SHIFT; 4213 4214 l1e = &pmap->pm_pml1[pml1index]; 4215 if ((*l1e & PG_V) == 0) { 4216 /* Have to allocate a new pdp, recurse */ 4217 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index, 4218 lockp) == NULL) { 4219 vm_page_unwire_noq(m); 4220 vm_page_free_zero(m); 4221 return (NULL); 4222 } 4223 } else { 4224 /* Add reference to l2e page */ 4225 pdppg = PHYS_TO_VM_PAGE(*l1e & PG_FRAME); 4226 pdppg->ref_count++; 4227 } 4228 l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); 4229 4230 /* Now find the pdp page */ 4231 l2e = &l2e[pdpindex & RPTE_MASK]; 4232 pde_store(l2e, VM_PAGE_TO_PHYS(m)); 4233 4234 } else { 4235 vm_pindex_t pml1index; 4236 vm_pindex_t pdpindex; 4237 pml1_entry_t *l1e; 4238 pml2_entry_t *l2e; 4239 pml3_entry_t *l3e; 4240 4241 /* Wire up a new PTE page */ 4242 pdpindex = ptepindex >> RPTE_SHIFT; 4243 pml1index = pdpindex >> RPTE_SHIFT; 4244 4245 /* First, find the pdp and check that its valid. 
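 * If the l1 entry is invalid, the missing intermediate page(s) are
 * allocated by a recursive call before the l2 entry can be examined.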
*/ 4246 l1e = &pmap->pm_pml1[pml1index]; 4247 if ((*l1e & PG_V) == 0) { 4248 /* Have to allocate a new pd, recurse */ 4249 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 4250 lockp) == NULL) { 4251 vm_page_unwire_noq(m); 4252 vm_page_free_zero(m); 4253 return (NULL); 4254 } 4255 l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); 4256 l2e = &l2e[pdpindex & RPTE_MASK]; 4257 } else { 4258 l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); 4259 l2e = &l2e[pdpindex & RPTE_MASK]; 4260 if ((*l2e & PG_V) == 0) { 4261 /* Have to allocate a new pd, recurse */ 4262 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 4263 lockp) == NULL) { 4264 vm_page_unwire_noq(m); 4265 vm_page_free_zero(m); 4266 return (NULL); 4267 } 4268 } else { 4269 /* Add reference to the pd page */ 4270 pdpg = PHYS_TO_VM_PAGE(*l2e & PG_FRAME); 4271 pdpg->ref_count++; 4272 } 4273 } 4274 l3e = (pml3_entry_t *)PHYS_TO_DMAP(*l2e & PG_FRAME); 4275 4276 /* Now we know where the page directory page is */ 4277 l3e = &l3e[ptepindex & RPTE_MASK]; 4278 pde_store(l3e, VM_PAGE_TO_PHYS(m)); 4279 } 4280 4281 pmap_resident_count_inc(pmap, 1); 4282 return (m); 4283 } 4284 static vm_page_t 4285 pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4286 { 4287 vm_pindex_t pdpindex, ptepindex; 4288 pml2_entry_t *pdpe; 4289 vm_page_t pdpg; 4290 4291 retry: 4292 pdpe = pmap_pml2e(pmap, va); 4293 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 4294 /* Add a reference to the pd page. */ 4295 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 4296 pdpg->ref_count++; 4297 } else { 4298 /* Allocate a pd page. */ 4299 ptepindex = pmap_l3e_pindex(va); 4300 pdpindex = ptepindex >> RPTE_SHIFT; 4301 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 4302 if (pdpg == NULL && lockp != NULL) 4303 goto retry; 4304 } 4305 return (pdpg); 4306 } 4307 4308 static vm_page_t 4309 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4310 { 4311 vm_pindex_t ptepindex; 4312 pml3_entry_t *pd; 4313 vm_page_t m; 4314 4315 /* 4316 * Calculate pagetable page index 4317 */ 4318 ptepindex = pmap_l3e_pindex(va); 4319 retry: 4320 /* 4321 * Get the page directory entry 4322 */ 4323 pd = pmap_pml3e(pmap, va); 4324 4325 /* 4326 * This supports switching from a 2MB page to a 4327 * normal 4K page. 4328 */ 4329 if (pd != NULL && (*pd & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) { 4330 if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) { 4331 /* 4332 * Invalidation of the 2MB page mapping may have caused 4333 * the deallocation of the underlying PD page. 4334 */ 4335 pd = NULL; 4336 } 4337 } 4338 4339 /* 4340 * If the page table page is mapped, we just increment the 4341 * hold count, and activate it. 4342 */ 4343 if (pd != NULL && (*pd & PG_V) != 0) { 4344 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 4345 m->ref_count++; 4346 } else { 4347 /* 4348 * Here if the pte page isn't mapped, or if it has been 4349 * deallocated. 
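 * Allocate it with _pmap_allocpte(); a NULL return together with a
 * non-NULL lock pointer means the allocation slept, so retry the lookup.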
4350 */ 4351 m = _pmap_allocpte(pmap, ptepindex, lockp); 4352 if (m == NULL && lockp != NULL) 4353 goto retry; 4354 } 4355 return (m); 4356 } 4357 4358 static void 4359 mmu_radix_pinit0(pmap_t pmap) 4360 { 4361 4362 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4363 PMAP_LOCK_INIT(pmap); 4364 pmap->pm_pml1 = kernel_pmap->pm_pml1; 4365 pmap->pm_pid = kernel_pmap->pm_pid; 4366 4367 pmap->pm_radix.rt_root = 0; 4368 TAILQ_INIT(&pmap->pm_pvchunk); 4369 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4370 kernel_pmap->pm_flags = 4371 pmap->pm_flags = PMAP_PDE_SUPERPAGE; 4372 } 4373 /* 4374 * pmap_protect_l3e: do the things to protect a 2mpage in a process 4375 */ 4376 static boolean_t 4377 pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot) 4378 { 4379 pt_entry_t newpde, oldpde; 4380 vm_offset_t eva, va; 4381 vm_page_t m; 4382 boolean_t anychanged; 4383 4384 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4385 KASSERT((sva & L3_PAGE_MASK) == 0, 4386 ("pmap_protect_l3e: sva is not 2mpage aligned")); 4387 anychanged = FALSE; 4388 retry: 4389 oldpde = newpde = *l3e; 4390 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 4391 (PG_MANAGED | PG_M | PG_RW)) { 4392 eva = sva + L3_PAGE_SIZE; 4393 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4394 va < eva; va += PAGE_SIZE, m++) 4395 vm_page_dirty(m); 4396 } 4397 if ((prot & VM_PROT_WRITE) == 0) { 4398 newpde &= ~(PG_RW | PG_M); 4399 newpde |= RPTE_EAA_R; 4400 } 4401 if (prot & VM_PROT_EXECUTE) 4402 newpde |= PG_X; 4403 if (newpde != oldpde) { 4404 /* 4405 * As an optimization to future operations on this PDE, clear 4406 * PG_PROMOTED. The impending invalidation will remove any 4407 * lingering 4KB page mappings from the TLB. 4408 */ 4409 if (!atomic_cmpset_long(l3e, oldpde, newpde & ~PG_PROMOTED)) 4410 goto retry; 4411 anychanged = TRUE; 4412 } 4413 return (anychanged); 4414 } 4415 4416 void 4417 mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 4418 vm_prot_t prot) 4419 { 4420 vm_offset_t va_next; 4421 pml1_entry_t *l1e; 4422 pml2_entry_t *l2e; 4423 pml3_entry_t ptpaddr, *l3e; 4424 pt_entry_t *pte; 4425 boolean_t anychanged; 4426 4427 CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva, 4428 prot); 4429 4430 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4431 if (prot == VM_PROT_NONE) { 4432 mmu_radix_remove(pmap, sva, eva); 4433 return; 4434 } 4435 4436 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 4437 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 4438 return; 4439 4440 #ifdef INVARIANTS 4441 if (VERBOSE_PROTECT || pmap_logging) 4442 printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n", 4443 pmap, sva, eva, prot, pmap->pm_pid); 4444 #endif 4445 anychanged = FALSE; 4446 4447 PMAP_LOCK(pmap); 4448 for (; sva < eva; sva = va_next) { 4449 l1e = pmap_pml1e(pmap, sva); 4450 if ((*l1e & PG_V) == 0) { 4451 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 4452 if (va_next < sva) 4453 va_next = eva; 4454 continue; 4455 } 4456 4457 l2e = pmap_l1e_to_l2e(l1e, sva); 4458 if ((*l2e & PG_V) == 0) { 4459 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 4460 if (va_next < sva) 4461 va_next = eva; 4462 continue; 4463 } 4464 4465 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 4466 if (va_next < sva) 4467 va_next = eva; 4468 4469 l3e = pmap_l2e_to_l3e(l2e, sva); 4470 ptpaddr = *l3e; 4471 4472 /* 4473 * Weed out invalid mappings. 4474 */ 4475 if (ptpaddr == 0) 4476 continue; 4477 4478 /* 4479 * Check for large page. 4480 */ 4481 if ((ptpaddr & RPTE_LEAF) != 0) { 4482 /* 4483 * Are we protecting the entire large page? 
If not, 4484 * demote the mapping and fall through. 4485 */ 4486 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 4487 if (pmap_protect_l3e(pmap, l3e, sva, prot)) 4488 anychanged = TRUE; 4489 continue; 4490 } else if (!pmap_demote_l3e(pmap, l3e, sva)) { 4491 /* 4492 * The large page mapping was destroyed. 4493 */ 4494 continue; 4495 } 4496 } 4497 4498 if (va_next > eva) 4499 va_next = eva; 4500 4501 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, 4502 sva += PAGE_SIZE) { 4503 pt_entry_t obits, pbits; 4504 vm_page_t m; 4505 4506 retry: 4507 MPASS(pte == pmap_pte(pmap, sva)); 4508 obits = pbits = *pte; 4509 if ((pbits & PG_V) == 0) 4510 continue; 4511 4512 if ((prot & VM_PROT_WRITE) == 0) { 4513 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 4514 (PG_MANAGED | PG_M | PG_RW)) { 4515 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4516 vm_page_dirty(m); 4517 } 4518 pbits &= ~(PG_RW | PG_M); 4519 pbits |= RPTE_EAA_R; 4520 } 4521 if (prot & VM_PROT_EXECUTE) 4522 pbits |= PG_X; 4523 4524 if (pbits != obits) { 4525 if (!atomic_cmpset_long(pte, obits, pbits)) 4526 goto retry; 4527 if (obits & (PG_A|PG_M)) { 4528 anychanged = TRUE; 4529 #ifdef INVARIANTS 4530 if (VERBOSE_PROTECT || pmap_logging) 4531 printf("%#lx %#lx -> %#lx\n", 4532 sva, obits, pbits); 4533 #endif 4534 } 4535 } 4536 } 4537 } 4538 if (anychanged) 4539 pmap_invalidate_all(pmap); 4540 PMAP_UNLOCK(pmap); 4541 } 4542 4543 void 4544 mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count) 4545 { 4546 4547 CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count); 4548 pt_entry_t oldpte, pa, *pte; 4549 vm_page_t m; 4550 uint64_t cache_bits, attr_bits; 4551 vm_offset_t va; 4552 4553 oldpte = 0; 4554 attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; 4555 va = sva; 4556 pte = kvtopte(va); 4557 while (va < sva + PAGE_SIZE * count) { 4558 if (__predict_false((va & L3_PAGE_MASK) == 0)) 4559 pte = kvtopte(va); 4560 MPASS(pte == pmap_pte(kernel_pmap, va)); 4561 4562 /* 4563 * XXX there has to be a more efficient way than traversing 4564 * the page table every time - but go for correctness for 4565 * today 4566 */ 4567 4568 m = *ma++; 4569 cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs); 4570 pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits; 4571 if (*pte != pa) { 4572 oldpte |= *pte; 4573 pte_store(pte, pa); 4574 } 4575 va += PAGE_SIZE; 4576 pte++; 4577 } 4578 if (__predict_false((oldpte & RPTE_VALID) != 0)) 4579 pmap_invalidate_range(kernel_pmap, sva, sva + count * 4580 PAGE_SIZE); 4581 else 4582 ptesync(); 4583 } 4584 4585 void 4586 mmu_radix_qremove(vm_offset_t sva, int count) 4587 { 4588 vm_offset_t va; 4589 pt_entry_t *pte; 4590 4591 CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count); 4592 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva)); 4593 4594 va = sva; 4595 pte = kvtopte(va); 4596 while (va < sva + PAGE_SIZE * count) { 4597 if (__predict_false((va & L3_PAGE_MASK) == 0)) 4598 pte = kvtopte(va); 4599 pte_clear(pte); 4600 pte++; 4601 va += PAGE_SIZE; 4602 } 4603 pmap_invalidate_range(kernel_pmap, sva, va); 4604 } 4605 4606 /*************************************************** 4607 * Page table page management routines..... 4608 ***************************************************/ 4609 /* 4610 * Schedule the specified unused page table page to be freed. Specifically, 4611 * add the page to the specified list of pages that will be released to the 4612 * physical memory manager after the TLB has been updated. 
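 * Deferring the free until after the TLB shootdown keeps the page from
 * being recycled while stale translations could still reference it.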
4613 */ 4614 static __inline void 4615 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 4616 boolean_t set_PG_ZERO) 4617 { 4618 4619 if (set_PG_ZERO) 4620 m->flags |= PG_ZERO; 4621 else 4622 m->flags &= ~PG_ZERO; 4623 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4624 } 4625 4626 /* 4627 * Inserts the specified page table page into the specified pmap's collection 4628 * of idle page table pages. Each of a pmap's page table pages is responsible 4629 * for mapping a distinct range of virtual addresses. The pmap's collection is 4630 * ordered by this virtual address range. 4631 */ 4632 static __inline int 4633 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 4634 { 4635 4636 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4637 return (vm_radix_insert(&pmap->pm_radix, mpte)); 4638 } 4639 4640 /* 4641 * Removes the page table page mapping the specified virtual address from the 4642 * specified pmap's collection of idle page table pages, and returns it. 4643 * Otherwise, returns NULL if there is no page table page corresponding to the 4644 * specified virtual address. 4645 */ 4646 static __inline vm_page_t 4647 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4648 { 4649 4650 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4651 return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va))); 4652 } 4653 4654 /* 4655 * Decrements a page table page's wire count, which is used to record the 4656 * number of valid page table entries within the page. If the wire count 4657 * drops to zero, then the page table page is unmapped. Returns TRUE if the 4658 * page table page was unmapped and FALSE otherwise. 4659 */ 4660 static inline boolean_t 4661 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4662 { 4663 4664 --m->ref_count; 4665 if (m->ref_count == 0) { 4666 _pmap_unwire_ptp(pmap, va, m, free); 4667 return (TRUE); 4668 } else 4669 return (FALSE); 4670 } 4671 4672 static void 4673 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4674 { 4675 4676 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4677 /* 4678 * unmap the page table page 4679 */ 4680 if (m->pindex >= (NUPDE + NUPDPE)) { 4681 /* PDP page */ 4682 pml1_entry_t *pml1; 4683 pml1 = pmap_pml1e(pmap, va); 4684 *pml1 = 0; 4685 } else if (m->pindex >= NUPDE) { 4686 /* PD page */ 4687 pml2_entry_t *l2e; 4688 l2e = pmap_pml2e(pmap, va); 4689 *l2e = 0; 4690 } else { 4691 /* PTE page */ 4692 pml3_entry_t *l3e; 4693 l3e = pmap_pml3e(pmap, va); 4694 *l3e = 0; 4695 } 4696 pmap_resident_count_dec(pmap, 1); 4697 if (m->pindex < NUPDE) { 4698 /* We just released a PT, unhold the matching PD */ 4699 vm_page_t pdpg; 4700 4701 pdpg = PHYS_TO_VM_PAGE(*pmap_pml2e(pmap, va) & PG_FRAME); 4702 pmap_unwire_ptp(pmap, va, pdpg, free); 4703 } 4704 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 4705 /* We just released a PD, unhold the matching PDP */ 4706 vm_page_t pdppg; 4707 4708 pdppg = PHYS_TO_VM_PAGE(*pmap_pml1e(pmap, va) & PG_FRAME); 4709 pmap_unwire_ptp(pmap, va, pdppg, free); 4710 } 4711 4712 /* 4713 * Put page on a list so that it is released after 4714 * *ALL* TLB shootdown is done 4715 */ 4716 pmap_add_delayed_free_list(m, free, TRUE); 4717 } 4718 4719 /* 4720 * After removing a page table entry, this routine is used to 4721 * conditionally free the page, and manage the hold/wire counts. 
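 * Kernel page table pages are never freed this way; addresses at or above
 * VM_MAXUSER_ADDRESS are simply ignored.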
4722 */ 4723 static int 4724 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde, 4725 struct spglist *free) 4726 { 4727 vm_page_t mpte; 4728 4729 if (va >= VM_MAXUSER_ADDRESS) 4730 return (0); 4731 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4732 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4733 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4734 } 4735 4736 void 4737 mmu_radix_release(pmap_t pmap) 4738 { 4739 4740 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 4741 KASSERT(pmap->pm_stats.resident_count == 0, 4742 ("pmap_release: pmap resident count %ld != 0", 4743 pmap->pm_stats.resident_count)); 4744 KASSERT(vm_radix_is_empty(&pmap->pm_radix), 4745 ("pmap_release: pmap has reserved page table page(s)")); 4746 4747 pmap_invalidate_all(pmap); 4748 isa3_proctab[pmap->pm_pid].proctab0 = 0; 4749 uma_zfree(zone_radix_pgd, pmap->pm_pml1); 4750 vmem_free(asid_arena, pmap->pm_pid, 1); 4751 } 4752 4753 /* 4754 * Create the PV entry for a 2MB page mapping. Always returns true unless the 4755 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 4756 * false if the PV entry cannot be allocated without resorting to reclamation. 4757 */ 4758 static bool 4759 pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags, 4760 struct rwlock **lockp) 4761 { 4762 struct md_page *pvh; 4763 pv_entry_t pv; 4764 vm_paddr_t pa; 4765 4766 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4767 /* Pass NULL instead of the lock pointer to disable reclamation. */ 4768 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 4769 NULL : lockp)) == NULL) 4770 return (false); 4771 pv->pv_va = va; 4772 pa = pde & PG_PS_FRAME; 4773 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4774 pvh = pa_to_pvh(pa); 4775 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); 4776 pvh->pv_gen++; 4777 return (true); 4778 } 4779 4780 /* 4781 * Fills a page table page with mappings to consecutive physical pages. 4782 */ 4783 static void 4784 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 4785 { 4786 pt_entry_t *pte; 4787 4788 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 4789 *pte = newpte; 4790 newpte += PAGE_SIZE; 4791 } 4792 } 4793 4794 static boolean_t 4795 pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va) 4796 { 4797 struct rwlock *lock; 4798 boolean_t rv; 4799 4800 lock = NULL; 4801 rv = pmap_demote_l3e_locked(pmap, pde, va, &lock); 4802 if (lock != NULL) 4803 rw_wunlock(lock); 4804 return (rv); 4805 } 4806 4807 static boolean_t 4808 pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, 4809 struct rwlock **lockp) 4810 { 4811 pml3_entry_t oldpde; 4812 pt_entry_t *firstpte; 4813 vm_paddr_t mptepa; 4814 vm_page_t mpte; 4815 struct spglist free; 4816 vm_offset_t sva; 4817 4818 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4819 oldpde = *l3e; 4820 KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), 4821 ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx", 4822 oldpde)); 4823 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 4824 NULL) { 4825 KASSERT((oldpde & PG_W) == 0, 4826 ("pmap_demote_l3e: page table page for a wired mapping" 4827 " is missing")); 4828 4829 /* 4830 * Invalidate the 2MB page mapping and return "failure" if the 4831 * mapping was never accessed or the allocation of the new 4832 * page table page fails. If the 2MB page mapping belongs to 4833 * the direct map region of the kernel's address space, then 4834 * the page allocation request specifies the highest possible 4835 * priority (VM_ALLOC_INTERRUPT). 
Otherwise, the priority is 4836 * normal. Page table pages are preallocated for every other 4837 * part of the kernel address space, so the direct map region 4838 * is the only part of the kernel address space that must be 4839 * handled here. 4840 */ 4841 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 4842 pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 4843 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 4844 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 4845 SLIST_INIT(&free); 4846 sva = trunc_2mpage(va); 4847 pmap_remove_l3e(pmap, l3e, sva, &free, lockp); 4848 pmap_invalidate_l3e_page(pmap, sva, oldpde); 4849 vm_page_free_pages_toq(&free, true); 4850 CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx" 4851 " in pmap %p", va, pmap); 4852 return (FALSE); 4853 } 4854 if (va < VM_MAXUSER_ADDRESS) 4855 pmap_resident_count_inc(pmap, 1); 4856 } 4857 mptepa = VM_PAGE_TO_PHYS(mpte); 4858 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 4859 KASSERT((oldpde & PG_A) != 0, 4860 ("pmap_demote_l3e: oldpde is missing PG_A")); 4861 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 4862 ("pmap_demote_l3e: oldpde is missing PG_M")); 4863 4864 /* 4865 * If the page table page is new, initialize it. 4866 */ 4867 if (mpte->ref_count == 1) { 4868 mpte->ref_count = NPTEPG; 4869 pmap_fill_ptp(firstpte, oldpde); 4870 } 4871 4872 KASSERT((*firstpte & PG_FRAME) == (oldpde & PG_FRAME), 4873 ("pmap_demote_l3e: firstpte and newpte map different physical" 4874 " addresses")); 4875 4876 /* 4877 * If the mapping has changed attributes, update the page table 4878 * entries. 4879 */ 4880 if ((*firstpte & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE)) 4881 pmap_fill_ptp(firstpte, oldpde); 4882 4883 /* 4884 * The spare PV entries must be reserved prior to demoting the 4885 * mapping, that is, prior to changing the PDE. Otherwise, the state 4886 * of the PDE and the PV lists will be inconsistent, which can result 4887 * in reclaim_pv_chunk() attempting to remove a PV entry from the 4888 * wrong PV list and pmap_pv_demote_l3e() failing to find the expected 4889 * PV entry for the 2MB page mapping that is being demoted. 4890 */ 4891 if ((oldpde & PG_MANAGED) != 0) 4892 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 4893 4894 /* 4895 * Demote the mapping. This pmap is locked. The old PDE has 4896 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 4897 * set. Thus, there is no danger of a race with another 4898 * processor changing the setting of PG_A and/or PG_M between 4899 * the read above and the store below. 4900 */ 4901 pde_store(l3e, mptepa); 4902 ptesync(); 4903 /* 4904 * Demote the PV entry. 4905 */ 4906 if ((oldpde & PG_MANAGED) != 0) 4907 pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp); 4908 4909 atomic_add_long(&pmap_l3e_demotions, 1); 4910 CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx" 4911 " in pmap %p", va, pmap); 4912 return (TRUE); 4913 } 4914 4915 /* 4916 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 4917 */ 4918 static void 4919 pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va) 4920 { 4921 vm_paddr_t mptepa; 4922 vm_page_t mpte; 4923 4924 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 4925 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4926 mpte = pmap_remove_pt_page(pmap, va); 4927 if (mpte == NULL) 4928 panic("pmap_remove_kernel_pde: Missing pt page."); 4929 4930 mptepa = VM_PAGE_TO_PHYS(mpte); 4931 4932 /* 4933 * Initialize the page table page. 
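 * The page is zeroed so that, once the demotion below takes effect, no
 * 4KB mappings exist within the former 2MB range.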
4934 */ 4935 pagezero(PHYS_TO_DMAP(mptepa)); 4936 4937 /* 4938 * Demote the mapping. 4939 */ 4940 pde_store(l3e, mptepa); 4941 ptesync(); 4942 } 4943 4944 /* 4945 * pmap_remove_l3e: do the things to unmap a superpage in a process 4946 */ 4947 static int 4948 pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, 4949 struct spglist *free, struct rwlock **lockp) 4950 { 4951 struct md_page *pvh; 4952 pml3_entry_t oldpde; 4953 vm_offset_t eva, va; 4954 vm_page_t m, mpte; 4955 4956 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4957 KASSERT((sva & L3_PAGE_MASK) == 0, 4958 ("pmap_remove_l3e: sva is not 2mpage aligned")); 4959 oldpde = pte_load_clear(pdq); 4960 if (oldpde & PG_W) 4961 pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE); 4962 pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); 4963 if (oldpde & PG_MANAGED) { 4964 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 4965 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 4966 pmap_pvh_free(pvh, pmap, sva); 4967 eva = sva + L3_PAGE_SIZE; 4968 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 4969 va < eva; va += PAGE_SIZE, m++) { 4970 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4971 vm_page_dirty(m); 4972 if (oldpde & PG_A) 4973 vm_page_aflag_set(m, PGA_REFERENCED); 4974 if (TAILQ_EMPTY(&m->md.pv_list) && 4975 TAILQ_EMPTY(&pvh->pv_list)) 4976 vm_page_aflag_clear(m, PGA_WRITEABLE); 4977 } 4978 } 4979 if (pmap == kernel_pmap) { 4980 pmap_remove_kernel_l3e(pmap, pdq, sva); 4981 } else { 4982 mpte = pmap_remove_pt_page(pmap, sva); 4983 if (mpte != NULL) { 4984 pmap_resident_count_dec(pmap, 1); 4985 KASSERT(mpte->ref_count == NPTEPG, 4986 ("pmap_remove_l3e: pte page wire count error")); 4987 mpte->ref_count = 0; 4988 pmap_add_delayed_free_list(mpte, free, FALSE); 4989 } 4990 } 4991 return (pmap_unuse_pt(pmap, sva, *pmap_pml2e(pmap, sva), free)); 4992 } 4993 4994 /* 4995 * pmap_remove_pte: do the things to unmap a page in a process 4996 */ 4997 static int 4998 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 4999 pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 5000 { 5001 struct md_page *pvh; 5002 pt_entry_t oldpte; 5003 vm_page_t m; 5004 5005 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5006 oldpte = pte_load_clear(ptq); 5007 if (oldpte & RPTE_WIRED) 5008 pmap->pm_stats.wired_count -= 1; 5009 pmap_resident_count_dec(pmap, 1); 5010 if (oldpte & RPTE_MANAGED) { 5011 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 5012 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5013 vm_page_dirty(m); 5014 if (oldpte & PG_A) 5015 vm_page_aflag_set(m, PGA_REFERENCED); 5016 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5017 pmap_pvh_free(&m->md, pmap, va); 5018 if (TAILQ_EMPTY(&m->md.pv_list) && 5019 (m->flags & PG_FICTITIOUS) == 0) { 5020 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5021 if (TAILQ_EMPTY(&pvh->pv_list)) 5022 vm_page_aflag_clear(m, PGA_WRITEABLE); 5023 } 5024 } 5025 return (pmap_unuse_pt(pmap, va, ptepde, free)); 5026 } 5027 5028 /* 5029 * Remove a single page from a process address space 5030 */ 5031 static bool 5032 pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e, 5033 struct spglist *free) 5034 { 5035 struct rwlock *lock; 5036 pt_entry_t *pte; 5037 bool invalidate_all; 5038 5039 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5040 if ((*l3e & RPTE_VALID) == 0) { 5041 return (false); 5042 } 5043 pte = pmap_l3e_to_pte(l3e, va); 5044 if ((*pte & RPTE_VALID) == 0) { 5045 return (false); 5046 } 5047 lock = NULL; 5048 5049 invalidate_all = pmap_remove_pte(pmap, pte, va, *l3e, free, &lock); 5050 if (lock != NULL) 5051 
rw_wunlock(lock); 5052 if (!invalidate_all) 5053 pmap_invalidate_page(pmap, va); 5054 return (invalidate_all); 5055 } 5056 5057 /* 5058 * Removes the specified range of addresses from the page table page. 5059 */ 5060 static bool 5061 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 5062 pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp) 5063 { 5064 pt_entry_t *pte; 5065 vm_offset_t va; 5066 bool anyvalid; 5067 5068 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5069 anyvalid = false; 5070 va = eva; 5071 for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++, 5072 sva += PAGE_SIZE) { 5073 MPASS(pte == pmap_pte(pmap, sva)); 5074 if (*pte == 0) { 5075 if (va != eva) { 5076 anyvalid = true; 5077 va = eva; 5078 } 5079 continue; 5080 } 5081 if (va == eva) 5082 va = sva; 5083 if (pmap_remove_pte(pmap, pte, sva, *l3e, free, lockp)) { 5084 anyvalid = true; 5085 sva += PAGE_SIZE; 5086 break; 5087 } 5088 } 5089 if (anyvalid) 5090 pmap_invalidate_all(pmap); 5091 else if (va != eva) 5092 pmap_invalidate_range(pmap, va, sva); 5093 return (anyvalid); 5094 } 5095 5096 void 5097 mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5098 { 5099 struct rwlock *lock; 5100 vm_offset_t va_next; 5101 pml1_entry_t *l1e; 5102 pml2_entry_t *l2e; 5103 pml3_entry_t ptpaddr, *l3e; 5104 struct spglist free; 5105 bool anyvalid; 5106 5107 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); 5108 5109 /* 5110 * Perform an unsynchronized read. This is, however, safe. 5111 */ 5112 if (pmap->pm_stats.resident_count == 0) 5113 return; 5114 5115 anyvalid = false; 5116 SLIST_INIT(&free); 5117 5118 /* XXX something fishy here */ 5119 sva = (sva + PAGE_MASK) & ~PAGE_MASK; 5120 eva = (eva + PAGE_MASK) & ~PAGE_MASK; 5121 5122 PMAP_LOCK(pmap); 5123 5124 /* 5125 * special handling of removing one page. a very 5126 * common operation and easy to short circuit some 5127 * code. 5128 */ 5129 if (sva + PAGE_SIZE == eva) { 5130 l3e = pmap_pml3e(pmap, sva); 5131 if (l3e && (*l3e & RPTE_LEAF) == 0) { 5132 anyvalid = pmap_remove_page(pmap, sva, l3e, &free); 5133 goto out; 5134 } 5135 } 5136 5137 lock = NULL; 5138 for (; sva < eva; sva = va_next) { 5139 if (pmap->pm_stats.resident_count == 0) 5140 break; 5141 l1e = pmap_pml1e(pmap, sva); 5142 if (l1e == NULL || (*l1e & PG_V) == 0) { 5143 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 5144 if (va_next < sva) 5145 va_next = eva; 5146 continue; 5147 } 5148 5149 l2e = pmap_l1e_to_l2e(l1e, sva); 5150 if (l2e == NULL || (*l2e & PG_V) == 0) { 5151 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 5152 if (va_next < sva) 5153 va_next = eva; 5154 continue; 5155 } 5156 5157 /* 5158 * Calculate index for next page table. 5159 */ 5160 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 5161 if (va_next < sva) 5162 va_next = eva; 5163 5164 l3e = pmap_l2e_to_l3e(l2e, sva); 5165 ptpaddr = *l3e; 5166 5167 /* 5168 * Weed out invalid mappings. 5169 */ 5170 if (ptpaddr == 0) 5171 continue; 5172 5173 /* 5174 * Check for large page. 5175 */ 5176 if ((ptpaddr & RPTE_LEAF) != 0) { 5177 /* 5178 * Are we removing the entire large page? If not, 5179 * demote the mapping and fall through. 5180 */ 5181 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 5182 pmap_remove_l3e(pmap, l3e, sva, &free, &lock); 5183 continue; 5184 } else if (!pmap_demote_l3e_locked(pmap, l3e, sva, 5185 &lock)) { 5186 /* The large page mapping was destroyed. 
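 * The failed demotion removed the mapping entirely, so there is nothing
 * left to remove in this 2MB range.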
*/ 5187 continue; 5188 } else 5189 ptpaddr = *l3e; 5190 } 5191 5192 /* 5193 * Limit our scan to either the end of the va represented 5194 * by the current page table page, or to the end of the 5195 * range being removed. 5196 */ 5197 if (va_next > eva) 5198 va_next = eva; 5199 5200 if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock)) 5201 anyvalid = true; 5202 } 5203 if (lock != NULL) 5204 rw_wunlock(lock); 5205 out: 5206 if (anyvalid) 5207 pmap_invalidate_all(pmap); 5208 PMAP_UNLOCK(pmap); 5209 vm_page_free_pages_toq(&free, true); 5210 } 5211 5212 void 5213 mmu_radix_remove_all(vm_page_t m) 5214 { 5215 struct md_page *pvh; 5216 pv_entry_t pv; 5217 pmap_t pmap; 5218 struct rwlock *lock; 5219 pt_entry_t *pte, tpte; 5220 pml3_entry_t *l3e; 5221 vm_offset_t va; 5222 struct spglist free; 5223 int pvh_gen, md_gen; 5224 5225 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5226 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5227 ("pmap_remove_all: page %p is not managed", m)); 5228 SLIST_INIT(&free); 5229 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5230 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 5231 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5232 retry: 5233 rw_wlock(lock); 5234 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 5235 pmap = PV_PMAP(pv); 5236 if (!PMAP_TRYLOCK(pmap)) { 5237 pvh_gen = pvh->pv_gen; 5238 rw_wunlock(lock); 5239 PMAP_LOCK(pmap); 5240 rw_wlock(lock); 5241 if (pvh_gen != pvh->pv_gen) { 5242 rw_wunlock(lock); 5243 PMAP_UNLOCK(pmap); 5244 goto retry; 5245 } 5246 } 5247 va = pv->pv_va; 5248 l3e = pmap_pml3e(pmap, va); 5249 (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock); 5250 PMAP_UNLOCK(pmap); 5251 } 5252 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 5253 pmap = PV_PMAP(pv); 5254 if (!PMAP_TRYLOCK(pmap)) { 5255 pvh_gen = pvh->pv_gen; 5256 md_gen = m->md.pv_gen; 5257 rw_wunlock(lock); 5258 PMAP_LOCK(pmap); 5259 rw_wlock(lock); 5260 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5261 rw_wunlock(lock); 5262 PMAP_UNLOCK(pmap); 5263 goto retry; 5264 } 5265 } 5266 pmap_resident_count_dec(pmap, 1); 5267 l3e = pmap_pml3e(pmap, pv->pv_va); 5268 KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_remove_all: found" 5269 " a 2mpage in page %p's pv list", m)); 5270 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 5271 tpte = pte_load_clear(pte); 5272 if (tpte & PG_W) 5273 pmap->pm_stats.wired_count--; 5274 if (tpte & PG_A) 5275 vm_page_aflag_set(m, PGA_REFERENCED); 5276 5277 /* 5278 * Update the vm_page_t clean and reference bits. 5279 */ 5280 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5281 vm_page_dirty(m); 5282 pmap_unuse_pt(pmap, pv->pv_va, *l3e, &free); 5283 pmap_invalidate_page(pmap, pv->pv_va); 5284 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 5285 m->md.pv_gen++; 5286 free_pv_entry(pmap, pv); 5287 PMAP_UNLOCK(pmap); 5288 } 5289 vm_page_aflag_clear(m, PGA_WRITEABLE); 5290 rw_wunlock(lock); 5291 vm_page_free_pages_toq(&free, true); 5292 } 5293 5294 /* 5295 * Destroy all managed, non-wired mappings in the given user-space 5296 * pmap. This pmap cannot be active on any processor besides the 5297 * caller. 5298 * 5299 * This function cannot be applied to the kernel pmap. Moreover, it 5300 * is not intended for general use. It is only to be used during 5301 * process termination. Consequently, it can be implemented in ways 5302 * that make it faster than pmap_remove(). First, it can more quickly 5303 * destroy mappings by iterating over the pmap's collection of PV 5304 * entries, rather than searching the page table. 
Second, it doesn't 5305 * have to test and clear the page table entries atomically, because 5306 * no processor is currently accessing the user address space. In 5307 * particular, a page table entry's dirty bit won't change state once 5308 * this function starts. 5309 * 5310 * Although this function destroys all of the pmap's managed, 5311 * non-wired mappings, it can delay and batch the invalidation of TLB 5312 * entries without calling pmap_delayed_invl_started() and 5313 * pmap_delayed_invl_finished(). Because the pmap is not active on 5314 * any other processor, none of these TLB entries will ever be used 5315 * before their eventual invalidation. Consequently, there is no need 5316 * for either pmap_remove_all() or pmap_remove_write() to wait for 5317 * that eventual TLB invalidation. 5318 */ 5319 5320 void 5321 mmu_radix_remove_pages(pmap_t pmap) 5322 { 5323 5324 CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); 5325 pml3_entry_t ptel3e; 5326 pt_entry_t *pte, tpte; 5327 struct spglist free; 5328 vm_page_t m, mpte, mt; 5329 pv_entry_t pv; 5330 struct md_page *pvh; 5331 struct pv_chunk *pc, *npc; 5332 struct rwlock *lock; 5333 int64_t bit; 5334 uint64_t inuse, bitmask; 5335 int allfree, field, freed, idx; 5336 boolean_t superpage; 5337 vm_paddr_t pa; 5338 5339 /* 5340 * Assert that the given pmap is only active on the current 5341 * CPU. Unfortunately, we cannot block another CPU from 5342 * activating the pmap while this function is executing. 5343 */ 5344 KASSERT(pmap->pm_pid == mfspr(SPR_PID), 5345 ("non-current asid %lu - expected %lu", pmap->pm_pid, 5346 mfspr(SPR_PID))); 5347 5348 lock = NULL; 5349 5350 SLIST_INIT(&free); 5351 PMAP_LOCK(pmap); 5352 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5353 allfree = 1; 5354 freed = 0; 5355 for (field = 0; field < _NPCM; field++) { 5356 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5357 while (inuse != 0) { 5358 bit = cnttzd(inuse); 5359 bitmask = 1UL << bit; 5360 idx = field * 64 + bit; 5361 pv = &pc->pc_pventry[idx]; 5362 inuse &= ~bitmask; 5363 5364 pte = pmap_pml2e(pmap, pv->pv_va); 5365 ptel3e = *pte; 5366 pte = pmap_l2e_to_l3e(pte, pv->pv_va); 5367 tpte = *pte; 5368 if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) { 5369 superpage = FALSE; 5370 ptel3e = tpte; 5371 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 5372 PG_FRAME); 5373 pte = &pte[pmap_pte_index(pv->pv_va)]; 5374 tpte = *pte; 5375 } else { 5376 /* 5377 * Keep track whether 'tpte' is a 5378 * superpage explicitly instead of 5379 * relying on RPTE_LEAF being set. 5380 * 5381 * This is because RPTE_LEAF is numerically 5382 * identical to PG_PTE_PAT and thus a 5383 * regular page could be mistaken for 5384 * a superpage. 5385 */ 5386 superpage = TRUE; 5387 } 5388 5389 if ((tpte & PG_V) == 0) { 5390 panic("bad pte va %lx pte %lx", 5391 pv->pv_va, tpte); 5392 } 5393 5394 /* 5395 * We cannot remove wired pages from a process' mapping at this time 5396 */ 5397 if (tpte & PG_W) { 5398 allfree = 0; 5399 continue; 5400 } 5401 5402 if (superpage) 5403 pa = tpte & PG_PS_FRAME; 5404 else 5405 pa = tpte & PG_FRAME; 5406 5407 m = PHYS_TO_VM_PAGE(pa); 5408 KASSERT(m->phys_addr == pa, 5409 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5410 m, (uintmax_t)m->phys_addr, 5411 (uintmax_t)tpte)); 5412 5413 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5414 m < &vm_page_array[vm_page_array_size], 5415 ("pmap_remove_pages: bad tpte %#jx", 5416 (uintmax_t)tpte)); 5417 5418 pte_clear(pte); 5419 5420 /* 5421 * Update the vm_page_t clean/reference bits. 
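 * For a superpage the single PG_M bit covers the entire 2MB mapping, so
 * every constituent 4KB page is dirtied.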
5422 */ 5423 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5424 if (superpage) { 5425 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 5426 vm_page_dirty(mt); 5427 } else 5428 vm_page_dirty(m); 5429 } 5430 5431 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5432 5433 /* Mark free */ 5434 pc->pc_map[field] |= bitmask; 5435 if (superpage) { 5436 pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); 5437 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 5438 TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); 5439 pvh->pv_gen++; 5440 if (TAILQ_EMPTY(&pvh->pv_list)) { 5441 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) 5442 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 5443 TAILQ_EMPTY(&mt->md.pv_list)) 5444 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5445 } 5446 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 5447 if (mpte != NULL) { 5448 pmap_resident_count_dec(pmap, 1); 5449 KASSERT(mpte->ref_count == NPTEPG, 5450 ("pmap_remove_pages: pte page wire count error")); 5451 mpte->ref_count = 0; 5452 pmap_add_delayed_free_list(mpte, &free, FALSE); 5453 } 5454 } else { 5455 pmap_resident_count_dec(pmap, 1); 5456 #ifdef VERBOSE_PV 5457 printf("freeing pv (%p, %p)\n", 5458 pmap, pv); 5459 #endif 5460 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); 5461 m->md.pv_gen++; 5462 if ((m->a.flags & PGA_WRITEABLE) != 0 && 5463 TAILQ_EMPTY(&m->md.pv_list) && 5464 (m->flags & PG_FICTITIOUS) == 0) { 5465 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5466 if (TAILQ_EMPTY(&pvh->pv_list)) 5467 vm_page_aflag_clear(m, PGA_WRITEABLE); 5468 } 5469 } 5470 pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free); 5471 freed++; 5472 } 5473 } 5474 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5475 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5476 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5477 if (allfree) { 5478 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5479 free_pv_chunk(pc); 5480 } 5481 } 5482 if (lock != NULL) 5483 rw_wunlock(lock); 5484 pmap_invalidate_all(pmap); 5485 PMAP_UNLOCK(pmap); 5486 vm_page_free_pages_toq(&free, true); 5487 } 5488 5489 void 5490 mmu_radix_remove_write(vm_page_t m) 5491 { 5492 struct md_page *pvh; 5493 pmap_t pmap; 5494 struct rwlock *lock; 5495 pv_entry_t next_pv, pv; 5496 pml3_entry_t *l3e; 5497 pt_entry_t oldpte, *pte; 5498 int pvh_gen, md_gen; 5499 5500 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5501 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5502 ("pmap_remove_write: page %p is not managed", m)); 5503 vm_page_assert_busied(m); 5504 5505 if (!pmap_page_is_write_mapped(m)) 5506 return; 5507 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5508 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 5509 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5510 retry_pv_loop: 5511 rw_wlock(lock); 5512 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { 5513 pmap = PV_PMAP(pv); 5514 if (!PMAP_TRYLOCK(pmap)) { 5515 pvh_gen = pvh->pv_gen; 5516 rw_wunlock(lock); 5517 PMAP_LOCK(pmap); 5518 rw_wlock(lock); 5519 if (pvh_gen != pvh->pv_gen) { 5520 PMAP_UNLOCK(pmap); 5521 rw_wunlock(lock); 5522 goto retry_pv_loop; 5523 } 5524 } 5525 l3e = pmap_pml3e(pmap, pv->pv_va); 5526 if ((*l3e & PG_RW) != 0) 5527 (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock); 5528 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5529 ("inconsistent pv lock %p %p for page %p", 5530 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5531 PMAP_UNLOCK(pmap); 5532 } 5533 TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { 5534 pmap = PV_PMAP(pv); 5535 if (!PMAP_TRYLOCK(pmap)) { 5536 pvh_gen = pvh->pv_gen; 5537 md_gen = m->md.pv_gen; 5538 rw_wunlock(lock); 5539 PMAP_LOCK(pmap); 5540 rw_wlock(lock); 5541 if (pvh_gen != pvh->pv_gen || 5542 md_gen != m->md.pv_gen) { 5543 PMAP_UNLOCK(pmap); 5544 rw_wunlock(lock); 5545 goto retry_pv_loop; 5546 } 5547 } 5548 l3e = pmap_pml3e(pmap, pv->pv_va); 5549 KASSERT((*l3e & RPTE_LEAF) == 0, 5550 ("pmap_remove_write: found a 2mpage in page %p's pv list", 5551 m)); 5552 pte = pmap_l3e_to_pte(l3e, pv->pv_va); 5553 retry: 5554 oldpte = *pte; 5555 if (oldpte & PG_RW) { 5556 if (!atomic_cmpset_long(pte, oldpte, 5557 (oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))) 5558 goto retry; 5559 if ((oldpte & PG_M) != 0) 5560 vm_page_dirty(m); 5561 pmap_invalidate_page(pmap, pv->pv_va); 5562 } 5563 PMAP_UNLOCK(pmap); 5564 } 5565 rw_wunlock(lock); 5566 vm_page_aflag_clear(m, PGA_WRITEABLE); 5567 } 5568 5569 /* 5570 * Clear the wired attribute from the mappings for the specified range of 5571 * addresses in the given pmap. Every valid mapping within that range 5572 * must have the wired attribute set. In contrast, invalid mappings 5573 * cannot have the wired attribute set, so they are ignored. 5574 * 5575 * The wired attribute of the page table entry is not a hardware 5576 * feature, so there is no need to invalidate any TLB entries. 5577 * Since pmap_demote_l3e() for the wired entry must never fail, 5578 * pmap_delayed_invl_started()/finished() calls around the 5579 * function are not needed. 5580 */ 5581 void 5582 mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5583 { 5584 vm_offset_t va_next; 5585 pml1_entry_t *l1e; 5586 pml2_entry_t *l2e; 5587 pml3_entry_t *l3e; 5588 pt_entry_t *pte; 5589 5590 CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); 5591 PMAP_LOCK(pmap); 5592 for (; sva < eva; sva = va_next) { 5593 l1e = pmap_pml1e(pmap, sva); 5594 if ((*l1e & PG_V) == 0) { 5595 va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; 5596 if (va_next < sva) 5597 va_next = eva; 5598 continue; 5599 } 5600 l2e = pmap_l1e_to_l2e(l1e, sva); 5601 if ((*l2e & PG_V) == 0) { 5602 va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; 5603 if (va_next < sva) 5604 va_next = eva; 5605 continue; 5606 } 5607 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; 5608 if (va_next < sva) 5609 va_next = eva; 5610 l3e = pmap_l2e_to_l3e(l2e, sva); 5611 if ((*l3e & PG_V) == 0) 5612 continue; 5613 if ((*l3e & RPTE_LEAF) != 0) { 5614 if ((*l3e & PG_W) == 0) 5615 panic("pmap_unwire: pde %#jx is missing PG_W", 5616 (uintmax_t)*l3e); 5617 5618 /* 5619 * Are we unwiring the entire large page? If not, 5620 * demote the mapping and fall through. 
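 * A 2MB mapping cannot be partially wired, so it must be demoted to 4KB
 * mappings before PG_W can be cleared on the individual PTEs.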
5621 */ 5622 if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { 5623 atomic_clear_long(l3e, PG_W); 5624 pmap->pm_stats.wired_count -= L3_PAGE_SIZE / 5625 PAGE_SIZE; 5626 continue; 5627 } else if (!pmap_demote_l3e(pmap, l3e, sva)) 5628 panic("pmap_unwire: demotion failed"); 5629 } 5630 if (va_next > eva) 5631 va_next = eva; 5632 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, 5633 sva += PAGE_SIZE) { 5634 MPASS(pte == pmap_pte(pmap, sva)); 5635 if ((*pte & PG_V) == 0) 5636 continue; 5637 if ((*pte & PG_W) == 0) 5638 panic("pmap_unwire: pte %#jx is missing PG_W", 5639 (uintmax_t)*pte); 5640 5641 /* 5642 * PG_W must be cleared atomically. Although the pmap 5643 * lock synchronizes access to PG_W, another processor 5644 * could be setting PG_M and/or PG_A concurrently. 5645 */ 5646 atomic_clear_long(pte, PG_W); 5647 pmap->pm_stats.wired_count--; 5648 } 5649 } 5650 PMAP_UNLOCK(pmap); 5651 } 5652 5653 void 5654 mmu_radix_zero_page(vm_page_t m) 5655 { 5656 vm_offset_t addr; 5657 5658 CTR2(KTR_PMAP, "%s(%p)", __func__, m); 5659 addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5660 pagezero(addr); 5661 } 5662 5663 void 5664 mmu_radix_zero_page_area(vm_page_t m, int off, int size) 5665 { 5666 caddr_t addr; 5667 5668 CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); 5669 MPASS(off + size <= PAGE_SIZE); 5670 addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 5671 memset(addr + off, 0, size); 5672 } 5673 5674 static int 5675 mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5676 { 5677 pml3_entry_t *l3ep; 5678 pt_entry_t pte; 5679 vm_paddr_t pa; 5680 int val; 5681 5682 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); 5683 PMAP_LOCK(pmap); 5684 5685 l3ep = pmap_pml3e(pmap, addr); 5686 if (l3ep != NULL && (*l3ep & PG_V)) { 5687 if (*l3ep & RPTE_LEAF) { 5688 pte = *l3ep; 5689 /* Compute the physical address of the 4KB page. */ 5690 pa = ((*l3ep & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) & 5691 PG_FRAME; 5692 val = MINCORE_PSIND(1); 5693 } else { 5694 pte = *pmap_l3e_to_pte(l3ep, addr); 5695 pa = pte & PG_FRAME; 5696 val = 0; 5697 } 5698 } else { 5699 pte = 0; 5700 pa = 0; 5701 val = 0; 5702 } 5703 if ((pte & PG_V) != 0) { 5704 val |= MINCORE_INCORE; 5705 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5706 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5707 if ((pte & PG_A) != 0) 5708 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5709 } 5710 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5711 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5712 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5713 *locked_pa = pa; 5714 } 5715 PMAP_UNLOCK(pmap); 5716 return (val); 5717 } 5718 5719 void 5720 mmu_radix_activate(struct thread *td) 5721 { 5722 pmap_t pmap; 5723 uint32_t curpid; 5724 5725 CTR2(KTR_PMAP, "%s(%p)", __func__, td); 5726 critical_enter(); 5727 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5728 curpid = mfspr(SPR_PID); 5729 if (pmap->pm_pid > isa3_base_pid && 5730 curpid != pmap->pm_pid) { 5731 mmu_radix_pid_set(pmap); 5732 } 5733 critical_exit(); 5734 } 5735 5736 /* 5737 * Increase the starting virtual address of the given mapping if a 5738 * different alignment might result in more superpage mappings. 
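 * The chosen address gets the same offset within a 2MB-aligned region as
 * the backing object, which makes later superpage promotion possible.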
5739 */ 5740 void 5741 mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset, 5742 vm_offset_t *addr, vm_size_t size) 5743 { 5744 5745 CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr, 5746 size); 5747 vm_offset_t superpage_offset; 5748 5749 if (size < L3_PAGE_SIZE) 5750 return; 5751 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5752 offset += ptoa(object->pg_color); 5753 superpage_offset = offset & L3_PAGE_MASK; 5754 if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE || 5755 (*addr & L3_PAGE_MASK) == superpage_offset) 5756 return; 5757 if ((*addr & L3_PAGE_MASK) < superpage_offset) 5758 *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset; 5759 else 5760 *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset; 5761 } 5762 5763 static void * 5764 mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr) 5765 { 5766 vm_offset_t va, tmpva, ppa, offset; 5767 5768 ppa = trunc_page(pa); 5769 offset = pa & PAGE_MASK; 5770 size = roundup2(offset + size, PAGE_SIZE); 5771 if (pa < powerpc_ptob(Maxmem)) 5772 panic("bad pa: %#lx less than Maxmem %#lx\n", 5773 pa, powerpc_ptob(Maxmem)); 5774 va = kva_alloc(size); 5775 if (bootverbose) 5776 printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr); 5777 KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr)); 5778 5779 if (!va) 5780 panic("%s: Couldn't alloc kernel virtual memory", __func__); 5781 5782 for (tmpva = va; size > 0;) { 5783 mmu_radix_kenter_attr(tmpva, ppa, attr); 5784 size -= PAGE_SIZE; 5785 tmpva += PAGE_SIZE; 5786 ppa += PAGE_SIZE; 5787 } 5788 ptesync(); 5789 5790 return ((void *)(va + offset)); 5791 } 5792 5793 static void * 5794 mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size) 5795 { 5796 5797 CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); 5798 5799 return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT)); 5800 } 5801 5802 void 5803 mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5804 { 5805 5806 CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); 5807 m->md.mdpg_cache_attrs = ma; 5808 5809 /* 5810 * If "m" is a normal page, update its direct mapping. This update 5811 * can be relied upon to perform any cache operations that are 5812 * required for data coherence. 5813 */ 5814 if ((m->flags & PG_FICTITIOUS) == 0 && 5815 mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 5816 PAGE_SIZE, m->md.mdpg_cache_attrs)) 5817 panic("memory attribute change on the direct map failed"); 5818 } 5819 5820 static void 5821 mmu_radix_unmapdev(vm_offset_t va, vm_size_t size) 5822 { 5823 vm_offset_t offset; 5824 5825 CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size); 5826 /* If we gave a direct map region in pmap_mapdev, do nothing */ 5827 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 5828 return; 5829 5830 offset = va & PAGE_MASK; 5831 size = round_page(offset + size); 5832 va = trunc_page(va); 5833 5834 if (pmap_initialized) { 5835 mmu_radix_qremove(va, atop(size)); 5836 kva_free(va, size); 5837 } 5838 } 5839 5840 static __inline void 5841 pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask) 5842 { 5843 uint64_t opte, npte; 5844 5845 /* 5846 * The cache mode bits are all in the low 32-bits of the 5847 * PTE, so we can just spin on updating the low 32-bits. 5848 */ 5849 do { 5850 opte = *pte; 5851 npte = opte & ~mask; 5852 npte |= cache_bits; 5853 } while (npte != opte && !atomic_cmpset_long(pte, opte, npte)); 5854 } 5855 5856 /* 5857 * Tries to demote a 1GB page mapping. 
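 * The replacement page directory page is filled with 512 2MB leaf entries
 * that together cover the original 1GB range.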
/*
 * Tries to demote a 1GB page mapping.
 */
static boolean_t
pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va)
{
	pml2_entry_t oldpdpe;
	pml3_entry_t *firstpde, newpde, *pde;
	vm_paddr_t pdpgpa;
	vm_page_t pdpg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpdpe = *l2e;
	KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
	    ("pmap_demote_l2e: oldpdpe is missing RPTE_LEAF and/or PG_V"));
	pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT,
	    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
	if (pdpg == NULL) {
		CTR2(KTR_PMAP, "pmap_demote_l2e: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
	firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa);
	KASSERT((oldpdpe & PG_A) != 0,
	    ("pmap_demote_l2e: oldpdpe is missing PG_A"));
	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_l2e: oldpdpe is missing PG_M"));
	newpde = oldpdpe;

	/*
	 * Initialize the page directory page.
	 */
	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
		*pde = newpde;
		newpde += L3_PAGE_SIZE;
	}

	/*
	 * Demote the mapping.
	 */
	pde_store(l2e, pdpgpa);

	/*
	 * Flush the page walk cache --- XXX revisit whether a full
	 * invalidation is required here.
	 */
	pmap_invalidate_all(pmap);

	pmap_l2e_demotions++;
	CTR2(KTR_PMAP, "pmap_demote_l2e: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

vm_paddr_t
mmu_radix_kextract(vm_offset_t va)
{
	pml3_entry_t l3e;
	vm_paddr_t pa;

	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		l3e = *pmap_pml3e(kernel_pmap, va);
		if (l3e & RPTE_LEAF) {
			pa = (l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  The PTE must not be re-looked
			 * up through the new PDE.  It is, however, safe to
			 * use the old PDE because the page table page is
			 * preserved by the promotion.
			 */
			pa = *pmap_l3e_to_pte(&l3e, va);
			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
		}
	}
	return (pa);
}
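/*
 * Illustrative sketch, guarded out and not part of the pmap: after
 * pmap_demote_l2e() fills the new page directory page, entry i inherits the
 * attributes of the old 1GB leaf and maps its i-th 2MB slice, so the 512
 * entries tile the original mapping exactly.  check_demotion_example() is a
 * hypothetical helper written only to state that invariant.
 */
#if 0
static void
check_demotion_example(pml2_entry_t oldpdpe, pml3_entry_t *firstpde)
{
	int i;

	for (i = 0; i < NPDEPG; i++)
		MPASS(firstpde[i] == oldpdpe + (uint64_t)i * L3_PAGE_SIZE);
}
#endif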
static pt_entry_t
mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
{

	if (ma != VM_MEMATTR_DEFAULT)
		return (pmap_cache_bits(ma));

	/*
	 * Assume the page is cache inhibited and access is guarded unless
	 * it's in our available memory array.
	 */
	for (int i = 0; i < pregions_sz; i++) {
		if ((pa >= pregions[i].mr_start) &&
		    (pa < (pregions[i].mr_start + pregions[i].mr_size)))
			return (RPTE_ATTR_MEM);
	}
	return (RPTE_ATTR_GUARDEDIO);
}

static void
mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
{
	pt_entry_t *pte, pteval;
	uint64_t cache_bits;

	pte = kvtopte(va);
	MPASS(pte != NULL);
	pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
	cache_bits = mmu_radix_calc_wimg(pa, ma);
	pte_store(pte, pteval | cache_bits);
}

void
mmu_radix_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);

	pte = kvtopte(va);
	pte_clear(pte);
}

int
mmu_radix_decode_kernel_ptr(vm_offset_t addr,
    int *is_user, vm_offset_t *decoded)
{

	CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr);
	*decoded = addr;
	*is_user = (addr < VM_MAXUSER_ADDRESS);
	return (0);
}

static boolean_t
mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
{

	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
	return (mem_valid(pa, size));
}

static void
mmu_radix_scan_init(void)
{

	CTR1(KTR_PMAP, "%s()", __func__);
	UNIMPLEMENTED();
}

static void
mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz,
    void **va)
{

	CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va);
	UNIMPLEMENTED();
}

vm_offset_t
mmu_radix_quick_enter_page(vm_page_t m)
{
	vm_paddr_t paddr;

	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
	paddr = VM_PAGE_TO_PHYS(m);
	return (PHYS_TO_DMAP(paddr));
}

void
mmu_radix_quick_remove_page(vm_offset_t addr __unused)
{
	/* No work to do here: the page was entered via the direct map. */
	CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
}

static void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{

	cpu_flush_dcache((void *)sva, eva - sva);
}

int
mmu_radix_change_attr(vm_offset_t va, vm_size_t size,
    vm_memattr_t mode)
{
	int error;

	CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode);
	PMAP_LOCK(kernel_pmap);
	error = pmap_change_attr_locked(va, size, mode, true);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}
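/*
 * Illustrative usage sketch, not compiled: a typical consumer of the
 * attribute-change path above is a driver that needs an already-mapped
 * kernel buffer to become uncacheable.  The helper name and the use of
 * VM_MEMATTR_UNCACHEABLE are assumptions for the example; the MI entry
 * point pmap_change_attr() ends up in mmu_radix_change_attr().
 */
#if 0
static int
make_buffer_uncacheable_example(void *buf, size_t len)
{

	return (pmap_change_attr((vm_offset_t)buf, len,
	    VM_MEMATTR_UNCACHEABLE));
}
#endif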
static int
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush)
{
	vm_offset_t base, offset, tmpva;
	vm_paddr_t pa_start, pa_end, pa_end1;
	pml2_entry_t *l2e;
	pml3_entry_t *l3e;
	pt_entry_t *pte;
	int cache_bits, error;
	boolean_t changed;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	/*
	 * Only supported on kernel virtual addresses, including the direct
	 * map.
	 */
	if (base < DMAP_MIN_ADDRESS)
		return (EINVAL);

	cache_bits = pmap_cache_bits(mode);
	changed = FALSE;

	/*
	 * Pages that aren't mapped aren't supported.  Also break down 1GB
	 * and 2MB pages into 4KB pages if required.
	 */
	for (tmpva = base; tmpva < base + size; ) {
		l2e = pmap_pml2e(kernel_pmap, tmpva);
		if (l2e == NULL || *l2e == 0)
			return (EINVAL);
		if (*l2e & RPTE_LEAF) {
			/*
			 * If the current 1GB page already has the required
			 * memory type, then we need not demote this page.
			 * Just increment tmpva to the next 1GB page frame.
			 */
			if ((*l2e & RPTE_ATTR_MASK) == cache_bits) {
				tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
				continue;
			}

			/*
			 * If the current offset aligns with a 1GB page frame
			 * and there is at least 1GB left within the range,
			 * then we need not break down this page into 2MB
			 * pages.
			 */
			if ((tmpva & L2_PAGE_MASK) == 0 &&
			    tmpva + L2_PAGE_MASK < base + size) {
				tmpva += L2_PAGE_SIZE;
				continue;
			}
			if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
				return (ENOMEM);
		}
		l3e = pmap_l2e_to_l3e(l2e, tmpva);
		KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
		    tmpva, l2e));
		if (*l3e == 0)
			return (EINVAL);
		if (*l3e & RPTE_LEAF) {
			/*
			 * If the current 2MB page already has the required
			 * memory type, then we need not demote this page.
			 * Just increment tmpva to the next 2MB page frame.
			 */
			if ((*l3e & RPTE_ATTR_MASK) == cache_bits) {
				tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
				continue;
			}

			/*
			 * If the current offset aligns with a 2MB page frame
			 * and there is at least 2MB left within the range,
			 * then we need not break down this page into 4KB
			 * pages.
			 */
			if ((tmpva & L3_PAGE_MASK) == 0 &&
			    tmpva + L3_PAGE_MASK < base + size) {
				tmpva += L3_PAGE_SIZE;
				continue;
			}
			if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
				return (ENOMEM);
		}
		pte = pmap_l3e_to_pte(l3e, tmpva);
		if (*pte == 0)
			return (EINVAL);
		tmpva += PAGE_SIZE;
	}
	error = 0;

	/*
	 * Ok, all the pages exist, so run through them updating their
	 * cache mode if required.
	 */
	pa_start = pa_end = 0;
	for (tmpva = base; tmpva < base + size; ) {
		l2e = pmap_pml2e(kernel_pmap, tmpva);
		if (*l2e & RPTE_LEAF) {
			if ((*l2e & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(l2e, cache_bits,
				    RPTE_ATTR_MASK);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (*l2e & PG_PS_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = *l2e & PG_PS_FRAME;
					pa_end = pa_start + L2_PAGE_SIZE;
				} else if (pa_end == (*l2e & PG_PS_FRAME))
					pa_end += L2_PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = *l2e & PG_PS_FRAME;
					pa_end = pa_start + L2_PAGE_SIZE;
				}
			}
			tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
			continue;
		}
		l3e = pmap_l2e_to_l3e(l2e, tmpva);
		if (*l3e & RPTE_LEAF) {
			if ((*l3e & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(l3e, cache_bits,
				    RPTE_ATTR_MASK);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (*l3e & PG_PS_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = *l3e & PG_PS_FRAME;
					pa_end = pa_start + L3_PAGE_SIZE;
				} else if (pa_end == (*l3e & PG_PS_FRAME))
					pa_end += L3_PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = *l3e & PG_PS_FRAME;
					pa_end = pa_start + L3_PAGE_SIZE;
				}
			}
			tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
		} else {
			pte = pmap_l3e_to_pte(l3e, tmpva);
			if ((*pte & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(pte, cache_bits,
				    RPTE_ATTR_MASK);
				changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (*pte & PG_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = *pte & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				} else if (pa_end == (*pte & PG_FRAME))
					pa_end += PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = *pte & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				}
			}
			tmpva += PAGE_SIZE;
		}
	}
	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
		pa_end1 = MIN(pa_end, dmaplimit);
		if (pa_start != pa_end1)
			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
			    pa_end1 - pa_start, mode, flush);
	}

	/*
	 * Flush CPU caches if required, so that no stale data is left
	 * behind for the new memory type.
	 */
	if (changed) {
		pmap_invalidate_all(kernel_pmap);

		if (flush)
			pmap_invalidate_cache_range(base, tmpva);
	}
	return (error);
}
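/*
 * Illustrative sketch, not compiled: the second pass above batches its
 * recursive direct-map updates by accumulating runs of physically contiguous
 * pages.  The same pattern is shown in isolation below;
 * coalesce_runs_example() and the flush_run callback are made-up names used
 * only for this example.
 */
#if 0
static void
coalesce_runs_example(const vm_paddr_t *pa, int npages,
    void (*flush_run)(vm_paddr_t, vm_size_t))
{
	vm_paddr_t pa_start, pa_end;
	int i;

	pa_start = pa_end = 0;
	for (i = 0; i < npages; i++) {
		if (pa_start == pa_end) {
			/* Start a new run. */
			pa_start = pa[i];
			pa_end = pa_start + PAGE_SIZE;
		} else if (pa[i] == pa_end) {
			/* Extend the current run. */
			pa_end += PAGE_SIZE;
		} else {
			/* Run ended; flush it and start over. */
			flush_run(pa_start, pa_end - pa_start);
			pa_start = pa[i];
			pa_end = pa_start + PAGE_SIZE;
		}
	}
	if (pa_start != pa_end)
		flush_run(pa_start, pa_end - pa_start);
}
#endif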
/*
 * Allocate physical memory for the vm_page array and map it into KVA,
 * attempting to back the vm_pages with domain-local memory.
 */
void
mmu_radix_page_array_startup(long pages)
{
#ifdef notyet
	pml2_entry_t *l2e;
	pml3_entry_t *pde;
	pml3_entry_t newl3;
	vm_offset_t va;
	long pfn;
	int domain, i;
#endif
	vm_paddr_t pa;
	vm_offset_t start, end;

	vm_page_array_size = pages;

	start = VM_MIN_KERNEL_ADDRESS;
	end = start + pages * sizeof(struct vm_page);

	pa = vm_phys_early_alloc(0, end - start);

	start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
#ifdef notyet
	/*
	 * TODO: NUMA vm_page_array.  Blocked out until then (copied from
	 * amd64).
	 */
	for (va = start; va < end; va += L3_PAGE_SIZE) {
		pfn = first_page + (va - start) / sizeof(struct vm_page);
		domain = _vm_phys_domain(ptoa(pfn));
		l2e = pmap_pml2e(kernel_pmap, va);
		if ((*l2e & PG_V) == 0) {
			pa = vm_phys_early_alloc(domain, PAGE_SIZE);
			dump_add_page(pa);
			pagezero(PHYS_TO_DMAP(pa));
			pde_store(l2e, (pml2_entry_t)pa);
		}
		pde = pmap_l2e_to_l3e(l2e, va);
		if ((*pde & PG_V) != 0)
			panic("Unexpected pde %p", pde);
		pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
		for (i = 0; i < NPDEPG; i++)
			dump_add_page(pa + i * PAGE_SIZE);
		newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W);
		pte_store(pde, newl3);
	}
#endif
	vm_page_array = (vm_page_t)start;
}

#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>

static void
pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
{
	pml1_entry_t *l1e;
	pml2_entry_t *l2e;
	pml3_entry_t *l3e;
	pt_entry_t *pte;

	l1e = &l1[pmap_pml1e_index(va)];
	db_printf("VA %#016lx l1e %#016lx", va, *l1e);
	if ((*l1e & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	l2e = pmap_l1e_to_l2e(l1e, va);
	db_printf(" l2e %#016lx", *l2e);
	if ((*l2e & PG_V) == 0 || (*l2e & RPTE_LEAF) != 0) {
		db_printf("\n");
		return;
	}
	l3e = pmap_l2e_to_l3e(l2e, va);
	db_printf(" l3e %#016lx", *l3e);
	if ((*l3e & PG_V) == 0 || (*l3e & RPTE_LEAF) != 0) {
		db_printf("\n");
		return;
	}
	pte = pmap_l3e_to_pte(l3e, va);
	db_printf(" pte %#016lx\n", *pte);
}

void
pmap_page_print_mappings(vm_page_t m)
{
	pmap_t pmap;
	pv_entry_t pv;

	db_printf("page %p(%lx)\n", m, m->phys_addr);
	/* Need to elide locks if running in ddb. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
		db_printf("pv: %p ", pv);
		db_printf("va: %#016lx ", pv->pv_va);
		pmap = PV_PMAP(pv);
		db_printf("pmap %p ", pmap);
		if (pmap != NULL) {
			db_printf("asid: %lu\n", pmap->pm_pid);
			pmap_pte_walk(pmap->pm_pml1, pv->pv_va);
		}
	}
}

DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	vm_offset_t va;
	pmap_t pmap;

	if (!have_addr) {
		db_printf("show pte addr\n");
		return;
	}
	va = (vm_offset_t)addr;

	if (va >= DMAP_MIN_ADDRESS)
		pmap = kernel_pmap;
	else if (kdb_thread != NULL)
		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
	else
		pmap = vmspace_pmap(curthread->td_proc->p_vmspace);

	pmap_pte_walk(pmap->pm_pml1, va);
}

#endif
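/*
 * Illustrative usage note: from ddb(4) the walker above is reached with
 * "show pte <address>" (the address value below is made up), e.g.
 *
 *	db> show pte 0xc000000001000000
 *	VA 0xc000000001000000 l1e ... l2e ... l3e ... pte ...
 *
 * and printing stops early at the first invalid or leaf entry.
 */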