1 /* 2 * Copyright (C) 2010 Andreas Tobler 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 18 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 21 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 22 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 23 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include <sys/cdefs.h> 27 __FBSDID("$FreeBSD$"); 28 29 #include <sys/param.h> 30 #include <sys/kernel.h> 31 #include <sys/ktr.h> 32 #include <sys/lock.h> 33 #include <sys/msgbuf.h> 34 #include <sys/mutex.h> 35 #include <sys/proc.h> 36 #include <sys/sysctl.h> 37 #include <sys/systm.h> 38 #include <sys/vmmeter.h> 39 40 #include <dev/ofw/openfirm.h> 41 #include <machine/ofw_machdep.h> 42 43 #include <vm/vm.h> 44 #include <vm/vm_param.h> 45 #include <vm/vm_kern.h> 46 #include <vm/vm_page.h> 47 #include <vm/vm_map.h> 48 #include <vm/vm_object.h> 49 #include <vm/vm_extern.h> 50 #include <vm/vm_pageout.h> 51 #include <vm/uma.h> 52 53 #include <powerpc/aim/mmu_oea64.h> 54 55 #include "mmu_if.h" 56 #include "moea64_if.h" 57 58 #include "phyp-hvcall.h" 59 60 extern int n_slbs; 61 62 /* 63 * Kernel MMU interface 64 */ 65 66 static void mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart, 67 vm_offset_t kernelend); 68 static void mphyp_cpu_bootstrap(mmu_t mmup, int ap); 69 static void mphyp_pte_synch(mmu_t, uintptr_t pt, struct lpte *pvo_pt); 70 static void mphyp_pte_clear(mmu_t, uintptr_t pt, struct lpte *pvo_pt, 71 uint64_t vpn, u_int64_t ptebit); 72 static void mphyp_pte_unset(mmu_t, uintptr_t pt, struct lpte *pvo_pt, 73 uint64_t vpn); 74 static void mphyp_pte_change(mmu_t, uintptr_t pt, struct lpte *pvo_pt, 75 uint64_t vpn); 76 static int mphyp_pte_insert(mmu_t, u_int ptegidx, struct lpte *pvo_pt); 77 static uintptr_t mphyp_pvo_to_pte(mmu_t, const struct pvo_entry *pvo); 78 79 #define VSID_HASH_MASK 0x0000007fffffffffULL 80 81 82 static mmu_method_t mphyp_methods[] = { 83 MMUMETHOD(mmu_bootstrap, mphyp_bootstrap), 84 MMUMETHOD(mmu_cpu_bootstrap, mphyp_cpu_bootstrap), 85 86 MMUMETHOD(moea64_pte_synch, mphyp_pte_synch), 87 MMUMETHOD(moea64_pte_clear, mphyp_pte_clear), 88 MMUMETHOD(moea64_pte_unset, mphyp_pte_unset), 89 MMUMETHOD(moea64_pte_change, mphyp_pte_change), 90 MMUMETHOD(moea64_pte_insert, mphyp_pte_insert), 91 MMUMETHOD(moea64_pvo_to_pte, mphyp_pvo_to_pte), 92 93 { 0, 0 } 94 }; 95 96 MMU_DEF_INHERIT(pseries_mmu, "mmu_phyp", mphyp_methods, 0, oea64_mmu); 97 98 static void 99 mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) 100 { 101 uint64_t final_pteg_count = 0; 102 char buf[8]; 103 uint32_t prop[2]; 104 uint32_t nptlp, shift = 0, slb_encoding = 0; 105 uint32_t lp_size, lp_encoding; 106 phandle_t dev, node, root; 107 int idx, len, res; 108 109 moea64_early_bootstrap(mmup, kernelstart, kernelend); 110 111 root = OF_peer(0); 112 113 dev = OF_child(root); 114 while (dev != 0) { 115 res = OF_getprop(dev, "name", buf, sizeof(buf)); 116 if (res > 0 && strcmp(buf, "cpus") == 0) 117 break; 118 dev = OF_peer(dev); 119 } 120 121 node = OF_child(dev); 122 123 while (node != 0) { 124 res = OF_getprop(node, "device_type", buf, sizeof(buf)); 125 if (res > 0 && strcmp(buf, "cpu") == 0) 126 break; 127 node = OF_peer(node); 128 } 129 130 res = OF_getprop(node, "ibm,pft-size", prop, sizeof(prop)); 131 if (res <= 0) 132 panic("mmu_phyp: unknown PFT size"); 133 final_pteg_count = 1 << prop[1]; 134 res = OF_getprop(node, "ibm,slb-size", prop, sizeof(prop[0])); 135 if (res > 0) 136 n_slbs = prop[0]; 137 138 moea64_pteg_count = final_pteg_count / sizeof(struct lpteg); 139 140 /* 141 * Scan the large page size property for PAPR compatible machines. 142 * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties' 143 * for the encoding of the property. 144 */ 145 146 len = OF_getproplen(node, "ibm,segment-page-sizes"); 147 if (len > 0) { 148 /* 149 * We have to use a variable length array on the stack 150 * since we have very limited stack space. 151 */ 152 pcell_t arr[len/sizeof(cell_t)]; 153 res = OF_getencprop(node, "ibm,segment-page-sizes", arr, 154 sizeof(arr)); 155 len /= 4; 156 idx = 0; 157 while (len > 0) { 158 shift = arr[idx]; 159 slb_encoding = arr[idx + 1]; 160 nptlp = arr[idx + 2]; 161 idx += 3; 162 len -= 3; 163 while (len > 0 && nptlp) { 164 lp_size = arr[idx]; 165 lp_encoding = arr[idx+1]; 166 if (slb_encoding == SLBV_L && lp_encoding == 0) 167 break; 168 169 idx += 2; 170 len -= 2; 171 nptlp--; 172 } 173 if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0) 174 break; 175 } 176 177 if (len == 0) 178 panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) " 179 "not supported by this system. Please enable huge " 180 "page backing if running under PowerKVM."); 181 182 moea64_large_page_shift = shift; 183 moea64_large_page_size = 1ULL << lp_size; 184 } 185 186 moea64_mid_bootstrap(mmup, kernelstart, kernelend); 187 moea64_late_bootstrap(mmup, kernelstart, kernelend); 188 } 189 190 static void 191 mphyp_cpu_bootstrap(mmu_t mmup, int ap) 192 { 193 struct slb *slb = PCPU_GET(slb); 194 register_t seg0; 195 int i; 196 197 /* 198 * Install kernel SLB entries 199 */ 200 201 __asm __volatile ("slbia"); 202 __asm __volatile ("slbmfee %0,%1; slbie %0;" : "=r"(seg0) : "r"(0)); 203 for (i = 0; i < 64; i++) { 204 if (!(slb[i].slbe & SLBE_VALID)) 205 continue; 206 207 __asm __volatile ("slbmte %0, %1" :: 208 "r"(slb[i].slbv), "r"(slb[i].slbe)); 209 } 210 } 211 212 static void 213 mphyp_pte_synch(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt) 214 { 215 struct lpte pte; 216 uint64_t junk; 217 218 __asm __volatile("ptesync"); 219 phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pte.pte_hi, &pte.pte_lo, 220 &junk); 221 222 pvo_pt->pte_lo |= pte.pte_lo & (LPTE_CHG | LPTE_REF); 223 } 224 225 static void 226 mphyp_pte_clear(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn, 227 u_int64_t ptebit) 228 { 229 230 if (ptebit & LPTE_CHG) 231 phyp_hcall(H_CLEAR_MOD, 0, slot); 232 if (ptebit & LPTE_REF) 233 phyp_hcall(H_CLEAR_REF, 0, slot); 234 } 235 236 static void 237 mphyp_pte_unset(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) 238 { 239 struct lpte pte; 240 uint64_t junk; 241 int err; 242 243 pvo_pt->pte_hi &= ~LPTE_VALID; 244 err = phyp_pft_hcall(H_REMOVE, 1UL << 31, slot, 245 pvo_pt->pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo, 246 &junk); 247 KASSERT(err == H_SUCCESS, ("Error removing page: %d", err)); 248 249 pvo_pt->pte_lo |= pte.pte_lo & (LPTE_CHG | LPTE_REF); 250 } 251 252 static void 253 mphyp_pte_change(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) 254 { 255 struct lpte evicted; 256 uint64_t index, junk; 257 int64_t result; 258 259 /* 260 * NB: this is protected by the global table lock, so this two-step 261 * is safe, except for the scratch-page case. No CPUs on which we run 262 * this code should be using scratch pages. 263 */ 264 KASSERT(!(pvo_pt->pte_hi & LPTE_LOCKED), 265 ("Locked pages not supported on PHYP")); 266 267 /* XXX: optimization using H_PROTECT for common case? */ 268 mphyp_pte_unset(mmu, slot, pvo_pt, vpn); 269 pvo_pt->pte_hi |= LPTE_VALID; 270 result = phyp_pft_hcall(H_ENTER, H_EXACT, slot, pvo_pt->pte_hi, 271 pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); 272 if (result != H_SUCCESS) 273 panic("mphyp_pte_change() insertion failure: %ld\n", result); 274 } 275 276 static __inline int 277 mphyp_pte_spillable_ident(u_int ptegidx, struct lpte *to_evict) 278 { 279 uint64_t slot, junk, k; 280 struct lpte pt; 281 int i, j; 282 283 /* Start at a random slot */ 284 i = mftb() % 8; 285 k = -1; 286 for (j = 0; j < 8; j++) { 287 slot = (ptegidx << 3) + (i + j) % 8; 288 phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi, &pt.pte_lo, 289 &junk); 290 291 if (pt.pte_hi & LPTE_WIRED) 292 continue; 293 294 /* This is a candidate, so remember it */ 295 k = slot; 296 297 /* Try to get a page that has not been used lately */ 298 if (!(pt.pte_lo & LPTE_REF)) { 299 memcpy(to_evict, &pt, sizeof(struct lpte)); 300 return (k); 301 } 302 } 303 304 if (k == -1) 305 return (k); 306 307 phyp_pft_hcall(H_READ, 0, k, 0, 0, &to_evict->pte_hi, 308 &to_evict->pte_lo, &junk); 309 return (k); 310 } 311 312 static int 313 mphyp_pte_insert(mmu_t mmu, u_int ptegidx, struct lpte *pvo_pt) 314 { 315 int64_t result; 316 struct lpte evicted; 317 struct pvo_entry *pvo; 318 uint64_t index, junk; 319 u_int pteg_bktidx; 320 321 /* Check for locked pages, which we can't support on this system */ 322 KASSERT(!(pvo_pt->pte_hi & LPTE_LOCKED), 323 ("Locked pages not supported on PHYP")); 324 325 /* Initialize PTE */ 326 pvo_pt->pte_hi |= LPTE_VALID; 327 pvo_pt->pte_hi &= ~LPTE_HID; 328 evicted.pte_hi = 0; 329 330 /* 331 * First try primary hash. 332 */ 333 pteg_bktidx = ptegidx; 334 result = phyp_pft_hcall(H_ENTER, 0, pteg_bktidx << 3, pvo_pt->pte_hi, 335 pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); 336 if (result == H_SUCCESS) 337 return (index & 0x07); 338 KASSERT(result == H_PTEG_FULL, ("Page insertion error: %ld " 339 "(ptegidx: %#x/%#x, PTE %#lx/%#lx", result, ptegidx, 340 moea64_pteg_count, pvo_pt->pte_hi, pvo_pt->pte_lo)); 341 342 /* 343 * Next try secondary hash. 344 */ 345 pteg_bktidx ^= moea64_pteg_mask; 346 pvo_pt->pte_hi |= LPTE_HID; 347 result = phyp_pft_hcall(H_ENTER, 0, pteg_bktidx << 3, 348 pvo_pt->pte_hi, pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); 349 if (result == H_SUCCESS) 350 return (index & 0x07); 351 KASSERT(result == H_PTEG_FULL, ("Secondary page insertion error: %ld", 352 result)); 353 354 /* 355 * Out of luck. Find a PTE to sacrifice. 356 */ 357 pteg_bktidx = ptegidx; 358 index = mphyp_pte_spillable_ident(pteg_bktidx, &evicted); 359 if (index == -1L) { 360 pteg_bktidx ^= moea64_pteg_mask; 361 index = mphyp_pte_spillable_ident(pteg_bktidx, &evicted); 362 } 363 364 if (index == -1L) { 365 /* No freeable slots in either PTEG? We're hosed. */ 366 panic("mphyp_pte_insert: overflow"); 367 return (-1); 368 } 369 370 if (pteg_bktidx == ptegidx) 371 pvo_pt->pte_hi &= ~LPTE_HID; 372 else 373 pvo_pt->pte_hi |= LPTE_HID; 374 375 /* 376 * Synchronize the sacrifice PTE with its PVO, then mark both 377 * invalid. The PVO will be reused when/if the VM system comes 378 * here after a fault. 379 */ 380 381 if (evicted.pte_hi & LPTE_HID) 382 pteg_bktidx ^= moea64_pteg_mask; /* PTEs indexed by primary */ 383 384 LIST_FOREACH(pvo, &moea64_pvo_table[pteg_bktidx], pvo_olink) { 385 if (pvo->pvo_pte.lpte.pte_hi == evicted.pte_hi) { 386 KASSERT(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID, 387 ("Invalid PVO for valid PTE!")); 388 mphyp_pte_unset(mmu, index, &pvo->pvo_pte.lpte, 389 pvo->pvo_vpn); 390 PVO_PTEGIDX_CLR(pvo); 391 moea64_pte_overflow++; 392 break; 393 } 394 } 395 396 KASSERT((pvo->pvo_pte.lpte.pte_hi | LPTE_VALID) == evicted.pte_hi, 397 ("Unable to find PVO for spilled PTE")); 398 399 /* 400 * Set the new PTE. 401 */ 402 result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pvo_pt->pte_hi, 403 pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); 404 if (result == H_SUCCESS) 405 return (index & 0x07); 406 407 panic("Page replacement error: %ld", result); 408 return (-1); 409 } 410 411 static __inline u_int 412 va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) 413 { 414 uint64_t hash; 415 int shift; 416 417 shift = large ? moea64_large_page_shift : ADDR_PIDX_SHFT; 418 hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> 419 shift); 420 return (hash & moea64_pteg_mask); 421 } 422 423 static uintptr_t 424 mphyp_pvo_to_pte(mmu_t mmu, const struct pvo_entry *pvo) 425 { 426 uint64_t vsid; 427 u_int ptegidx; 428 429 /* If the PTEG index is not set, then there is no page table entry */ 430 if (!PVO_PTEGIDX_ISSET(pvo)) 431 return (-1); 432 433 vsid = PVO_VSID(pvo); 434 ptegidx = va_to_pteg(vsid, PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); 435 436 /* 437 * We can find the actual pte entry without searching by grabbing 438 * the PTEG index from 3 unused bits in pvo_vaddr and by 439 * noticing the HID bit. 440 */ 441 if (pvo->pvo_pte.lpte.pte_hi & LPTE_HID) 442 ptegidx ^= moea64_pteg_mask; 443 444 return ((ptegidx << 3) | PVO_PTEGIDX_GET(pvo)); 445 } 446 447