/*
 * Copyright (c) 2003-2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 *
 * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit-setting races, particularly when we are trying to clean
 * a page and test the modified bit to avoid races where the modified bit
 * might get set after our poll but before we clear the field.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
#include <sys/vmspace.h>
#include <sys/vmm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <unistd.h>

#include <vm/vm_page2.h>

extern int vmm_enabled;

/*
 * Invalidate the TLB on the current cpu.
 *
 * (VMM enabled only)
 */
static __inline
void
vmm_cpu_invltlb(void)
{
#if 0
        /* not directly supported */
        cpu_invltlb();
#else
        /* vmm_guest_sync_addr(NULL, NULL); */
        /* For VMM mode, forces a vmmexit/resume */
        uint64_t rax = -1;
        __asm __volatile("syscall;"
                        :
                        : "a" (rax)
                        :);
#endif
}

static __inline
void
vmm_cpu_invlpg(void *addr __unused)
{
        vmm_cpu_invltlb();
}

/*
 * Invalidate va in the TLB on the current cpu.
 *
 * (VMM disabled only)
 */
static __inline
void
pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
{
        if (pmap == &kernel_pmap) {
                madvise((void *)va, bytes, MADV_INVAL);
        } else {
                vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
        }
}

/*
 * This is a bit of a mess because we don't know which virtual cpus are
 * mapped to which real cpus.  Basically try to optimize the degenerate
 * cases (primarily related to user processes with only one thread or only
 * one running thread), and shunt all the rest to the host cpu.  The host
 * cpu will invalidate all real cpus the vkernel is running on.
 *
 * This can't optimize situations where a pmap is only mapped to some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the virtual kernel processes are running on fewer real-host cpus.
 * (And it will probably be faster anyway since there's no round-trip
 * signaling overhead).
 *
 * NOTE: The critical section protects against preemption while the pmap
 *       is locked, which could otherwise result in a deadlock.
 */
static __inline
void
guest_sync_addr(struct pmap *pmap, volatile vpte_t *ptep, vpte_t *srcv)
{
        globaldata_t gd = mycpu;
        cpulock_t olock;
        cpulock_t nlock;

        /*
         * Lock the pmap
         */
        crit_enter();
        for (;;) {
                olock = pmap->pm_active_lock;
                cpu_ccfence();
                if ((olock & CPULOCK_EXCL) == 0) {
                        nlock = olock | CPULOCK_EXCL;
                        if (atomic_cmpset_int(&pmap->pm_active_lock,
                                              olock, nlock)) {
                                break;
                        }
                }
                cpu_pause();
                lwkt_process_ipiq();
                vkernel_yield();
        }

        /*
         * Update the pte and synchronize with other cpus.  If we can update
         * it trivially, do so.
         */
        if (CPUMASK_TESTZERO(pmap->pm_active) ||
            CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
                if (ptep)
                        *srcv = atomic_swap_long(ptep, *srcv);
                vmm_cpu_invltlb();
        } else {
                vmm_guest_sync_addr(__DEVOLATILE(void *, ptep), srcv);
        }

        /*
         * Unlock the pmap
         */
        atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
        crit_exit();
}

/*
 * Invalidate a pte in a pmap and synchronize with target cpus
 * as required.
 * Throw away the modified and access bits.  Use
 * pmap_clean_pte() to do the same thing but also get an interlocked
 * modified/access status.
 *
 * Clearing the field first (basically clearing VPTE_V) prevents any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu), then clear it again to clean out any race that
 * might have occurred before the invalidation completed.
 */
void
pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                atomic_swap_long(ptep, 0);
                pmap_inval_cpu(pmap, va, PAGE_SIZE);
        } else {
                pte = 0;
                guest_sync_addr(pmap, ptep, &pte);
        }
}

/*
 * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment it is the same as the non-quick version.
 */
void
pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        atomic_swap_long(ptep, 0);
        if (vmm_enabled == 0)
                pmap_inval_cpu(pmap, va, PAGE_SIZE);
        else
                vmm_cpu_invltlb();
}

/*
 * Invalidate the tlb for a range of virtual addresses across all cpus
 * belonging to the pmap.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
        if (vmm_enabled == 0) {
                pmap_inval_cpu(pmap, sva, eva - sva);
        } else {
                guest_sync_addr(pmap, NULL, NULL);
        }
}

/*
 * Invalidating page directory entries requires some additional
 * sophistication.  The cachemask must be cleared so the kernel
 * resynchronizes its temporary page table mappings cache.
 */
void
pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                atomic_swap_long(ptep, 0);
                pmap_inval_cpu(pmap, va, SEG_SIZE);
        } else if (CPUMASK_TESTMASK(pmap->pm_active,
                                    mycpu->gd_other_cpus) == 0) {
                atomic_swap_long(ptep, 0);
                vmm_cpu_invltlb();
        } else {
                pte = 0;
                guest_sync_addr(pmap, ptep, &pte);
        }
}

/*
 * Quick version of pmap_inval_pde().  For the moment it simply calls
 * the non-quick version.
 */
void
pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        pmap_inval_pde(ptep, pmap, va);
}

/*
 * This is really nasty.
 *
 * (1) The vkernel interlocks pte operations with the related vm_page_t
 *     spin-lock (and doesn't handle unmanaged page races).
 *
 * (2) The vkernel must also issue an invalidation to the real cpu.  It
 *     (nastily) does this while holding the spin-lock too.
 *
 * In addition, atomic ops must be used to properly interlock against
 * other cpus and the real kernel (which could be taking a fault on another
 * cpu and will adjust VPTE_M and VPTE_A appropriately).
 *
 * The atomic ops do a good job of interlocking against other cpus, but
 * we still need to lock the pte location (which we use the vm_page spin-lock
 * for) to avoid races against PG_WRITEABLE and other tests.
 *
 * Cleaning the pte involves clearing VPTE_M and VPTE_RW, synchronizing with
 * the real host, and updating the vm_page appropriately.
 *
 * If the caller passes a non-NULL (m), the caller holds the spin-lock,
 * otherwise we must acquire and release the spin-lock.  (m) is only
 * applicable to managed pages.
 */
vpte_t
pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va,
               vm_page_t m)
{
        vpte_t pte;
        int spin = 0;

        /*
         * Acquire (m) and spin-lock it.
         */
        while (m == NULL) {
                pte = *ptep;
                if ((pte & VPTE_V) == 0)
                        return pte;
                if ((pte & VPTE_MANAGED) == 0)
                        break;
                m = PHYS_TO_VM_PAGE(pte & VPTE_FRAME);
                vm_page_spin_lock(m);

                pte = *ptep;
                if ((pte & VPTE_V) == 0) {
                        vm_page_spin_unlock(m);
                        m = NULL;
                        continue;
                }
                if ((pte & VPTE_MANAGED) == 0) {
                        vm_page_spin_unlock(m);
                        m = NULL;
                        continue;
                }
                if (m != PHYS_TO_VM_PAGE(pte & VPTE_FRAME)) {
                        vm_page_spin_unlock(m);
                        m = NULL;
                        continue;
                }
                spin = 1;
                break;
        }

        if (vmm_enabled == 0) {
                for (;;) {
                        pte = *ptep;
                        cpu_ccfence();
                        if ((pte & VPTE_RW) == 0)
                                break;
                        if (atomic_cmpset_long(ptep,
                                               pte,
                                               pte & ~(VPTE_RW | VPTE_M))) {
                                pmap_inval_cpu(pmap, va, PAGE_SIZE);
                                break;
                        }
                }
        } else {
                pte = *ptep & ~(VPTE_RW | VPTE_M);
                guest_sync_addr(pmap, ptep, &pte);
        }

        if (m) {
                if (pte & VPTE_A) {
                        vm_page_flag_set(m, PG_REFERENCED);
                        atomic_clear_long(ptep, VPTE_A);
                }
                if (pte & VPTE_M) {
                        vm_page_dirty(m);
                }
                if (spin)
                        vm_page_spin_unlock(m);
        }
        return pte;
}

/*
 * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the tlb (the real cpu's pmap), then incorporate any races that
 * may have occurred in the meantime, and finally zero out the pte.
 */
vpte_t
pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
                        vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                pte = atomic_swap_long(ptep, 0);
                pmap_inval_cpu(pmap, va, PAGE_SIZE);
        } else {
                pte = 0;
                guest_sync_addr(pmap, ptep, &pte);
        }
        return(pte);
}

void
cpu_invlpg(void *addr)
{
        if (vmm_enabled)
                vmm_cpu_invlpg(addr);
        else
                madvise(addr, PAGE_SIZE, MADV_INVAL);
}

void
cpu_invltlb(void)
{
        if (vmm_enabled)
                vmm_cpu_invltlb();      /* For VMM mode, forces vmmexit/resume */
        else
                madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
}

/*
 * Invalidate the TLB on all cpus.  Instead what the vkernel does is
 * ignore VM_PROT_NOSYNC on pmap_enter() calls.
 */
void
smp_invltlb(void)
{
        /* do nothing */
}

void
smp_sniff(void)
{
        /* not implemented */
}

void
cpu_sniff(int dcpu __unused)
{
        /* not implemented */
}