/*
 * Copyright (c) 2003-2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 *
 * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit-setting races, particularly when we are trying to clean
 * a page and test the modified bit, to avoid races where the modified bit
 * might get set after our poll but before we clear the field.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
#include <sys/vmspace.h>
#include <sys/vmm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <unistd.h>
#include <pthread.h>

#include <vm/vm_page2.h>

extern int vmm_enabled;

/*
 * Invalidate the TLB on the current cpu
 *
 * (VMM enabled only)
 */
static __inline
void
vmm_cpu_invltlb(void)
{
#if 0
	/* not directly supported */
	cpu_invltlb();
#else
	/* vmm_guest_sync_addr(NULL, NULL); */
	/* For VMM mode forces vmmexit/resume */
	uint64_t rax = -1;
	__asm __volatile("syscall;"
			:
			: "a" (rax)
			:);
#endif
}

static __inline
void
vmm_cpu_invlpg(void *addr __unused)
{
	vmm_cpu_invltlb();
}

/*
 * Invalidate va in the TLB on the current cpu
 *
 * (VMM disabled only)
 */
static __inline
void
pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
{
	if (pmap == &kernel_pmap) {
		madvise((void *)va, bytes, MADV_INVAL);
	} else {
		vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
	}
}

/*
 * This is a bit of a mess because we don't know which virtual cpus are
 * mapped to which real cpus.  Basically try to optimize the degenerate
 * cases (primarily related to user processes with only one thread or only
 * one running thread), and shunt all the rest to the host cpu.  The host
 * cpu will invalidate all the real cpus the vkernel is running on.
 *
 * This can't optimize situations where a pmap is only mapped to some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the virtual kernel processes are running on fewer real-host cpus.
 * (And it will probably be faster anyway, since there is no round-trip
 * signaling overhead.)
 *
 * NOTE: The critical section protects against preemption while the pmap
 *	 is locked, which could otherwise result in a deadlock.
 */
static __inline
void
guest_sync_addr(struct pmap *pmap, volatile vpte_t *ptep, vpte_t *srcv)
{
	globaldata_t gd = mycpu;
	cpulock_t olock;
	cpulock_t nlock;

	/*
	 * Lock the pmap
	 */
	crit_enter();
	for (;;) {
		olock = pmap->pm_active_lock;
		cpu_ccfence();
		if ((olock & CPULOCK_EXCL) == 0) {
			nlock = olock | CPULOCK_EXCL;
			if (atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
		}
		cpu_pause();
		lwkt_process_ipiq();
		pthread_yield();
	}

	/*
	 * Update the pte and synchronize with other cpus.  If we can update
	 * it trivially, do so.
	 */
	if (CPUMASK_TESTZERO(pmap->pm_active) ||
	    CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		if (ptep)
			*srcv = atomic_swap_long(ptep, *srcv);
		vmm_cpu_invltlb();
	} else {
		vmm_guest_sync_addr(__DEVOLATILE(void *, ptep), srcv);
	}

	/*
	 * Unlock the pmap
	 */
	atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	crit_exit();
}
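/*
 * Illustrative sketch only (not compiled): when (ptep) is non-NULL,
 * guest_sync_addr() treats (*srcv) as both input and output.  The caller
 * loads the replacement pte value into *srcv and, on return, *srcv holds
 * the pte contents that were previously installed.  A hypothetical caller
 * that atomically zeroes a pte and then inspects the old modified bit
 * might look like this (example_zero_pte is not part of this file's API):
 */
#if 0
static void
example_zero_pte(struct pmap *pmap, volatile vpte_t *ptep)
{
	vpte_t pte = 0;			/* replacement value to install */

	guest_sync_addr(pmap, ptep, &pte);
	if (pte & VPTE_M) {
		/* the old pte was dirtied before we zeroed it */
	}
}
#endif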
/*
 * Invalidate a pte in a pmap and synchronize with target cpus
 * as required.  Throw away the modified and access bits.  Use
 * pmap_clean_pte() to do the same thing but also get an interlocked
 * modified/access status.
 *
 * Clearing the field first (basically clearing VPTE_V) prevents any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu); we then clear it again to clean out any race that
 * might have occurred before the invalidation completed.
 */
void
pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		atomic_swap_long(ptep, 0);
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

/*
 * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment it is the same as the non-quick version.
 */
void
pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	atomic_swap_long(ptep, 0);
	if (vmm_enabled == 0)
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
	else
		vmm_cpu_invltlb();
}

/*
 * Invalidate the tlb for a range of virtual addresses across all cpus
 * belonging to the pmap.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	if (vmm_enabled == 0) {
		pmap_inval_cpu(pmap, sva, eva - sva);
	} else {
		guest_sync_addr(pmap, NULL, NULL);
	}
}

/*
 * Invalidating page directory entries requires some additional
 * sophistication.  The cachemask must be cleared so the kernel
 * resynchronizes its temporary page table mappings cache.
 */
void
pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		atomic_swap_long(ptep, 0);
		pmap_inval_cpu(pmap, va, SEG_SIZE);
	} else if (CPUMASK_TESTMASK(pmap->pm_active,
				    mycpu->gd_other_cpus) == 0) {
		atomic_swap_long(ptep, 0);
		vmm_cpu_invltlb();
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

void
pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	pmap_inval_pde(ptep, pmap, va);
}
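/*
 * Illustrative sketch only (not compiled): the VPTE_RW/VPTE_M clearing
 * loop in pmap_clean_pte() below must use atomic_cmpset_long().  A naive
 * load/store sequence, as sketched here, can lose a VPTE_M bit that the
 * real kernel sets on behalf of another cpu between the load and the
 * store (example_racy_clean is hypothetical and exists only to show the
 * race being avoided):
 */
#if 0
static void
example_racy_clean(volatile vpte_t *ptep)
{
	vpte_t pte;

	pte = *ptep;				/* (1) load the pte */
	/* real kernel may set VPTE_M here for a fault on another cpu */
	*ptep = pte & ~(VPTE_RW | VPTE_M);	/* (2) store loses that VPTE_M */
}
#endif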
/*
 * This is really nasty.
 *
 * (1) The vkernel interlocks pte operations with the related vm_page_t
 *     spin-lock (and does not handle unmanaged page races).
 *
 * (2) The vkernel must also issue an invalidation to the real cpu.  It
 *     (nastily) does this while holding the spin-lock too.
 *
 * In addition, atomic ops must be used to properly interlock against
 * other cpus and the real kernel (which could be taking a fault on another
 * cpu and will adjust VPTE_M and VPTE_A appropriately).
 *
 * The atomic ops do a good job of interlocking against other cpus, but
 * we still need to lock the pte location (which we use the vm_page spin-lock
 * for) to avoid races against PG_WRITEABLE and other tests.
 *
 * Cleaning the pte involves clearing VPTE_M and VPTE_RW, synchronizing with
 * the real host, and updating the vm_page appropriately.
 *
 * If the caller passes a non-NULL (m), the caller holds the spin-lock,
 * otherwise we must acquire and release the spin-lock.  (m) is only
 * applicable to managed pages.
 */
vpte_t
pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va,
	       vm_page_t m)
{
	vpte_t pte;
	int spin = 0;

	/*
	 * Acquire (m) and spin-lock it.
	 */
	while (m == NULL) {
		pte = *ptep;
		if ((pte & VPTE_V) == 0)
			return pte;
		if ((pte & VPTE_MANAGED) == 0)
			break;
		m = PHYS_TO_VM_PAGE(pte & VPTE_FRAME);
		vm_page_spin_lock(m);

		pte = *ptep;
		if ((pte & VPTE_V) == 0) {
			vm_page_spin_unlock(m);
			m = NULL;
			continue;
		}
		if ((pte & VPTE_MANAGED) == 0) {
			vm_page_spin_unlock(m);
			m = NULL;
			continue;
		}
		if (m != PHYS_TO_VM_PAGE(pte & VPTE_FRAME)) {
			vm_page_spin_unlock(m);
			m = NULL;
			continue;
		}
		spin = 1;
		break;
	}

	if (vmm_enabled == 0) {
		for (;;) {
			pte = *ptep;
			cpu_ccfence();
			if ((pte & VPTE_RW) == 0)
				break;
			if (atomic_cmpset_long(ptep,
					       pte,
					       pte & ~(VPTE_RW | VPTE_M))) {
				pmap_inval_cpu(pmap, va, PAGE_SIZE);
				break;
			}
		}
	} else {
		pte = *ptep & ~(VPTE_RW | VPTE_M);
		guest_sync_addr(pmap, ptep, &pte);
	}

	if (m) {
		if (pte & VPTE_A) {
			vm_page_flag_set(m, PG_REFERENCED);
			atomic_clear_long(ptep, VPTE_A);
		}
		if (pte & VPTE_M) {
			vm_page_dirty(m);
		}
		if (spin)
			vm_page_spin_unlock(m);
	}
	return pte;
}

/*
 * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the tlb (the real cpu's pmap), then incorporate any races that
 * may have occurred in the meantime, and finally zero out the pte.
 */
vpte_t
pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
			vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		pte = atomic_swap_long(ptep, 0);
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
	return(pte);
}

void
cpu_invlpg(void *addr)
{
	if (vmm_enabled)
		vmm_cpu_invlpg(addr);
	else
		madvise(addr, PAGE_SIZE, MADV_INVAL);
}

void
cpu_invltlb(void)
{
	if (vmm_enabled)
		vmm_cpu_invltlb();	/* For VMM mode forces vmmexit/resume */
	else
		madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
}

/*
 * Invalidate the TLB on all cpus.  The vkernel does nothing here;
 * instead, it ignores VM_PROT_NOSYNC on pmap_enter() calls.
 */
void
smp_invltlb(void)
{
	/* do nothing */
}

void
smp_sniff(void)
{
	/* not implemented */
}

void
cpu_sniff(int dcpu __unused)
{
	/* not implemented */
}
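/*
 * Illustrative sketch only (not compiled): a hypothetical pte-removal
 * path built on pmap_inval_loadandclear().  The returned snapshot
 * already incorporates any VPTE_M/VPTE_A updates that raced with the
 * invalidation, so dirty/referenced state can be transferred to the
 * vm_page afterwards (example_remove_pte is not part of this file's API):
 */
#if 0
static void
example_remove_pte(struct pmap *pmap, volatile vpte_t *ptep, vm_offset_t va)
{
	vpte_t opte;

	opte = pmap_inval_loadandclear(ptep, pmap, va);
	if (opte & VPTE_MANAGED) {
		vm_page_t m = PHYS_TO_VM_PAGE(opte & VPTE_FRAME);

		if (opte & VPTE_M)
			vm_page_dirty(m);
		if (opte & VPTE_A)
			vm_page_flag_set(m, PG_REFERENCED);
	}
}
#endif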