/*
 * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 *
 * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit setting races, particularly when we are trying to clean
 * a page and test the modified bit, to avoid races where the modified bit
 * might get set after our poll but before we clear the field.
 */
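/*
 * Illustrative sketch (hypothetical, not part of this file's build):
 * the modified-bit race described above.  If we sampled VPTE_M before
 * revoking write access, another cpu could dirty the page between our
 * sample and our clear.  The safe ordering implemented later in this
 * file is: revoke write access first, invalidate the TLB (the real
 * cpu's pmap), and only then sample and clear VPTE_M.
 */
#if 0
	/* UNSAFE: VPTE_M can be set after the read but before the clear */
	dirty = (*ptep & VPTE_M) != 0;
	atomic_clear_long(ptep, VPTE_M);	/* may wipe out a new VPTE_M */

	/* SAFE: stop new writes, flush the real cpu's TLB, then sample */
	atomic_clear_long(ptep, VPTE_RW);
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
	dirty = (*ptep & VPTE_M) != 0;
	atomic_clear_long(ptep, VPTE_M);
#endif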
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
#include <sys/vmspace.h>
#include <sys/vmm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <unistd.h>
#include <pthread.h>

extern int vmm_enabled;

static __inline
void
vmm_cpu_invltlb(void)
{
	/* For VMM mode, forces a vmmexit/resume (flushing the guest TLB) */
	uint64_t rax = -1;

	__asm __volatile("syscall;"
			 :
			 : "a" (rax)
			 :);
}

/*
 * Invalidate va in the TLB on the current cpu
 */
static __inline
void
pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
{
	if (pmap == &kernel_pmap) {
		madvise((void *)va, bytes, MADV_INVAL);
	} else {
		vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
	}
}

/*
 * This is a bit of a mess because we don't know which virtual cpus are
 * mapped to which real cpus.  Basically try to optimize the degenerate
 * cases (primarily related to user processes with only one thread or
 * only one running thread), and shunt all the rest to the host cpu.
 * The host cpu will invalidate all real cpus the vkernel is running on.
 *
 * This can't optimize situations where a pmap is only mapped to some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the virtual kernel processes are running on fewer real-host cpus.
 * (And it will probably be faster anyway since there is no round-trip
 * signaling overhead).
 *
 * NOTE: The critical section protects against preemption while the pmap
 *	 is locked, which could otherwise result in a deadlock.
 */
static __inline
void
guest_sync_addr(struct pmap *pmap,
		volatile vpte_t *dst_ptep, volatile vpte_t *src_ptep)
{
	globaldata_t gd = mycpu;
	cpumask_t oactive;
	cpumask_t nactive;

	crit_enter();
	if (pmap->pm_active == 0 &&
	    atomic_cmpset_cpumask(&pmap->pm_active, 0, CPUMASK_LOCK)) {
		/*
		 * Avoid IPIs if pmap is inactive and we can trivially
		 * lock it.
		 */
		*dst_ptep = *src_ptep;
		vmm_cpu_invltlb();
	} else if (pmap->pm_active == gd->gd_cpumask &&
		   atomic_cmpset_cpumask(&pmap->pm_active,
			    gd->gd_cpumask, gd->gd_cpumask | CPUMASK_LOCK)) {
		/*
		 * Avoid IPIs if only our cpu is using the pmap and we
		 * can trivially lock it.
		 */
		*dst_ptep = *src_ptep;
		vmm_cpu_invltlb();
	} else {
		/*
		 * Lock the pmap
		 */
		for (;;) {
			oactive = pmap->pm_active;
			cpu_ccfence();
			if ((oactive & CPUMASK_LOCK) == 0) {
				nactive = oactive | CPUMASK_LOCK;
				if (atomic_cmpset_cpumask(&pmap->pm_active,
							  oactive, nactive)) {
					break;
				}
			}
			cpu_pause();
			lwkt_process_ipiq();
			pthread_yield();
		}
		vmm_guest_sync_addr(__DEVOLATILE(void *, dst_ptep),
				    __DEVOLATILE(void *, src_ptep));
	}
	atomic_clear_cpumask(&pmap->pm_active, CPUMASK_LOCK);
	crit_exit();
}
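/*
 * Hypothetical usage sketch (not compiled): guest_sync_addr() copies
 * *src_ptep to *dst_ptep while the pmap is locked against other cpus.
 * The callers below use it in both directions: writing a zero into a
 * pte to invalidate it, and reading a pte back into a local so the
 * interlocked VPTE_M state can be examined.
 */
#if 0
	vpte_t tmp;

	tmp = 0;
	guest_sync_addr(pmap, ptep, &tmp);	/* store 0 -> *ptep */

	guest_sync_addr(pmap, &tmp, ptep);	/* load *ptep -> tmp */
#endif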
/*
 * Invalidate a pte in a pmap and synchronize with target cpus
 * as required.  Throw away the modified and access bits.  Use
 * pmap_clean_pte() to do the same thing but also get an interlocked
 * modified/access status.
 *
 * Clearing the field first (basically clearing VPTE_V) prevents any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu), then clear it again to clean out any race that
 * might have occurred before the invalidation completed.
 */
void
pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		*ptep = 0;
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

/*
 * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment this is the same as the non-quick version.
 */
void
pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	*ptep = 0;
	if (vmm_enabled)
		vmm_cpu_invltlb();
	else
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
}

/*
 * Invalidating page directory entries requires some additional
 * sophistication.  The cachemask must be cleared so the kernel
 * resynchronizes its temporary page table mappings cache.
 */
void
pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		*ptep = 0;
		pmap_inval_cpu(pmap, va, SEG_SIZE);
	} else if ((pmap->pm_active & mycpu->gd_other_cpus) == 0) {
		*ptep = 0;
		vmm_cpu_invltlb();
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

void
pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	pmap_inval_pde(ptep, pmap, va);
}

/*
 * These carefully handle interactions with other cpus and return
 * the original vpte.  Clearing VPTE_RW prevents us from racing the
 * setting of VPTE_M, allowing us to invalidate the tlb (the real cpu's
 * pmap) and get good status for VPTE_M.
 *
 * When messing with page directory entries we have to clear the cpu
 * mask to force a reload of the kernel's page table mapping cache.
 *
 * clean: clear VPTE_M and VPTE_RW
 * setro: clear VPTE_RW
 * load&clear: clear the entire field
 */
vpte_t
pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);	/* XXX */
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			pte = *ptep;
		} else {
			guest_sync_addr(pmap, &pte, ptep);
		}
		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
	}
	return(pte);
}

vpte_t
pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, SEG_SIZE);
			pte = *ptep;
		} else {
			guest_sync_addr(pmap, &pte, ptep);
		}
		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
	}
	return(pte);
}
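/*
 * Hypothetical caller sketch (not compiled): a pageout-style path can
 * use the interlocked return value of pmap_clean_pte() to harvest the
 * modified bit without losing a racing write.  The variable m and the
 * vm_page_dirty() call are assumptions for illustration only.
 */
#if 0
	pte = pmap_clean_pte(ptep, pmap, va);
	if (pte & VPTE_M)
		vm_page_dirty(m);	/* page was written; mark it dirty */
#endif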
/*
 * This is an odd case and I'm not sure whether it even occurs in normal
 * operation.  Turn off write access to the page, clean out the tlb
 * (the real cpu's pmap), and deal with any VPTE_M race that may have
 * occurred.  VPTE_M is not cleared.
 */
vpte_t
pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;
	vpte_t npte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			pte |= *ptep & VPTE_M;
		} else {
			guest_sync_addr(pmap, &npte, ptep);
			pte |= npte & VPTE_M;
		}
	}
	return(pte);
}

/*
 * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the tlb (the real cpu's pmap), then incorporate any races that
 * may have occurred in the mean time, and finally zero out the pte.
 */
vpte_t
pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
			vm_offset_t va)
{
	vpte_t pte;
	vpte_t npte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			pte |= *ptep & (VPTE_A | VPTE_M);
		} else {
			guest_sync_addr(pmap, &npte, ptep);
			pte |= npte & (VPTE_A | VPTE_M);
		}
	}
	*ptep = 0;
	return(pte);
}

/*
 * Synchronize a kvm mapping originally made for the private use of
 * some other cpu so it can be used on all cpus.
 *
 * XXX add MADV_RESYNC to improve performance.
 *
 * We don't need to do anything because our pmap_inval_pte_quick()
 * synchronizes it immediately.
 */
void
pmap_kenter_sync(vm_offset_t va __unused)
{
}

void
cpu_invlpg(void *addr)
{
	if (vmm_enabled)
		vmm_cpu_invltlb();	/* For VMM mode forces vmmexit/resume */
	else
		madvise(addr, PAGE_SIZE, MADV_INVAL);
}

void
cpu_invltlb(void)
{
	if (vmm_enabled)
		vmm_cpu_invltlb();	/* For VMM mode forces vmmexit/resume */
	else
		madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
}

/*
 * XXX should invalidate the tlb on all cpus; at the moment
 * pmap_inval_pte_quick() handles that, so do nothing here.
 */
void
smp_invltlb(void)
{
}
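/*
 * Hypothetical usage sketch (not compiled): after replacing a kernel
 * pte directly, the stale translation must be flushed from this cpu's
 * TLB.  In the vkernel this boils down to madvise(MADV_INVAL) on the
 * mapping, or a forced vmmexit/resume in VMM mode.
 */
#if 0
	*ptep = npte;			/* install the new mapping */
	cpu_invlpg((void *)va);		/* flush the stale TLB entry */
#endif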