1 /* 2 * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $ 35 */ 36 37 /* 38 * pmap invalidation support code. 
Certain hardware requirements must 39 * be dealt with when manipulating page table entries and page directory 40 * entries within a pmap. In particular, we cannot safely manipulate 41 * page tables which are in active use by another cpu (even if it is 42 * running in userland) for two reasons: First, TLB writebacks will 43 * race against our own modifications and tests. Second, even if we 44 * were to use bus-locked instruction we can still screw up the 45 * target cpu's instruction pipeline due to Intel cpu errata. 46 * 47 * For our virtual page tables, the real kernel will handle SMP interactions 48 * with pmaps that may be active on other cpus. Even so, we have to be 49 * careful about bit setting races particularly when we are trying to clean 50 * a page and test the modified bit to avoid races where the modified bit 51 * might get set after our poll but before we clear the field. 52 */ 53 #include <sys/param.h> 54 #include <sys/systm.h> 55 #include <sys/kernel.h> 56 #include <sys/proc.h> 57 #include <sys/vmmeter.h> 58 #include <sys/thread2.h> 59 #include <sys/cdefs.h> 60 #include <sys/mman.h> 61 #include <sys/vmspace.h> 62 #include <sys/vmm.h> 63 64 #include <vm/vm.h> 65 #include <vm/pmap.h> 66 #include <vm/vm_object.h> 67 68 #include <machine/cputypes.h> 69 #include <machine/md_var.h> 70 #include <machine/specialreg.h> 71 #include <machine/smp.h> 72 #include <machine/globaldata.h> 73 #include <machine/pmap.h> 74 #include <machine/pmap_inval.h> 75 76 #include <unistd.h> 77 #include <pthread.h> 78 79 extern int vmm_enabled; 80 81 static __inline 82 void 83 vmm_cpu_invltlb(void) 84 { 85 /* For VMM mode forces vmmexit/resume */ 86 uint64_t rax = -1; 87 __asm __volatile("syscall;" 88 : 89 : "a" (rax) 90 :); 91 } 92 93 /* 94 * Invalidate va in the TLB on the current cpu 95 */ 96 static __inline 97 void 98 pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes) 99 { 100 if (pmap == &kernel_pmap) { 101 madvise((void *)va, bytes, MADV_INVAL); 102 } else { 103 
vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0); 104 } 105 } 106 107 /* 108 * This is a bit of a mess because we don't know what virtual cpus are 109 * mapped to real cpus. Basically try to optimize the degenerate cases 110 * (primarily related to user processes with only one thread or only one 111 * running thread), and shunt all the rest to the host cpu. The host cpu 112 * will invalidate all real cpu's the vkernel is running on. 113 * 114 * This can't optimize situations where a pmap is only mapped to some of 115 * the virtual cpus, though shunting to the real host will still be faster 116 * if the virtual kernel processes are running on fewer real-host cpus. 117 * (And probably will be faster anyway since there's no round-trip signaling 118 * overhead). 119 * 120 * NOTE: The critical section protects against preemption while the pmap 121 * is locked, which could otherwise result in a deadlock. 122 */ 123 static __inline 124 void 125 guest_sync_addr(struct pmap *pmap, 126 volatile vpte_t *dst_ptep, volatile vpte_t *src_ptep) 127 { 128 globaldata_t gd = mycpu; 129 cpulock_t olock; 130 cpulock_t nlock; 131 132 /* 133 * Lock the pmap 134 */ 135 crit_enter(); 136 for (;;) { 137 olock = pmap->pm_active_lock; 138 cpu_ccfence(); 139 if ((olock & CPULOCK_EXCL) == 0) { 140 nlock = olock | CPULOCK_EXCL; 141 if (atomic_cmpset_int(&pmap->pm_active_lock, 142 olock, nlock)) { 143 break; 144 } 145 } 146 cpu_pause(); 147 lwkt_process_ipiq(); 148 pthread_yield(); 149 } 150 151 /* 152 * Update the pte and synchronize with other cpus. If we can update 153 * it trivially, do so. 
154 */ 155 if (CPUMASK_TESTZERO(pmap->pm_active) || 156 CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) { 157 *dst_ptep = *src_ptep; 158 vmm_cpu_invltlb(); 159 } else { 160 vmm_guest_sync_addr(__DEVOLATILE(void *, dst_ptep), 161 __DEVOLATILE(void *, src_ptep)); 162 } 163 164 /* 165 * Unlock the pmap 166 */ 167 atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL); 168 crit_exit(); 169 } 170 171 /* 172 * Invalidate a pte in a pmap and synchronize with target cpus 173 * as required. Throw away the modified and access bits. Use 174 * pmap_clean_pte() to do the same thing but also get an interlocked 175 * modified/access status. 176 * 177 * Clearing the field first (basically clearing VPTE_V) prevents any 178 * new races from occuring while we invalidate the TLB (i.e. the pmap 179 * on the real cpu), then clear it again to clean out any race that 180 * might have occured before the invalidation completed. 181 */ 182 void 183 pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va) 184 { 185 vpte_t pte; 186 187 if (vmm_enabled == 0) { 188 *ptep = 0; 189 pmap_inval_cpu(pmap, va, PAGE_SIZE); 190 } else { 191 pte = 0; 192 guest_sync_addr(pmap, ptep, &pte); 193 } 194 } 195 196 /* 197 * Same as pmap_inval_pte() but only synchronize with the current 198 * cpu. For the moment its the same as the non-quick version. 199 */ 200 void 201 pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va) 202 { 203 *ptep = 0; 204 if (vmm_enabled) 205 vmm_cpu_invltlb(); 206 else 207 pmap_inval_cpu(pmap, va, PAGE_SIZE); 208 } 209 210 /* 211 * Invalidating page directory entries requires some additional 212 * sophistication. The cachemask must be cleared so the kernel 213 * resynchronizes its temporary page table mappings cache. 
 */
void
pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		/* non-VMM: zero the pde and invalidate the whole segment */
		*ptep = 0;
		pmap_inval_cpu(pmap, va, SEG_SIZE);
	} else if (CPUMASK_TESTMASK(pmap->pm_active,
				    mycpu->gd_other_cpus) == 0) {
		/* VMM, pmap not active on any other cpu: local invltlb only */
		*ptep = 0;
		vmm_cpu_invltlb();
	} else {
		/* VMM, pmap active elsewhere: synchronize zero via host */
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

/*
 * For the moment identical to the full pde invalidation above.
 */
void
pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	pmap_inval_pde(ptep, pmap, va);
}

/*
 * These carefully handle interactions with other cpus and return
 * the original vpte.  Clearing VPTE_RW prevents us from racing the
 * setting of VPTE_M, allowing us to invalidate the tlb (the real cpu's
 * pmap) and get good status for VPTE_M.
 *
 * When messing with page directory entries we have to clear the cpu
 * mask to force a reload of the kernel's page table mapping cache.
 *
 * clean: clear VPTE_M and VPTE_RW
 * setro: clear VPTE_RW
 * load&clear: clear entire field
 */
/*
 * NOTE(review): the #include below appears to be unused (no stdio
 * calls in this file) and is misplaced mid-file — confirm and remove,
 * or hoist it to the include block at the top of the file.
 */
#include<stdio.h>
/*
 * Clean a pte: clear VPTE_RW first so new modified-bit races cannot
 * start, invalidate the tlb (the real cpu's pmap), then re-sample and
 * clear VPTE_RW|VPTE_M to pick up any race that completed before the
 * invalidation.  Returns the pte with interlocked VPTE_M status.
 */
vpte_t
pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);	/* XXX */
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			/* re-sample after invalidation to catch races */
			pte = *ptep;
		} else {
			/* host copies *ptep into pte with synchronization */
			guest_sync_addr(pmap, &pte, ptep);
		}
		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
	}
	return(pte);
}

/*
 * Same as pmap_clean_pte() but for a page directory entry; the
 * invalidation covers the whole segment the pde maps.
 */
vpte_t
pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, SEG_SIZE);
			/* re-sample after invalidation to catch races */
			pte = *ptep;
		} else {
			/* host copies *ptep into pte with synchronization */
			guest_sync_addr(pmap, &pte, ptep);
		}
		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
	}
	return(pte);
}

/*
 * This is an odd case and I'm not sure whether it
even occurs in normal 293 * operation. Turn off write access to the page, clean out the tlb 294 * (the real cpu's pmap), and deal with any VPTE_M race that may have 295 * occured. VPTE_M is not cleared. 296 */ 297 vpte_t 298 pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va) 299 { 300 vpte_t pte; 301 vpte_t npte; 302 303 pte = *ptep; 304 if (pte & VPTE_V) { 305 atomic_clear_long(ptep, VPTE_RW); 306 if (vmm_enabled == 0) { 307 pmap_inval_cpu(pmap, va, PAGE_SIZE); 308 pte |= *ptep & VPTE_M; 309 } else { 310 guest_sync_addr(pmap, &npte, ptep); 311 pte |= npte & VPTE_M; 312 } 313 } 314 return(pte); 315 } 316 317 /* 318 * This is a combination of pmap_inval_pte() and pmap_clean_pte(). 319 * Firts prevent races with the 'A' and 'M' bits, then clean out 320 * the tlb (the real cpu's pmap), then incorporate any races that 321 * may have occured in the mean time, and finally zero out the pte. 322 */ 323 vpte_t 324 pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap, 325 vm_offset_t va) 326 { 327 vpte_t pte; 328 vpte_t npte; 329 330 pte = *ptep; 331 if (pte & VPTE_V) { 332 pte = *ptep; 333 atomic_clear_long(ptep, VPTE_RW); 334 if (vmm_enabled == 0) { 335 pmap_inval_cpu(pmap, va, PAGE_SIZE); 336 pte |= *ptep & (VPTE_A | VPTE_M); 337 } else { 338 guest_sync_addr(pmap, &npte, ptep); 339 pte |= npte & (VPTE_A | VPTE_M); 340 } 341 } 342 *ptep = 0; 343 return(pte); 344 } 345 346 /* 347 * Synchronize a kvm mapping originally made for the private use on 348 * some other cpu so it can be used on all cpus. 349 * 350 * XXX add MADV_RESYNC to improve performance. 351 * 352 * We don't need to do anything because our pmap_inval_pte_quick() 353 * synchronizes it immediately. 
354 */ 355 void 356 pmap_kenter_sync(vm_offset_t va __unused) 357 { 358 } 359 360 void 361 cpu_invlpg(void *addr) 362 { 363 if (vmm_enabled) 364 vmm_cpu_invltlb(); /* For VMM mode forces vmmexit/resume */ 365 else 366 madvise(addr, PAGE_SIZE, MADV_INVAL); 367 } 368 369 void 370 cpu_invltlb(void) 371 { 372 if (vmm_enabled) 373 vmm_cpu_invltlb(); /* For VMM mode forces vmmexit/resume */ 374 else 375 madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL); 376 } 377 378 void 379 smp_invltlb(void) 380 { 381 /* XXX must invalidate the tlb on all cpus */ 382 /* at the moment pmap_inval_pte_quick */ 383 /* do nothing */ 384 } 385