/*
 * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 *
 * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit setting races, particularly when we are trying to clean
 * a page and test the modified bit, to avoid races where the modified bit
 * might get set after our poll but before we clear the field.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
#include <sys/vmspace.h>
#include <sys/vmm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <unistd.h>
#include <pthread.h>

extern int vmm_enabled;

static __inline
void
vmm_cpu_invltlb(void)
{
	/* For VMM mode, forces a vmmexit/resume */
	uint64_t rax = -1;
	__asm __volatile("syscall;"
			:
			: "a" (rax)
			:);
}

/*
 * Invalidate va in the TLB on the current cpu
 */
static __inline
void
pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
{
	if (pmap == &kernel_pmap) {
		madvise((void *)va, bytes, MADV_INVAL);
	} else {
		vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
	}
}

/*
 * This is a bit of a mess because we don't know what virtual cpus are
 * mapped to real cpus.  Basically try to optimize the degenerate cases
 * (primarily related to user processes with only one thread or only one
 * running thread), and shunt all the rest to the host cpu.  The host cpu
 * will invalidate all real cpus the vkernel is running on.
 *
 * This can't optimize situations where a pmap is only mapped to some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the virtual kernel processes are running on fewer real-host cpus.
 * (And it will probably be faster anyway since there is no round-trip
 * signaling overhead.)
 *
 * NOTE: The critical section protects against preemption while the pmap
 *	 is locked, which could otherwise result in a deadlock.
 */
static __inline
void
guest_sync_addr(struct pmap *pmap,
		volatile vpte_t *dst_ptep, volatile vpte_t *src_ptep)
{
	globaldata_t gd = mycpu;
	cpulock_t olock;
	cpulock_t nlock;

	/*
	 * Lock the pmap
	 */
	crit_enter();
	for (;;) {
		olock = pmap->pm_active_lock;
		cpu_ccfence();
		if ((olock & CPULOCK_EXCL) == 0) {
			nlock = olock | CPULOCK_EXCL;
			if (atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
		}
		cpu_pause();
		lwkt_process_ipiq();
		pthread_yield();
	}

	/*
	 * Update the pte and synchronize with other cpus.  If we can update
	 * it trivially, do so.
	 */
	if (CPUMASK_TESTZERO(pmap->pm_active) ||
	    CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		if (src_ptep)
			*dst_ptep = *src_ptep;
		vmm_cpu_invltlb();
	} else {
		vmm_guest_sync_addr(__DEVOLATILE(void *, dst_ptep),
				    __DEVOLATILE(void *, src_ptep));
	}

	/*
	 * Unlock the pmap
	 */
	atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	crit_exit();
}
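/*
 * Note on guest_sync_addr() usage: once the target cpus are synchronized,
 * *dst_ptep receives *src_ptep.  The callers below use this in two ways:
 * pass (ptep, &newval) to force a new value into the real pte, or pass
 * (&localcopy, ptep) to snapshot the pte after the invalidation so that
 * racing VPTE_M / VPTE_A updates are not lost.  In the non-trivial case
 * the same exchange is handed to the real host kernel via
 * vmm_guest_sync_addr().
 */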
/*
 * Invalidate a pte in a pmap and synchronize with target cpus
 * as required.  Throw away the modified and access bits.  Use
 * pmap_clean_pte() to do the same thing but also get an interlocked
 * modified/access status.
 *
 * Clearing the field first (basically clearing VPTE_V) prevents any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu), then clear it again to clean out any race that
 * might have occurred before the invalidation completed.
 */
void
pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		*ptep = 0;
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

/*
 * Invalidate the tlb for a range of virtual addresses across all cpus
 * belonging to the pmap.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	if (vmm_enabled == 0) {
		pmap_inval_cpu(pmap, sva, eva - sva);
	} else {
		guest_sync_addr(pmap, NULL, NULL);
	}
}

/*
 * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment it's the same as the non-quick version.
 */
void
pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	*ptep = 0;
	if (vmm_enabled)
		vmm_cpu_invltlb();
	else
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
}

/*
 * Invalidating page directory entries requires some additional
 * sophistication.  The cachemask must be cleared so the kernel
 * resynchronizes its temporary page table mappings cache.
 */
void
pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		*ptep = 0;
		pmap_inval_cpu(pmap, va, SEG_SIZE);
	} else if (CPUMASK_TESTMASK(pmap->pm_active,
				    mycpu->gd_other_cpus) == 0) {
		*ptep = 0;
		vmm_cpu_invltlb();
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

/*
 * Same as pmap_inval_pde() but only synchronize with the current
 * cpu.  For the moment it's the same as the non-quick version.
 */
void
pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	pmap_inval_pde(ptep, pmap, va);
}

/*
 * These carefully handle interactions with other cpus and return
 * the original vpte.  Clearing VPTE_RW prevents us from racing the
 * setting of VPTE_M, allowing us to invalidate the tlb (the real cpu's
 * pmap) and get good status for VPTE_M.
 *
 * When messing with page directory entries we have to clear the cpu
 * mask to force a reload of the kernel's page table mapping cache.
 *
 * clean:	clear VPTE_M and VPTE_RW
 * setro:	clear VPTE_RW
 * load&clear:	clear the entire field
 */
vpte_t
pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);	/* XXX */
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			pte = *ptep;
		} else {
			guest_sync_addr(pmap, &pte, ptep);
		}
		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
	}
	return(pte);
}

vpte_t
pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, SEG_SIZE);
			pte = *ptep;
		} else {
			guest_sync_addr(pmap, &pte, ptep);
		}
		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
	}
	return(pte);
}
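/*
 * Illustrative sketch (not compiled): a hypothetical caller that cleans a
 * page and folds any modified-bit race into the vm_page, as described in
 * the header comment.  The caller name and its vm_page argument are
 * assumptions for illustration only.
 */
#if 0
static void
example_clean_page(struct pmap *pmap, volatile vpte_t *ptep, vm_offset_t va,
		   vm_page_t m)
{
	vpte_t pte;

	/*
	 * pmap_clean_pte() clears VPTE_RW and VPTE_M in the vpte and
	 * returns the prior contents, including any VPTE_M that raced in
	 * before the tlb invalidation completed.
	 */
	pte = pmap_clean_pte(ptep, pmap, va);
	if (pte & VPTE_M)
		vm_page_dirty(m);
}
#endif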
/*
 * This is an odd case and I'm not sure whether it even occurs in normal
 * operation.  Turn off write access to the page, clean out the tlb
 * (the real cpu's pmap), and deal with any VPTE_M race that may have
 * occurred.  VPTE_M is not cleared.
 */
vpte_t
pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;
	vpte_t npte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			pte |= *ptep & VPTE_M;
		} else {
			guest_sync_addr(pmap, &npte, ptep);
			pte |= npte & VPTE_M;
		}
	}
	return(pte);
}

/*
 * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the tlb (the real cpu's pmap), then incorporate any races that
 * may have occurred in the meantime, and finally zero out the pte.
 */
vpte_t
pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
			vm_offset_t va)
{
	vpte_t pte;
	vpte_t npte;

	pte = *ptep;
	if (pte & VPTE_V) {
		pte = *ptep;
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			pte |= *ptep & (VPTE_A | VPTE_M);
		} else {
			guest_sync_addr(pmap, &npte, ptep);
			pte |= npte & (VPTE_A | VPTE_M);
		}
	}
	*ptep = 0;
	return(pte);
}

void
cpu_invlpg(void *addr)
{
	if (vmm_enabled)
		vmm_cpu_invltlb(); /* For VMM mode, forces a vmmexit/resume */
	else
		madvise(addr, PAGE_SIZE, MADV_INVAL);
}

void
cpu_invltlb(void)
{
	if (vmm_enabled)
		vmm_cpu_invltlb(); /* For VMM mode, forces a vmmexit/resume */
	else
		madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
}

void
smp_invltlb(void)
{
	/* XXX must invalidate the tlb on all cpus */
	/* at the moment pmap_inval_pte_quick */
	/* do nothing */
}

void
smp_sniff(void)
{
	/* not implemented */
}