/*
 * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 *
 * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit-setting races, particularly when we are trying to clean
 * a page and test the modified bit, to avoid races where the modified bit
 * might get set after our poll but before we clear the field.
 */
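/*
 * Summary of the interlock used by the pte-cleaning routines below
 * (illustrative sketch only; pmap_clean_pte() further down is the real
 * code, and VPTE_RW/VPTE_M are the virtual pte writeable/modified bits
 * used throughout this file):
 *
 *	atomic_clear_long(ptep, VPTE_RW);	 stop new writes
 *	pmap_inval_cpu(pmap, va, PAGE_SIZE);	 flush the real TLB
 *	dirty = *ptep & VPTE_M;			 VPTE_M is now stable
 */
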
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
#include <sys/vmspace.h>
#include <sys/vmm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <unistd.h>
#include <pthread.h>

extern int vmm_enabled;

static __inline
void
vmm_cpu_invltlb(void)
{
	/*
	 * In VMM mode a syscall with %rax = -1 forces a vmmexit/resume,
	 * which invalidates the guest TLB.  The SYSCALL instruction
	 * clobbers %rcx and %r11 and returns a value in %rax, so declare
	 * all three to the compiler.
	 */
	uint64_t rax = -1;
	__asm __volatile("syscall;"
			: "+a" (rax)
			:
			: "rcx", "r11", "memory");
}

/*
 * Invalidate va in the TLB of the current cpu.  For the vkernel this
 * means asking the real (host) kernel to invalidate its mappings:
 * madvise(MADV_INVAL) for the kernel pmap, vmspace_mcontrol() for
 * user pmaps.
 */
static __inline
void
pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
{
	if (pmap == &kernel_pmap) {
		madvise((void *)va, bytes, MADV_INVAL);
	} else {
		vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
	}
}

/*
 * This is a bit of a mess because we don't know which virtual cpus are
 * mapped to which real cpus.  Basically try to optimize the degenerate
 * cases (primarily related to user processes with only one thread or
 * only one running thread), and shunt all the rest to the host cpu.
 * The host cpu will invalidate all the real cpus the vkernel is
 * running on.
 *
 * This can't optimize situations where a pmap is only mapped to some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the virtual kernel processes are running on fewer real-host cpus.
 * (And it will probably be faster anyway since there is no round-trip
 * signaling overhead.)
 *
 * NOTE: The critical section protects against preemption while the pmap
 *	 is locked, which could otherwise result in a deadlock.
 */
static __inline
void
guest_sync_addr(struct pmap *pmap,
		volatile vpte_t *dst_ptep, volatile vpte_t *src_ptep)
{
	globaldata_t gd = mycpu;
	cpumask_t oactive;
	cpumask_t nactive;

	crit_enter();
	if (pmap->pm_active == 0 &&
	    atomic_cmpset_cpumask(&pmap->pm_active, 0, CPUMASK_LOCK)) {
		/*
		 * Avoid IPIs if pmap is inactive and we can trivially
		 * lock it.
		 */
		*dst_ptep = *src_ptep;
		vmm_cpu_invltlb();
	} else if (pmap->pm_active == gd->gd_cpumask &&
	    atomic_cmpset_cpumask(&pmap->pm_active,
			    gd->gd_cpumask, gd->gd_cpumask | CPUMASK_LOCK)) {
		/*
		 * Avoid IPIs if only our cpu is using the pmap and we
		 * can trivially lock it.
		 */
		*dst_ptep = *src_ptep;
		vmm_cpu_invltlb();
	} else {
		/*
		 * Lock the pmap
		 */
		for (;;) {
			oactive = pmap->pm_active;
			cpu_ccfence();
			if ((oactive & CPUMASK_LOCK) == 0) {
				nactive = oactive | CPUMASK_LOCK;
				if (atomic_cmpset_cpumask(&pmap->pm_active,
							  oactive,
							  nactive)) {
					break;
				}
			}
			cpu_pause();
			lwkt_process_ipiq();
			pthread_yield();
		}
		vmm_guest_sync_addr(__DEVOLATILE(void *, dst_ptep),
				    __DEVOLATILE(void *, src_ptep));
	}
	atomic_clear_cpumask(&pmap->pm_active, CPUMASK_LOCK);
	crit_exit();
}
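
/*
 * Illustrative use (sketch; this is how pmap_inval_pte() below drives
 * it): to atomically replace a pte and synchronize, point src_ptep at
 * a local copy of the desired new value.
 *
 *	vpte_t new_pte = 0;
 *
 *	guest_sync_addr(pmap, ptep, &new_pte);
 */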

/*
 * Invalidate a pte in a pmap and synchronize with target cpus
 * as required.  Throw away the modified and access bits.  Use
 * pmap_clean_pte() to do the same thing but also get an interlocked
 * modified/access status.
 *
 * Clearing the field first (basically clearing VPTE_V) prevents any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu), then clear it again to clean out any race that
 * might have occurred before the invalidation completed.
 */
void
pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		*ptep = 0;
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

/*
 * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment it's the same as the non-quick version.
 */
void
pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	*ptep = 0;
	if (vmm_enabled)
		vmm_cpu_invltlb();
	else
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
}

/*
 * Invalidating page directory entries requires some additional
 * sophistication.  The cachemask must be cleared so the kernel
 * resynchronizes its temporary page table mappings cache.
 */
void
pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		*ptep = 0;
		pmap_inval_cpu(pmap, va, SEG_SIZE);
	} else if ((pmap->pm_active & mycpu->gd_other_cpus) == 0) {
		*ptep = 0;
		vmm_cpu_invltlb();
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

void
pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	pmap_inval_pde(ptep, pmap, va);
}

/*
 * These carefully handle interactions with other cpus and return
 * the original vpte.  Clearing VPTE_RW prevents us from racing the
 * setting of VPTE_M, allowing us to invalidate the tlb (the real cpu's
 * pmap) and get good status for VPTE_M.
 *
 * When messing with page directory entries we have to clear the cpu
 * mask to force a reload of the kernel's page table mapping cache.
 *
 * clean: clear VPTE_M and VPTE_RW
 * setro: clear VPTE_RW
 * load&clear: clear entire field
 */
vpte_t
pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);  /* XXX */
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			pte = *ptep;
		} else {
			guest_sync_addr(pmap, &pte, ptep);
		}
		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
	}
	return(pte);
}
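
/*
 * Illustrative caller sketch (hypothetical, not part of this file):
 * a page-cleaning path uses the returned vpte to decide whether the
 * page was dirtied before write access was revoked.  'm' would be the
 * vm_page_t backing the mapping.
 *
 *	opte = pmap_clean_pte(ptep, pmap, va);
 *	if (opte & VPTE_M)
 *		vm_page_dirty(m);
 */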

vpte_t
pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, SEG_SIZE);
			pte = *ptep;
		} else {
			guest_sync_addr(pmap, &pte, ptep);
		}
		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
	}
	return(pte);
}

/*
 * This is an odd case and I'm not sure whether it even occurs in normal
 * operation.  Turn off write access to the page, clean out the tlb
 * (the real cpu's pmap), and deal with any VPTE_M race that may have
 * occurred.  VPTE_M is not cleared.
 */
vpte_t
pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;
	vpte_t npte;

	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			pte |= *ptep & VPTE_M;
		} else {
			guest_sync_addr(pmap, &npte, ptep);
			pte |= npte & VPTE_M;
		}
	}
	return(pte);
}
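
/*
 * Illustrative caller sketch (hypothetical): write-protect a page prior
 * to laundering it while still honoring a modification that raced in.
 * 'm' would be the vm_page_t backing the mapping.
 *
 *	opte = pmap_setro_pte(ptep, pmap, va);
 *	if (opte & VPTE_M)
 *		vm_page_dirty(m);
 */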

/*
 * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the tlb (the real cpu's pmap), then incorporate any races that
 * may have occurred in the meantime, and finally zero out the pte.
 */
vpte_t
pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
			vm_offset_t va)
{
	vpte_t pte;
	vpte_t npte;

	pte = *ptep;
	if (pte & VPTE_V) {
		pte = *ptep;
		atomic_clear_long(ptep, VPTE_RW);
		if (vmm_enabled == 0) {
			pmap_inval_cpu(pmap, va, PAGE_SIZE);
			pte |= *ptep & (VPTE_A | VPTE_M);
		} else {
			guest_sync_addr(pmap, &npte, ptep);
			pte |= npte & (VPTE_A | VPTE_M);
		}
	}
	*ptep = 0;
	return(pte);
}
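
/*
 * Illustrative caller sketch (hypothetical): a pmap_remove()-style path
 * harvests the accessed/modified state while tearing down the mapping.
 * 'm' would be the vm_page_t backing the mapping.
 *
 *	opte = pmap_inval_loadandclear(ptep, pmap, va);
 *	if (opte & VPTE_M)
 *		vm_page_dirty(m);
 *	if (opte & VPTE_A)
 *		vm_page_flag_set(m, PG_REFERENCED);
 */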

/*
 * Synchronize a kvm mapping originally made for private use on some
 * other cpu so it can be used on all cpus.
 *
 * XXX add MADV_RESYNC to improve performance.
 *
 * We don't need to do anything because our pmap_inval_pte_quick()
 * synchronizes it immediately.
 */
void
pmap_kenter_sync(vm_offset_t va __unused)
{
}

void
cpu_invlpg(void *addr)
{
	if (vmm_enabled)
		vmm_cpu_invltlb(); /* forces a vmmexit/resume in VMM mode */
	else
		madvise(addr, PAGE_SIZE, MADV_INVAL);
}

void
cpu_invltlb(void)
{
	if (vmm_enabled)
		vmm_cpu_invltlb(); /* forces a vmmexit/resume in VMM mode */
	else
		madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
}

void
smp_invltlb(void)
{
	/*
	 * XXX must invalidate the tlb on all cpus.  At the moment
	 * pmap_inval_pte_quick() handles the synchronization, so
	 * do nothing here.
	 */
}