1 /*
2  * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
35  */
36 
37 /*
38  * pmap invalidation support code.  Certain hardware requirements must
39  * be dealt with when manipulating page table entries and page directory
40  * entries within a pmap.  In particular, we cannot safely manipulate
41  * page tables which are in active use by another cpu (even if it is
42  * running in userland) for two reasons: First, TLB writebacks will
43  * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
45  * target cpu's instruction pipeline due to Intel cpu errata.
46  *
47  * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit-setting races, particularly when we are trying to clean
 * a page and test the modified bit: the modified bit might get set after
 * our poll but before we clear the field.
52  */
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/kernel.h>
56 #include <sys/proc.h>
57 #include <sys/vmmeter.h>
58 #include <sys/thread2.h>
59 #include <sys/cdefs.h>
60 #include <sys/mman.h>
61 #include <sys/vmspace.h>
62 #include <sys/vmm.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_object.h>
67 
68 #include <machine/cputypes.h>
69 #include <machine/md_var.h>
70 #include <machine/specialreg.h>
71 #include <machine/smp.h>
72 #include <machine/globaldata.h>
73 #include <machine/pmap.h>
74 #include <machine/pmap_inval.h>
75 
76 #include <unistd.h>
77 #include <pthread.h>
78 
79 extern int vmm_enabled;
80 
81 static __inline
82 void
83 vmm_cpu_invltlb(void)
84 {
	/* In VMM mode a syscall with rax = -1 forces a vmexit/resume */
86 	uint64_t rax = -1;
87 	__asm __volatile("syscall;"
88 			:
89 			: "a" (rax)
90 			:);
91 }
92 
93 /*
94  * Invalidate va in the TLB on the current cpu
95  */
96 static __inline
97 void
98 pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
99 {
100 	if (pmap == &kernel_pmap) {
101 		madvise((void *)va, bytes, MADV_INVAL);
102 	} else {
103 		vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
104 	}
105 }
106 
107 /*
108  * This is a bit of a mess because we don't know what virtual cpus are
109  * mapped to real cpus.  Basically try to optimize the degenerate cases
110  * (primarily related to user processes with only one thread or only one
111  * running thread), and shunt all the rest to the host cpu.  The host cpu
 * will invalidate all real cpus the vkernel is running on.
113  *
114  * This can't optimize situations where a pmap is only mapped to some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the virtual kernel processes are running on fewer real-host cpus
 * (and will probably be faster anyway, since there is no round-trip
 * signaling overhead).
119  *
120  * NOTE: The critical section protects against preemption while the pmap
121  *	 is locked, which could otherwise result in a deadlock.
122  */
123 static __inline
124 void
125 guest_sync_addr(struct pmap *pmap,
126 		volatile vpte_t *dst_ptep, volatile vpte_t *src_ptep)
127 {
128 	globaldata_t gd = mycpu;
129 	cpulock_t olock;
130 	cpulock_t nlock;
131 
132 	/*
133 	 * Lock the pmap
134 	 */
135 	crit_enter();
136 	for (;;) {
137 		olock = pmap->pm_active_lock;
138 		cpu_ccfence();
139 		if ((olock & CPULOCK_EXCL) == 0) {
140 			nlock = olock | CPULOCK_EXCL;
141 			if (atomic_cmpset_int(&pmap->pm_active_lock,
142 					      olock, nlock)) {
143 				break;
144 			}
145 		}
146 		cpu_pause();
147 		lwkt_process_ipiq();
148 		pthread_yield();
149 	}
150 
151 	/*
152 	 * Update the pte and synchronize with other cpus.  If we can update
153 	 * it trivially, do so.
154 	 */
155 	if (CPUMASK_TESTZERO(pmap->pm_active) ||
156 	    CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
157 		*dst_ptep = *src_ptep;
158 		vmm_cpu_invltlb();
159 	} else {
160 		vmm_guest_sync_addr(__DEVOLATILE(void *, dst_ptep),
161 				    __DEVOLATILE(void *, src_ptep));
162 	}
163 
164 	/*
165 	 * Unlock the pmap
166 	 */
167 	atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
168 	crit_exit();
169 }
170 
171 /*
172  * Invalidate a pte in a pmap and synchronize with target cpus
173  * as required.  Throw away the modified and access bits.  Use
174  * pmap_clean_pte() to do the same thing but also get an interlocked
175  * modified/access status.
176  *
177  * Clearing the field first (basically clearing VPTE_V) prevents any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu), then clear it again to clean out any race that
 * might have occurred before the invalidation completed.
181  */
182 void
183 pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
184 {
185 	vpte_t pte;
186 
187 	if (vmm_enabled == 0) {
188 		*ptep = 0;
189 		pmap_inval_cpu(pmap, va, PAGE_SIZE);
190 	} else {
191 		pte = 0;
192 		guest_sync_addr(pmap, ptep, &pte);
193 	}
194 }
195 
196 /*
197  * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment it is the same as the non-quick version.
199  */
200 void
201 pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
202 {
203 	*ptep = 0;
204 	if (vmm_enabled)
205 		vmm_cpu_invltlb();
206 	else
207 		pmap_inval_cpu(pmap, va, PAGE_SIZE);
208 }
209 
210 /*
211  * Invalidating page directory entries requires some additional
212  * sophistication.  The cachemask must be cleared so the kernel
213  * resynchronizes its temporary page table mappings cache.
214  */
215 void
216 pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
217 {
218 	vpte_t pte;
219 
220 	if (vmm_enabled == 0) {
221 		*ptep = 0;
222 		pmap_inval_cpu(pmap, va, SEG_SIZE);
223 	} else if (CPUMASK_TESTMASK(pmap->pm_active,
224 				    mycpu->gd_other_cpus) == 0) {
225 		*ptep = 0;
226 		vmm_cpu_invltlb();
227 	} else {
228 		pte = 0;
229 		guest_sync_addr(pmap, ptep, &pte);
230 	}
231 }
232 
233 void
234 pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
235 {
236 	pmap_inval_pde(ptep, pmap, va);
237 }
238 
239 /*
 * These functions carefully handle interactions with other cpus and return
241  * the original vpte.  Clearing VPTE_RW prevents us from racing the
242  * setting of VPTE_M, allowing us to invalidate the tlb (the real cpu's
243  * pmap) and get good status for VPTE_M.
244  *
245  * When messing with page directory entries we have to clear the cpu
246  * mask to force a reload of the kernel's page table mapping cache.
247  *
248  * clean: clear VPTE_M and VPTE_RW
249  * setro: clear VPTE_RW
250  * load&clear: clear entire field
251  */
253 vpte_t
254 pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
255 {
256 	vpte_t pte;
257 
258 	pte = *ptep;
259 	if (pte & VPTE_V) {
260 		atomic_clear_long(ptep, VPTE_RW);  /* XXX */
261 		if (vmm_enabled == 0) {
262 			pmap_inval_cpu(pmap, va, PAGE_SIZE);
263 			pte = *ptep;
264 		} else {
265 			guest_sync_addr(pmap, &pte, ptep);
266 		}
267 		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
268 	}
269 	return(pte);
270 }
271 
272 vpte_t
273 pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
274 {
275 	vpte_t pte;
276 
277 	pte = *ptep;
278 	if (pte & VPTE_V) {
279 		atomic_clear_long(ptep, VPTE_RW);
280 		if (vmm_enabled == 0) {
281 			pmap_inval_cpu(pmap, va, SEG_SIZE);
282 			pte = *ptep;
283 		} else {
284 			guest_sync_addr(pmap, &pte, ptep);
285 		}
286 		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
287 	}
288 	return(pte);
289 }
290 
291 /*
292  * This is an odd case and I'm not sure whether it even occurs in normal
293  * operation.  Turn off write access to the page, clean out the tlb
294  * (the real cpu's pmap), and deal with any VPTE_M race that may have
 * occurred.  VPTE_M is not cleared.
296  */
297 vpte_t
298 pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
299 {
300 	vpte_t pte;
301 	vpte_t npte;
302 
303 	pte = *ptep;
304 	if (pte & VPTE_V) {
305 		atomic_clear_long(ptep, VPTE_RW);
306 		if (vmm_enabled == 0) {
307 			pmap_inval_cpu(pmap, va, PAGE_SIZE);
308 			pte |= *ptep & VPTE_M;
309 		} else {
310 			guest_sync_addr(pmap, &npte, ptep);
311 			pte |= npte & VPTE_M;
312 		}
313 	}
314 	return(pte);
315 }
316 
317 /*
318  * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the tlb (the real cpu's pmap), then incorporate any races that
 * may have occurred in the meantime, and finally zero out the pte.
322  */
323 vpte_t
324 pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
325 			vm_offset_t va)
326 {
327 	vpte_t pte;
328 	vpte_t npte;
329 
	pte = *ptep;
	if (pte & VPTE_V) {
		atomic_clear_long(ptep, VPTE_RW);
334 		if (vmm_enabled == 0) {
335 			pmap_inval_cpu(pmap, va, PAGE_SIZE);
336 			pte |= *ptep & (VPTE_A | VPTE_M);
337 		} else {
338 			guest_sync_addr(pmap, &npte, ptep);
339 			pte |= npte & (VPTE_A | VPTE_M);
340 		}
341 	}
342 	*ptep = 0;
343 	return(pte);
344 }
345 
346 /*
 * Synchronize a kvm mapping originally made for the private use of
 * some other cpu so it can be used on all cpus.
349  *
350  * XXX add MADV_RESYNC to improve performance.
351  *
352  * We don't need to do anything because our pmap_inval_pte_quick()
353  * synchronizes it immediately.
354  */
355 void
356 pmap_kenter_sync(vm_offset_t va __unused)
357 {
358 }
359 
360 void
361 cpu_invlpg(void *addr)
362 {
363 	if (vmm_enabled)
		vmm_cpu_invltlb(); /* forces a vmexit/resume in VMM mode */
365 	else
366 		madvise(addr, PAGE_SIZE, MADV_INVAL);
367 }
368 
369 void
370 cpu_invltlb(void)
371 {
372 	if (vmm_enabled)
		vmm_cpu_invltlb(); /* forces a vmexit/resume in VMM mode */
374 	else
375 		madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
376 }
377 
378 void
379 smp_invltlb(void)
380 {
	/*
	 * XXX must invalidate the tlb on all cpus.  At the moment
	 * (see pmap_inval_pte_quick()) we do nothing.
	 */
384 }
385