1 /*
2  * Copyright (c) 2003-2016 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
35  */
36 
37 /*
38  * pmap invalidation support code.  Certain hardware requirements must
39  * be dealt with when manipulating page table entries and page directory
40  * entries within a pmap.  In particular, we cannot safely manipulate
41  * page tables which are in active use by another cpu (even if it is
42  * running in userland) for two reasons: First, TLB writebacks will
43  * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
45  * target cpu's instruction pipeline due to Intel cpu errata.
46  *
47  * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit-setting races, particularly when we are trying to
 * clean a page and test the modified bit, to avoid the case where the
 * modified bit gets set after our poll but before we clear the field.
52  */
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/kernel.h>
56 #include <sys/proc.h>
57 #include <sys/vmmeter.h>
58 #include <sys/thread2.h>
59 #include <sys/cdefs.h>
60 #include <sys/mman.h>
61 #include <sys/vmspace.h>
62 #include <sys/vmm.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_object.h>
67 
68 #include <machine/cputypes.h>
69 #include <machine/md_var.h>
70 #include <machine/specialreg.h>
71 #include <machine/smp.h>
72 #include <machine/globaldata.h>
73 #include <machine/pmap.h>
74 #include <machine/pmap_inval.h>
75 
76 #include <unistd.h>
77 #include <pthread.h>
78 
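/*
 * Non-zero when the vkernel is running with hardware VMM support, in
 * which case invalidations are synchronized via vmm_guest_sync_addr()
 * instead of the madvise()/vmspace_mcontrol() path.
 */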
79 extern int vmm_enabled;
80 
81 /*
82  * Invalidate the TLB on the current cpu
83  *
84  * (VMM enabled only)
85  */
86 static __inline
87 void
88 vmm_cpu_invltlb(void)
89 {
90 	vmm_guest_sync_addr(NULL, NULL);
91 #if 0
92 	/* For VMM mode forces vmmexit/resume */
93 	uint64_t rax = -1;
94 	__asm __volatile("syscall;"
95 			:
96 			: "a" (rax)
97 			:);
98 #endif
99 }
100 
101 /*
 * Invalidate a range of virtual addresses in the TLB on the current cpu
103  *
104  * (VMM disabled only)
105  */
106 static __inline
107 void
108 pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
109 {
110 	if (pmap == &kernel_pmap) {
111 		madvise((void *)va, bytes, MADV_INVAL);
112 	} else {
113 		vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
114 	}
115 }
116 
117 /*
 * This is a bit of a mess because we don't know which virtual cpus are
 * mapped to which real cpus.  Basically we try to optimize the degenerate
 * cases (primarily related to user processes with only one thread or only
 * one running thread), and shunt all the rest to the host cpu.  The host
 * cpu will invalidate the TLB on all the real cpus the vkernel is running
 * on.
 *
 * This can't optimize situations where a pmap is mapped to only some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the vkernel's processes are running on fewer real-host cpus (and it
 * will probably be faster anyway, since there is no round-trip signaling
 * overhead).
129  *
130  * NOTE: The critical section protects against preemption while the pmap
131  *	 is locked, which could otherwise result in a deadlock.
132  */
133 static __inline
134 void
135 guest_sync_addr(struct pmap *pmap,
136 		volatile vpte_t *dst_ptep, volatile vpte_t *src_ptep)
137 {
138 	globaldata_t gd = mycpu;
139 	cpulock_t olock;
140 	cpulock_t nlock;
141 
142 	/*
143 	 * Lock the pmap
144 	 */
145 	crit_enter();
146 	for (;;) {
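		/*
		 * Spin until we can acquire CPULOCK_EXCL, processing any
		 * pending IPIs and yielding our virtual cpu on each failed
		 * attempt.
		 */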
147 		olock = pmap->pm_active_lock;
148 		cpu_ccfence();
149 		if ((olock & CPULOCK_EXCL) == 0) {
150 			nlock = olock | CPULOCK_EXCL;
151 			if (atomic_cmpset_int(&pmap->pm_active_lock,
152 					      olock, nlock)) {
153 				break;
154 			}
155 		}
156 		cpu_pause();
157 		lwkt_process_ipiq();
158 		pthread_yield();
159 	}
160 
161 	/*
162 	 * Update the pte and synchronize with other cpus.  If we can update
163 	 * it trivially, do so.
164 	 */
165 	if (CPUMASK_TESTZERO(pmap->pm_active) ||
166 	    CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
167 		if (dst_ptep && src_ptep)
168 			*dst_ptep = *src_ptep;
169 		vmm_cpu_invltlb();
170 	} else {
171 		vmm_guest_sync_addr(__DEVOLATILE(void *, dst_ptep),
172 				    __DEVOLATILE(void *, src_ptep));
173 	}
174 
175 	/*
176 	 * Unlock the pmap
177 	 */
178 	atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
179 	crit_exit();
180 }
181 
182 /*
183  * Invalidate a pte in a pmap and synchronize with target cpus
184  * as required.  Throw away the modified and access bits.  Use
185  * pmap_clean_pte() to do the same thing but also get an interlocked
186  * modified/access status.
187  *
 * We clear the field first (basically clearing VPTE_V) to prevent any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu), then clear it again to clean out any race that
 * might have occurred before the invalidation completed.
192  */
193 void
194 pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
195 {
196 	vpte_t pte;
197 
198 	if (vmm_enabled == 0) {
199 		atomic_swap_long(ptep, 0);
200 		pmap_inval_cpu(pmap, va, PAGE_SIZE);
201 	} else {
202 		pte = 0;
203 		guest_sync_addr(pmap, ptep, &pte);
204 	}
205 }
206 
207 /*
208  * Invalidate the tlb for a range of virtual addresses across all cpus
209  * belonging to the pmap.
210  */
211 void
212 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
213 {
214 	if (vmm_enabled == 0) {
215 		pmap_inval_cpu(pmap, sva, eva - sva);
216 	} else {
217 		guest_sync_addr(pmap, NULL, NULL);
218 	}
219 }
220 
221 /*
222  * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment it's the same as the non-quick version.
224  */
225 void
226 pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
227 {
228 	atomic_swap_long(ptep, 0);
229 	if (vmm_enabled)
230 		vmm_cpu_invltlb();
231 	else
232 		pmap_inval_cpu(pmap, va, PAGE_SIZE);
233 }
234 
235 /*
236  * Invalidating page directory entries requires some additional
237  * sophistication.  The cachemask must be cleared so the kernel
 * resynchronizes its temporary page table mapping cache.
239  */
240 void
241 pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
242 {
243 	vpte_t pte;
244 
245 	if (vmm_enabled == 0) {
246 		*ptep = 0;
247 		pmap_inval_cpu(pmap, va, SEG_SIZE);
248 	} else if (CPUMASK_TESTMASK(pmap->pm_active,
249 				    mycpu->gd_other_cpus) == 0) {
250 		*ptep = 0;
251 		vmm_cpu_invltlb();
252 	} else {
253 		pte = 0;
254 		guest_sync_addr(pmap, ptep, &pte);
255 	}
256 }
257 
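/*
 * Quick version of pmap_inval_pde(); for the moment it simply calls
 * the non-quick version.
 */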
258 void
259 pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
260 {
261 	pmap_inval_pde(ptep, pmap, va);
262 }
263 
264 /*
 * These functions carefully handle interactions with other cpus and return
266  * the original vpte.  Clearing VPTE_RW prevents us from racing the
267  * setting of VPTE_M, allowing us to invalidate the tlb (the real cpu's
268  * pmap) and get good status for VPTE_M.
269  *
270  * When messing with page directory entries we have to clear the cpu
271  * mask to force a reload of the kernel's page table mapping cache.
272  *
273  * clean: clear VPTE_M and VPTE_RW
274  * setro: clear VPTE_RW
275  * load&clear: clear entire field
276  */
279 vpte_t
280 pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
281 {
282 	vpte_t pte;
283 
284 	pte = *ptep;
285 	if (pte & VPTE_V) {
286 		atomic_clear_long(ptep, VPTE_RW);
287 		if (vmm_enabled == 0) {
288 			pmap_inval_cpu(pmap, va, PAGE_SIZE);
289 			pte = *ptep;
290 		} else {
291 			guest_sync_addr(pmap, &pte, ptep);
292 		}
293 		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
294 	}
295 	return(pte);
296 }
297 
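/*
 * Same as pmap_clean_pte() but operates on a page directory entry and
 * invalidates a segment's worth of address space (SEG_SIZE) rather than
 * a single page.
 */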
298 vpte_t
299 pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
300 {
301 	vpte_t pte;
302 
303 	pte = *ptep;
304 	if (pte & VPTE_V) {
305 		atomic_clear_long(ptep, VPTE_RW);
306 		if (vmm_enabled == 0) {
307 			pmap_inval_cpu(pmap, va, SEG_SIZE);
308 			pte = *ptep;
309 		} else {
310 			guest_sync_addr(pmap, &pte, ptep);
311 		}
312 		atomic_clear_long(ptep, VPTE_RW|VPTE_M);
313 	}
314 	return(pte);
315 }
316 
317 /*
318  * This is an odd case and I'm not sure whether it even occurs in normal
319  * operation.  Turn off write access to the page, clean out the tlb
320  * (the real cpu's pmap), and deal with any VPTE_M race that may have
 * occurred.  VPTE_M is not cleared.
322  */
323 vpte_t
324 pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
325 {
326 	vpte_t pte;
327 	vpte_t npte;
328 
329 	pte = *ptep;
330 	if (pte & VPTE_V) {
331 		atomic_clear_long(ptep, VPTE_RW);
332 		if (vmm_enabled == 0) {
333 			pmap_inval_cpu(pmap, va, PAGE_SIZE);
334 			pte |= *ptep & VPTE_M;
335 		} else {
336 			guest_sync_addr(pmap, &npte, ptep);
337 			pte |= npte & VPTE_M;
338 		}
339 	}
340 	return(pte);
341 }
342 
343 /*
344  * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the tlb (the real cpu's pmap), then incorporate any races that
 * may have occurred in the meantime, and finally zero out the pte.
348  */
349 vpte_t
350 pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
351 			vm_offset_t va)
352 {
353 	vpte_t pte;
354 	vpte_t npte;
355 
356 	pte = *ptep;
357 	if (pte & VPTE_V) {
358 		pte = *ptep;
359 		atomic_clear_long(ptep, VPTE_RW);
360 		if (vmm_enabled == 0) {
361 			pmap_inval_cpu(pmap, va, PAGE_SIZE);
362 			pte = (pte & VPTE_RW) | *ptep;
363 		} else {
364 			guest_sync_addr(pmap, &npte, ptep);
365 			pte = (pte & VPTE_RW) | npte;
366 		}
367 	}
368 	atomic_swap_long(ptep, 0);
369 
370 	return(pte);
371 }
372 
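/*
 * Invalidate the TLB for a single page on the current cpu.  In VMM
 * mode this falls back to a full TLB invalidation via a forced
 * vmmexit/resume.
 */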
373 void
374 cpu_invlpg(void *addr)
375 {
376 	if (vmm_enabled)
		vmm_cpu_invltlb(); /* forces a vmmexit/resume in VMM mode */
378 	else
379 		madvise(addr, PAGE_SIZE, MADV_INVAL);
380 }
381 
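/*
 * Invalidate the entire TLB on the current cpu.  In the non-VMM case
 * this invalidates the host mappings for the whole kernel virtual
 * address range.
 */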
382 void
383 cpu_invltlb(void)
384 {
385 	if (vmm_enabled)
		vmm_cpu_invltlb(); /* forces a vmmexit/resume in VMM mode */
387 	else
388 		madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
389 }
390 
391 void
392 smp_invltlb(void)
393 {
	/*
	 * XXX must invalidate the tlb on all cpus; at the moment this
	 * is handled via pmap_inval_pte_quick().  Do nothing here.
	 */
397 }
398 
399 void
400 smp_sniff(void)
401 {
402 	/* not implemented */
403 }
404