/*
 * Copyright (c) 2003-2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 *
 * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit setting races particularly when we are trying to clean
 * a page and test the modified bit to avoid races where the modified bit
 * might get set after our poll but before we clear the field.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
#include <sys/vmspace.h>
#include <sys/vmm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <unistd.h>

#include <vm/vm_page2.h>

extern int vmm_enabled;

/*
 * Invalidate the TLB on the current cpu
 *
 * (VMM enabled only)
 */
static __inline
void
vmm_cpu_invltlb(void)
{
#if 0
	/* not directly supported */
	cpu_invltlb();
#else
	/* vmm_guest_sync_addr(NULL, NULL); */
	/*
	 * In VMM mode a syscall forces a vmexit/resume cycle, which
	 * invalidates the TLB.  rax = -1 is not a valid system call
	 * number.
	 */
	uint64_t rax = -1;
	__asm __volatile("syscall"
			: "+a" (rax)
			:
			: "rcx", "r11", "memory");
#endif
}

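/*
 * Invalidate a single page in the TLB on the current cpu.  The address
 * is ignored; the entire TLB is invalidated.
 *
 * (VMM enabled only)
 */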
static __inline
void
vmm_cpu_invlpg(void *addr __unused)
{
	vmm_cpu_invltlb();
}

/*
 * Invalidate the TLB for a range of virtual addresses [va, va + bytes)
 * on the current cpu
 *
 * (VMM disabled only)
 */
static __inline
void
pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
{
	if (pmap == kernel_pmap) {
		madvise((void *)va, bytes, MADV_INVAL);
	} else {
		vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
	}
}

/*
 * This is a bit of a mess because we don't know what virtual cpus are
 * mapped to real cpus.  Basically try to optimize the degenerate cases
 * (primarily related to user processes with only one thread or only one
 * running thread), and shunt all the rest to the host cpu.  The host cpu
 * will invalidate all real cpus the vkernel is running on.
 *
 * This can't optimize situations where a pmap is only mapped to some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the virtual kernel processes are running on fewer real-host cpus.
 * (And it will probably be faster anyway since there is no round-trip
 * signaling overhead.)
 *
 * NOTE: The critical section protects against preemption while the pmap
 *	 is locked, which could otherwise result in a deadlock.
 */
static __inline
void
guest_sync_addr(struct pmap *pmap, volatile vpte_t *ptep, vpte_t *srcv)
{
	globaldata_t gd = mycpu;
	cpulock_t olock;
	cpulock_t nlock;

	/*
	 * Lock the pmap
	 */
	crit_enter();
	for (;;) {
		olock = pmap->pm_active_lock;
		cpu_ccfence();
		if ((olock & CPULOCK_EXCL) == 0) {
			nlock = olock | CPULOCK_EXCL;
			if (atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
		}
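		/*
		 * Lock is busy.  Pause briefly, process any pending
		 * IPIs, and yield to the real host so the lock holder
		 * can make progress, then retry.
		 */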
		cpu_pause();
		lwkt_process_ipiq();
		vkernel_yield();
	}

	/*
	 * Update the pte and synchronize with other cpus.  If the pmap
	 * is not active on any cpu, or is active only on our own cpu,
	 * swap the pte locally and invalidate just our own TLB.
	 * Otherwise have the real host perform the update and
	 * synchronize all the real cpus.
	 */
	if (CPUMASK_TESTZERO(pmap->pm_active) ||
	    CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		if (ptep)
			*srcv = atomic_swap_long(ptep, *srcv);
		vmm_cpu_invltlb();
	} else {
		vmm_guest_sync_addr(__DEVOLATILE(void *, ptep), srcv);
	}

	/*
	 * Unlock the pmap
	 */
	atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	crit_exit();
}

/*
 * Invalidate a pte in a pmap and synchronize with target cpus
 * as required.  Throw away the modified and access bits.  Use
 * pmap_clean_pte() to do the same thing but also get an interlocked
 * modified/access status.
 *
 * Clearing the field first (basically clearing VPTE_V) prevents any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu), then clearing it again afterwards cleans out any
 * race that might have occurred before the invalidation completed.
 */
void
pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		atomic_swap_long(ptep, 0);
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

/*
 * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment it's the same as the non-quick version.
 */
void
pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	atomic_swap_long(ptep, 0);
	if (vmm_enabled == 0)
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
	else
		vmm_cpu_invltlb();
}

/*
 * Invalidate the TLB for a range of virtual addresses across all cpus
 * belonging to the pmap.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	if (vmm_enabled == 0) {
		pmap_inval_cpu(pmap, sva, eva - sva);
	} else {
		guest_sync_addr(pmap, NULL, NULL);
	}
}

/*
 * Invalidating page directory entries requires some additional
 * sophistication.  The cachemask must be cleared so the kernel
 * resynchronizes its temporary page table mappings cache.
 */
void
pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		atomic_swap_long(ptep, 0);
		pmap_inval_cpu(pmap, va, SEG_SIZE);
	} else if (CPUMASK_TESTMASK(pmap->pm_active,
				    mycpu->gd_other_cpus) == 0) {
		atomic_swap_long(ptep, 0);
		vmm_cpu_invltlb();
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
}

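/*
 * Same as pmap_inval_pde() but only synchronize with the current
 * cpu.  For the moment it simply calls the non-quick version.
 */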
void
pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
	pmap_inval_pde(ptep, pmap, va);
}

/*
 * This is really nasty.
 *
 * (1) The vkernel interlocks pte operations with the related vm_page_t
 *     spin-lock (and doesn't handle unmanaged page races).
 *
 * (2) The vkernel must also issue an invalidation to the real cpu.  It
 *     (nastily) does this while holding the spin-lock too.
 *
 * In addition, atomic ops must be used to properly interlock against
 * other cpus and the real kernel (which could be taking a fault on another
 * cpu and will adjust VPTE_M and VPTE_A appropriately).
 *
 * The atomic ops do a good job of interlocking against other cpus, but
 * we still need to lock the pte location (which we use the vm_page spin-lock
 * for) to avoid races against PG_WRITEABLE and other tests.
 *
 * Cleaning the pte involves clearing VPTE_M and VPTE_RW, synchronizing with
 * the real host, and updating the vm_page appropriately.
 *
 * If the caller passes a non-NULL (m), the caller holds the spin-lock,
 * otherwise we must acquire and release the spin-lock.  (m) is only
 * applicable to managed pages.
 */
vpte_t
pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va,
	       vm_page_t m)
{
	vpte_t pte;
	int spin = 0;

	/*
	 * If the caller did not supply (m), look up the vm_page backing
	 * the pte and spin-lock it.  The pte is re-checked after the
	 * spin-lock is acquired and we retry if it changed underneath us.
	 * Invalid ptes are returned as-is and unmanaged ptes are handled
	 * without a page spin-lock.
	 */
	while (m == NULL) {
		pte = *ptep;
		if ((pte & VPTE_V) == 0)
			return pte;
		if ((pte & VPTE_MANAGED) == 0)
			break;
		m = PHYS_TO_VM_PAGE(pte & VPTE_FRAME);
		vm_page_spin_lock(m);

		pte = *ptep;
		if ((pte & VPTE_V) == 0) {
			vm_page_spin_unlock(m);
			m = NULL;
			continue;
		}
		if ((pte & VPTE_MANAGED) == 0) {
			vm_page_spin_unlock(m);
			m = NULL;
			continue;
		}
		if (m != PHYS_TO_VM_PAGE(pte & VPTE_FRAME)) {
			vm_page_spin_unlock(m);
			m = NULL;
			continue;
		}
		spin = 1;
		break;
	}

	if (vmm_enabled == 0) {
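		/*
		 * Atomically clear VPTE_RW and VPTE_M, then invalidate
		 * the real cpu's TLB.  If the pte is already read-only
		 * there is nothing to clean.
		 */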
		for (;;) {
			pte = *ptep;
			cpu_ccfence();
			if ((pte & VPTE_RW) == 0)
				break;
			if (atomic_cmpset_long(ptep,
					       pte,
					       pte & ~(VPTE_RW | VPTE_M))) {
				pmap_inval_cpu(pmap, va, PAGE_SIZE);
				break;
			}
		}
	} else {
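		/*
		 * VMM mode: clear VPTE_RW and VPTE_M via guest_sync_addr(),
		 * which synchronizes with any other cpus the pmap is active
		 * on and leaves the prior pte contents in pte for the
		 * checks below.
		 */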
		pte = *ptep & ~(VPTE_RW | VPTE_M);
		guest_sync_addr(pmap, ptep, &pte);
	}

	if (m) {
		if (pte & VPTE_A) {
			vm_page_flag_set(m, PG_REFERENCED);
			atomic_clear_long(ptep, VPTE_A);
		}
		if (pte & VPTE_M) {
			vm_page_dirty(m);
		}
		if (spin)
			vm_page_spin_unlock(m);
	}
	return pte;
}

/*
 * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the TLB (the real cpu's pmap), then incorporate any races that
 * may have occurred in the meantime, and finally zero out the pte.
 */
vpte_t
pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
			vm_offset_t va)
{
	vpte_t pte;

	if (vmm_enabled == 0) {
		pte = atomic_swap_long(ptep, 0);
		pmap_inval_cpu(pmap, va, PAGE_SIZE);
	} else {
		pte = 0;
		guest_sync_addr(pmap, ptep, &pte);
	}
	return(pte);
}

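/*
 * Invalidate a single page in the TLB on the current cpu.  When VMM is
 * disabled, the real kernel is notified via madvise(MADV_INVAL).
 */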
void
cpu_invlpg(void *addr)
{
	if (vmm_enabled)
		vmm_cpu_invlpg(addr);
	else
		madvise(addr, PAGE_SIZE, MADV_INVAL);
}

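/*
 * Invalidate the entire TLB on the current cpu.  When VMM is disabled,
 * the whole kernel virtual address range is invalidated via
 * madvise(MADV_INVAL).
 */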
void
cpu_invltlb(void)
{
	if (vmm_enabled)
		vmm_cpu_invltlb(); /* forces a vmexit/resume in VMM mode */
	else
		madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
}

/*
 * Invalidate the TLB on all cpus.  This is a no-op in the vkernel;
 * instead, the vkernel ignores VM_PROT_NOSYNC on pmap_enter() calls.
 */
void
smp_invltlb(void)
{
	/* do nothing */
}

void
smp_sniff(void)
{
	/* not implemented */
}

void
cpu_sniff(int dcpu __unused)
{
	/* not implemented */
}