xref: /dragonfly/sys/vm/vm_vmspace.c (revision 3a48e5e1)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 #include <sys/param.h>
38 #include <sys/kernel.h>
39 #include <sys/systm.h>
40 #include <sys/sysmsg.h>
41 #include <sys/kern_syscall.h>
42 #include <sys/mman.h>
43 #include <sys/thread.h>
44 #include <sys/proc.h>
45 #include <sys/malloc.h>
46 #include <sys/sysctl.h>
47 #include <sys/vkernel.h>
48 #include <sys/vmspace.h>
49 
50 #include <vm/vm_extern.h>
51 #include <vm/pmap.h>
52 
53 #include <machine/vmparam.h>
54 #include <machine/vmm.h>
55 
56 static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
57 						  void *id, int havetoken);
58 static int vmspace_entry_delete(struct vmspace_entry *ve,
59 				 struct vkernel_proc *vkp, int refs);
60 static void vmspace_entry_cache_ref(struct vmspace_entry *ve);
61 static void vmspace_entry_cache_drop(struct vmspace_entry *ve);
62 static void vmspace_entry_drop(struct vmspace_entry *ve);
63 
64 static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");
65 
66 /*
67  * vmspace_create (void *id, int type, void *data)
68  *
69  * Create a VMSPACE under the control of the caller with the specified id.
70  * An id of NULL cannot be used.  The type and data fields must currently
71  * be 0.
72  *
73  * The vmspace starts out completely empty.  Memory may be mapped into the
74  * VMSPACE with vmspace_mmap() and MAP_VPAGETABLE section(s) controlled
75  * with vmspace_mcontrol().
76  *
77  * No requirements.
78  */
79 int
80 sys_vmspace_create(struct sysmsg *sysmsg,
81 		   const struct vmspace_create_args *uap)
82 {
83 	struct vmspace_entry *ve;
84 	struct vkernel_proc *vkp;
85 	struct proc *p = curproc;
86 	int error;
87 
88 	if (vkernel_enable == 0)
89 		return (EOPNOTSUPP);
90 
91 	/*
92 	 * Create a virtual kernel side-structure for the process if one
93 	 * does not exist.
94 	 *
95 	 * Implement a simple resolution for SMP races.
96 	 */
97 	if ((vkp = p->p_vkernel) == NULL) {
98 		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
99 		lwkt_gettoken(&p->p_token);
100 		if (p->p_vkernel == NULL) {
101 			vkp->refs = 1;
102 			lwkt_token_init(&vkp->token, "vkernel");
103 			RB_INIT(&vkp->root);
104 			p->p_vkernel = vkp;
105 		} else {
106 			kfree(vkp, M_VKERNEL);
107 			vkp = p->p_vkernel;
108 		}
109 		lwkt_reltoken(&p->p_token);
110 	}
111 
112 	if (curthread->td_vmm)
113 		return 0;
114 
115 	/*
116 	 * Create a new VMSPACE, disallow conflicting ids
117 	 */
118 	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
119 	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
120 	ve->id = uap->id;
121 	ve->refs = 0;		/* active refs (none) */
122 	ve->cache_refs = 1;	/* on-tree, not deleted (prevent kfree) */
123 	pmap_pinit2(vmspace_pmap(ve->vmspace));
124 
125 	lwkt_gettoken(&vkp->token);
126 	if (RB_INSERT(vmspace_rb_tree, &vkp->root, ve)) {
127 		vmspace_rel(ve->vmspace);
128 		ve->vmspace = NULL; /* safety */
129 		kfree(ve, M_VKERNEL);
130 		error = EEXIST;
131 	} else {
132 		error = 0;
133 	}
134 	lwkt_reltoken(&vkp->token);
135 
136 	return (error);
137 }
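
/*
 * A minimal caller-side sketch, assuming the userland wrappers declared in
 * <sys/vmspace.h> follow the argument order documented above and return -1
 * with errno set on failure.  Any unique pointer value can serve as the id:
 *
 *	#include <err.h>
 *	#include <sys/vmspace.h>
 *
 *	static char vmspace_id;
 *
 *	static void
 *	guest_space_init(void)
 *	{
 *		if (vmspace_create(&vmspace_id, 0, NULL) < 0)
 *			err(1, "vmspace_create");
 *	}
 *
 *	static void
 *	guest_space_done(void)
 *	{
 *		if (vmspace_destroy(&vmspace_id) < 0)
 *			err(1, "vmspace_destroy");
 *	}
 */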
138 
139 /*
140  * Destroy a VMSPACE given its identifier.
141  *
142  * No requirements.
143  */
144 int
145 sys_vmspace_destroy(struct sysmsg *sysmsg,
146 		    const struct vmspace_destroy_args *uap)
147 {
148 	struct vkernel_proc *vkp;
149 	struct vmspace_entry *ve;
150 	int error;
151 
152 	if ((vkp = curproc->p_vkernel) == NULL)
153 		return EINVAL;
154 
155 	/*
156 	 * vkp->token protects the deletion against a new RB tree search.
157 	 */
158 	lwkt_gettoken(&vkp->token);
159 	error = ENOENT;
160 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 1)) != NULL) {
161 		error = vmspace_entry_delete(ve, vkp, 1);
162 		if (error == 0)
163 			vmspace_entry_cache_drop(ve);
164 	}
165 	lwkt_reltoken(&vkp->token);
166 
167 	return(error);
168 }
169 
170 /*
171  * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
172  *		struct vextframe *vframe);
173  *
174  * Transfer control to a VMSPACE.  Control is returned after the specified
175  * number of microseconds or if a page fault, signal, trap, or system call
176  * occurs.  The context is updated as appropriate.
177  *
178  * No requirements.
179  */
180 int
181 sys_vmspace_ctl(struct sysmsg *sysmsg,
182 		const struct vmspace_ctl_args *uap)
183 {
184 	struct vkernel_proc *vkp;
185 	struct vkernel_lwp *vklp;
186 	struct vmspace_entry *ve = NULL;
187 	struct lwp *lp;
188 	struct proc *p;
189 	int framesz;
190 	int error;
191 
192 	lp = curthread->td_lwp;
193 	p = lp->lwp_proc;
194 
195 	if ((vkp = p->p_vkernel) == NULL)
196 		return (EINVAL);
197 
198 	/*
199 	 * ve only matters when VMM is not used.
200 	 */
201 	if (curthread->td_vmm == NULL) {
202 		if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
203 			error = ENOENT;
204 			goto done;
205 		}
206 	}
207 
208 	switch(uap->cmd) {
209 	case VMSPACE_CTL_RUN:
210 		/*
211 		 * Save the caller's register context, swap VM spaces, and
212 		 * install the passed register context.  Return with
213 		 * EJUSTRETURN so the syscall code doesn't adjust the context.
214 		 */
215 		framesz = sizeof(struct trapframe);
216 		if ((vklp = lp->lwp_vkernel) == NULL) {
217 			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
218 				       M_WAITOK|M_ZERO);
219 			lp->lwp_vkernel = vklp;
220 		}
221 		if (ve && vklp->ve_cache != ve) {
222 			vmspace_entry_cache_ref(ve);
223 			if (vklp->ve_cache)
224 				vmspace_entry_cache_drop(vklp->ve_cache);
225 			vklp->ve_cache = ve;
226 		}
227 		vklp->user_trapframe = uap->tframe;
228 		vklp->user_vextframe = uap->vframe;
229 		bcopy(sysmsg->sysmsg_frame, &vklp->save_trapframe, framesz);
230 		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
231 		      sizeof(vklp->save_vextframe.vx_tls));
232 		error = copyin(uap->tframe, sysmsg->sysmsg_frame, framesz);
233 		if (error == 0) {
234 			error = copyin(&uap->vframe->vx_tls,
235 				       &curthread->td_tls,
236 				       sizeof(struct savetls));
237 		}
238 		if (error == 0)
239 			error = cpu_sanitize_frame(sysmsg->sysmsg_frame);
240 		if (error == 0)
241 			error = cpu_sanitize_tls(&curthread->td_tls);
242 		if (error) {
243 			bcopy(&vklp->save_trapframe, sysmsg->sysmsg_frame,
244 			      framesz);
245 			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
246 			      sizeof(vklp->save_vextframe.vx_tls));
247 			set_user_TLS();
248 		} else {
249 			/*
250 			 * If it's a VMM thread just set the CR3.  We also set
251 			 * vklp->ve to a non-NULL key so we can distinguish
252 			 * when a vkernel user process is running from when it
253 			 * is not (vklp->ve is NULL in that case).
254 			 */
255 			if (curthread->td_vmm == NULL) {
256 				vklp->ve = ve;
257 				atomic_add_int(&ve->refs, 1);
258 				pmap_setlwpvm(lp, ve->vmspace);
259 			} else {
260 				vklp->ve = uap->id;
261 				vmm_vm_set_guest_cr3((register_t)uap->id);
262 			}
263 			set_user_TLS();
264 			set_vkernel_fp(sysmsg->sysmsg_frame);
265 			error = EJUSTRETURN;
266 		}
267 		break;
268 	default:
269 		error = EOPNOTSUPP;
270 		break;
271 	}
272 done:
273 	if (ve)
274 		vmspace_entry_drop(ve);
275 
276 	return(error);
277 }
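
/*
 * A sketch of the caller-side VMSPACE_CTL_RUN loop, assuming the wrapper
 * signature documented above; handle_guest_exit() is a hypothetical
 * dispatcher supplied by the vkernel.  The call returns, with the guest's
 * register state copied back into tf, whenever the guest faults, traps,
 * takes a signal, or issues a system call:
 *
 *	struct trapframe tf;		initialized to the guest entry state
 *	struct vextframe vf;		initial guest TLS state
 *
 *	for (;;) {
 *		vmspace_ctl(&vmspace_id, VMSPACE_CTL_RUN, &tf, &vf);
 *		handle_guest_exit(&tf);
 *	}
 */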
278 
279 /*
280  * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
281  *
282  * Map memory within a VMSPACE.  This function is just like a normal mmap()
283  * but operates on the vmspace's memory map.  Most callers use this to create
284  * a MAP_VPAGETABLE mapping.
285  *
286  * No requirements.
287  */
288 int
289 sys_vmspace_mmap(struct sysmsg *sysmsg,
290 		 const struct vmspace_mmap_args *uap)
291 {
292 	struct vkernel_proc *vkp;
293 	struct vmspace_entry *ve;
294 	int error;
295 
296 	if ((vkp = curproc->p_vkernel) == NULL) {
297 		error = EINVAL;
298 		goto done2;
299 	}
300 
301 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
302 		error = ENOENT;
303 		goto done2;
304 	}
305 
306 	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
307 			  uap->prot, uap->flags,
308 			  uap->fd, uap->offset, &sysmsg->sysmsg_resultp);
309 
310 	vmspace_entry_drop(ve);
311 done2:
312 	return (error);
313 }
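
/*
 * The typical caller-side use noted above, sketched under assumptions: the
 * <sys/vmspace.h> wrapper returns the mapped address like mmap() does, and
 * GUEST_RAM_SIZE is a hypothetical constant chosen by the vkernel.  An
 * anonymous MAP_VPAGETABLE region backs the guest's physical memory:
 *
 *	void *base;
 *
 *	base = vmspace_mmap(&vmspace_id, NULL, GUEST_RAM_SIZE,
 *			    PROT_READ | PROT_WRITE,
 *			    MAP_ANON | MAP_VPAGETABLE, -1, 0);
 *	if (base == MAP_FAILED)
 *		err(1, "vmspace_mmap");
 */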
314 
315 /*
316  * vmspace_munmap(id, addr, len)
317  *
318  * Unmap memory within a VMSPACE.
319  *
320  * No requirements.
321  */
322 int
323 sys_vmspace_munmap(struct sysmsg *sysmsg,
324 		   const struct vmspace_munmap_args *uap)
325 {
326 	struct vkernel_proc *vkp;
327 	struct vmspace_entry *ve;
328 	vm_offset_t addr;
329 	vm_offset_t tmpaddr;
330 	vm_size_t size, pageoff;
331 	vm_map_t map;
332 	int error;
333 
334 	if ((vkp = curproc->p_vkernel) == NULL) {
335 		error = EINVAL;
336 		goto done2;
337 	}
338 
339 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
340 		error = ENOENT;
341 		goto done2;
342 	}
343 
344 	/*
345 	 * NOTE: The removal below can block, so we need to temporarily
346 	 *	 hold a ref on ve (ve->refs).
347 	 */
348 
349 	/*
350 	 * Copied from sys_munmap()
351 	 */
352 	addr = (vm_offset_t)uap->addr;
353 	size = uap->len;
354 
355 	pageoff = (addr & PAGE_MASK);
356 	addr -= pageoff;
357 	size += pageoff;
358 	size = (vm_size_t)round_page(size);
359 	if (size < uap->len) {		/* wrap */
360 		error = EINVAL;
361 		goto done1;
362 	}
363 	tmpaddr = addr + size;		/* workaround gcc4 opt */
364 	if (tmpaddr < addr) {		/* wrap */
365 		error = EINVAL;
366 		goto done1;
367 	}
368 	if (size == 0) {
369 		error = 0;
370 		goto done1;
371 	}
372 
373 	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
374 		error = EINVAL;
375 		goto done1;
376 	}
377 	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
378 		error = EINVAL;
379 		goto done1;
380 	}
381 	map = &ve->vmspace->vm_map;
382 	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
383 		error = EINVAL;
384 		goto done1;
385 	}
386 	vm_map_remove(map, addr, addr + size);
387 	error = 0;
388 done1:
389 	vmspace_entry_drop(ve);
390 done2:
391 	return (error);
392 }
393 
394 /*
395  * vmspace_pread(id, buf, nbyte, flags, offset)
396  *
397  * Read data from a vmspace.  The number of bytes read is returned or
398  * -1 if an unrecoverable error occured.  If the number of bytes read is
399  * less then the request size, a page fault occured in the VMSPACE which
400  * the caller must resolve in order to proceed.
401  *
402  * (not implemented yet)
403  * No requirements.
404  */
405 int
406 sys_vmspace_pread(struct sysmsg *sysmsg,
407 		  const struct vmspace_pread_args *uap)
408 {
409 	struct vkernel_proc *vkp;
410 	struct vmspace_entry *ve;
411 	int error;
412 
413 	if ((vkp = curproc->p_vkernel) == NULL) {
414 		error = EINVAL;
415 		goto done3;
416 	}
417 
418 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
419 		error = ENOENT;
420 		goto done3;
421 	}
422 	vmspace_entry_drop(ve);
423 	error = EINVAL;
424 done3:
425 	return (error);
426 }
427 
428 /*
429  * vmspace_pwrite(id, buf, nbyte, flags, offset)
430  *
431  * Write data to a vmspace.  The number of bytes written is returned or
432  * -1 if an unrecoverable error occurred.  If the number of bytes written is
433  * less than the requested size, a page fault occurred in the VMSPACE which
434  * the caller must resolve in order to proceed.
435  *
436  * (not implemented yet)
437  * No requirements.
438  */
439 int
440 sys_vmspace_pwrite(struct sysmsg *sysmsg,
441 		   const struct vmspace_pwrite_args *uap)
442 {
443 	struct vkernel_proc *vkp;
444 	struct vmspace_entry *ve;
445 	int error;
446 
447 	if ((vkp = curproc->p_vkernel) == NULL) {
448 		error = EINVAL;
449 		goto done3;
450 	}
451 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
452 		error = ENOENT;
453 		goto done3;
454 	}
455 	vmspace_entry_drop(ve);
456 	error = EINVAL;
457 done3:
458 	return (error);
459 }
460 
461 /*
462  * vmspace_mcontrol(id, addr, len, behav, value)
463  *
464  * madvise/mcontrol support for a vmspace.
465  *
466  * No requirements.
467  */
468 int
469 sys_vmspace_mcontrol(struct sysmsg *sysmsg,
470 		     const struct vmspace_mcontrol_args *uap)
471 {
472 	struct vkernel_proc *vkp;
473 	struct vmspace_entry *ve;
474 	struct lwp *lp;
475 	vm_offset_t start, end;
476 	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
477 	int error;
478 
479 	lp = curthread->td_lwp;
480 	if ((vkp = curproc->p_vkernel) == NULL) {
481 		error = EINVAL;
482 		goto done3;
483 	}
484 
485 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
486 		error = ENOENT;
487 		goto done3;
488 	}
489 
490 	/*
491 	 * This code is basically copied from sys_mcontrol()
492 	 */
493 	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
494 		error = EINVAL;
495 		goto done1;
496 	}
497 
498 	if (tmpaddr < (vm_offset_t)uap->addr) {
499 		error = EINVAL;
500 		goto done1;
501 	}
502 	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
503 		error = EINVAL;
504 		goto done1;
505 	}
506         if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
507 		error = EINVAL;
508 		goto done1;
509 	}
510 
511 	start = trunc_page((vm_offset_t) uap->addr);
512 	end = round_page(tmpaddr);
513 
514 	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
515 				uap->behav, uap->value);
516 done1:
517 	vmspace_entry_drop(ve);
518 done3:
519 	return (error);
520 }
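
/*
 * A caller-side sketch, assuming MADV_SETMAP is the behavior used to point
 * a MAP_VPAGETABLE mapping at its page-table root; base and GUEST_RAM_SIZE
 * are the hypothetical names from the vmspace_mmap() sketch above and
 * pt_root is a hypothetical value locating the guest page-table directory:
 *
 *	if (vmspace_mcontrol(&vmspace_id, base, GUEST_RAM_SIZE,
 *			     MADV_SETMAP, pt_root) < 0)
 *		err(1, "vmspace_mcontrol");
 */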
521 
522 /*
523  * Red black tree functions
524  */
525 static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
526 RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);
527 
528 /*
529  * Entries are compared by their id (an address); only the id field of
530  * the key needs to be initialized.
531  *
532  * The caller must hold vkp->token.
533  */
534 static int
535 rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
536 {
537         if ((char *)a->id < (char *)b->id)
538                 return(-1);
539         else if ((char *)a->id > (char *)b->id)
540                 return(1);
541         return(0);
542 }
543 
544 /*
545  * The caller must hold vkp->token.
546  */
547 static
548 int
549 rb_vmspace_delete(struct vmspace_entry *ve, void *data)
550 {
551 	struct vkernel_proc *vkp = data;
552 
553 	if (vmspace_entry_delete(ve, vkp, 0) == 0)
554 		vmspace_entry_cache_drop(ve);
555 	else
556 		panic("rb_vmspace_delete: invalid refs %d", ve->refs);
557 	return(0);
558 }
559 
560 /*
561  * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
562  * up the pmap, the vm_map, then destroy the vmspace.  We gain control of
563  * the associated cache_refs ref, which the caller will drop for us.
564  *
565  * The ve must not have any active references other than those from the
566  * caller.  If it does, EBUSY is returned.  The ve may still maintain
567  * any number of cache references which will drop as the related LWPs
568  * execute vmspace operations or exit.
569  *
570  * 0 is returned on success, EBUSY on failure.  On success the caller must
571  * drop the last cache_refs.  We have dropped the caller's active refs.
572  *
573  * The caller must hold vkp->token.
574  */
575 static
576 int
577 vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp,
578 		     int refs)
579 {
580 	/*
581 	 * Interlocked by vkp->token.
582 	 *
583 	 * Drop the callers refs and set VKE_REF_DELETED atomically, if
584 	 * the remaining refs match exactly.  Dropping refs and setting
585 	 * the DELETED flag atomically protects other threads from trying
586 	 * to use the ve.
587 	 *
588 	 * The caller now owns the final cache_ref that was previously
589 	 * associated with the live state of the ve.
590 	 */
591 	if (atomic_cmpset_int(&ve->refs, refs, VKE_REF_DELETED) == 0) {
592 		KKASSERT(ve->refs >= refs);
593 		return EBUSY;
594 	}
595 	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);
596 
597 	pmap_remove_pages(vmspace_pmap(ve->vmspace),
598 			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
599 	vm_map_remove(&ve->vmspace->vm_map,
600 			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
601 	vmspace_rel(ve->vmspace);
602 	ve->vmspace = NULL; /* safety */
603 
604 	return 0;
605 }
606 
607 /*
608  * Ref a ve for cache purposes
609  */
610 static
611 void
612 vmspace_entry_cache_ref(struct vmspace_entry *ve)
613 {
614 	atomic_add_int(&ve->cache_refs, 1);
615 }
616 
617 /*
618  * The ve cache_drop is the final word for a ve.  The ve holds an extra
619  * cache ref representing its presence on the RB tree in a non-deleted state.
620  * Removal from the RB tree and deletion manipulate this ref.  The last
621  * drop will thus include full deletion of the ve in addition to the last
622  * cached user going away.
623  */
624 static
625 void
626 vmspace_entry_cache_drop(struct vmspace_entry *ve)
627 {
628 	if (atomic_fetchadd_int(&ve->cache_refs, -1) == 1) {
629 		KKASSERT(ve->refs & VKE_REF_DELETED);
630 		kfree(ve, M_VKERNEL);
631 	}
632 }
633 
634 /*
635  * Drop primary reference.  The ve cannot be freed on the 1->0 transition.
636  * Instead, ve deletion interlocks the final kfree() via cache_refs.
637  */
638 static
639 void
640 vmspace_entry_drop(struct vmspace_entry *ve)
641 {
642 	atomic_fetchadd_int(&ve->refs, -1);
643 }
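
/*
 * Summary of the two-counter lifecycle implemented by the helpers above:
 *
 *	refs	   - active users of the ve (lookups, vmspace_ctl operations
 *		     in progress, an LWP currently running on the vmspace).
 *		     The VKE_REF_DELETED flag in this field marks the entry
 *		     as deleted.
 *	cache_refs - one ref for "on the RB tree and not deleted" plus one
 *		     per lwp_vkernel ve_cache pointer.  The final
 *		     vmspace_entry_cache_drop() performs the kfree().
 *
 * vmspace_entry_delete() atomically exchanges the caller's active refs for
 * VKE_REF_DELETED, tears down the pmap, vm_map, and vmspace, and hands the
 * tree's cache ref to the caller, who drops it.
 */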
644 
645 /*
646  * Locate the ve for (id), return the ve or NULL.  If found, this function
647  * will bump ve->refs which prevents the ve from being immediately destroyed
648  * (but it can still be removed).
649  *
650  * The cache can potentially contain a stale ve, check by testing ve->vmspace.
651  *
652  * The caller must hold vkp->token if excl is non-zero.
653  */
654 static
655 struct vmspace_entry *
656 vkernel_find_vmspace(struct vkernel_proc *vkp, void *id, int excl)
657 {
658 	struct vmspace_entry *ve;
659 	struct vmspace_entry key;
660 	struct vkernel_lwp *vklp;
661 	struct lwp *lp = curthread->td_lwp;
662 
663 	/*
664 	 * Cache check.  Since we already hold a ref on the cache entry
665 	 * the ve cannot be ripped out from under us while we cycle
666 	 * ve->refs.
667 	 */
668 	if ((vklp = lp->lwp_vkernel) != NULL) {
669 		ve = vklp->ve_cache;
670 		if (ve && ve->id == id) {
671 			uint32_t n;
672 
673 			/*
674 			 * Bump active refs, check to see if the cache
675 			 * entry is stale.  If not, we are good.
676 			 */
677 			n = atomic_fetchadd_int(&ve->refs, 1);
678 			if ((n & VKE_REF_DELETED) == 0) {
679 				KKASSERT(ve->vmspace);
680 				return ve;
681 			}
682 
683 			/*
684 			 * Cache is stale, clean it out and fall through
685 			 * to a normal search.
686 			 */
687 			vklp->ve_cache = NULL;
688 			vmspace_entry_drop(ve);
689 			vmspace_entry_cache_drop(ve);
690 		}
691 	}
692 
693 	/*
694 	 * Normal search protected by vkp->token.  No new ve's can be marked
695 	 * DELETED while we hold the token so we are safe.
696 	 */
697 	if (excl == 0)
698 		lwkt_gettoken_shared(&vkp->token);
699 	key.id = id;
700 	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
701 	if (ve) {
702 		if (atomic_fetchadd_int(&ve->refs, 1) & VKE_REF_DELETED) {
703 			vmspace_entry_drop(ve);
704 			ve = NULL;
705 		}
706 	}
707 	if (excl == 0)
708 		lwkt_reltoken(&vkp->token);
709 	return (ve);
710 }
711 
712 /*
713  * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
714  * a vkernel process.
715  *
716  * No requirements.
717  */
718 void
719 vkernel_inherit(struct proc *p1, struct proc *p2)
720 {
721 	struct vkernel_proc *vkp;
722 
723 	vkp = p1->p_vkernel;
724 	KKASSERT(vkp->refs > 0);
725 	atomic_add_int(&vkp->refs, 1);
726 	p2->p_vkernel = vkp;
727 }
728 
729 /*
730  * No requirements.
731  */
732 void
733 vkernel_exit(struct proc *p)
734 {
735 	struct vkernel_proc *vkp;
736 	struct lwp *lp;
737 
738 	vkp = p->p_vkernel;
739 
740 	/*
741 	 * Restore the original VM context if we are killed while running
742 	 * a different one.
743 	 *
744 	 * This isn't supposed to happen.  What is supposed to happen is
745  * that the process should enter vkernel_trap() before handling
746  * the signal.
747 	 */
748 	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
749 		vkernel_lwp_exit(lp);
750 	}
751 
752 	/*
753 	 * Dereference the common area
754 	 */
755 	p->p_vkernel = NULL;
756 	KKASSERT(vkp->refs > 0);
757 
758 	if (atomic_fetchadd_int(&vkp->refs, -1) == 1) {
759 		lwkt_gettoken(&vkp->token);
760 		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
761 			rb_vmspace_delete, vkp);
762 		lwkt_reltoken(&vkp->token);
763 		kfree(vkp, M_VKERNEL);
764 	}
765 }
766 
767 /*
768  * No requirements.
769  */
770 void
771 vkernel_lwp_exit(struct lwp *lp)
772 {
773 	struct vkernel_lwp *vklp;
774 	struct vmspace_entry *ve;
775 
776 	if ((vklp = lp->lwp_vkernel) != NULL) {
777 		if (lp->lwp_thread->td_vmm == NULL) {
778 			/*
779 			 * vkernel thread
780 			 */
781 			if ((ve = vklp->ve) != NULL) {
782 				kprintf("Warning, pid %d killed with "
783 					"active VC!\n", lp->lwp_proc->p_pid);
784 				pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
785 				vklp->ve = NULL;
786 				KKASSERT(ve->refs > 0);
787 				vmspace_entry_drop(ve);
788 			}
789 		} else {
790 			/*
791 			 * guest thread
792 			 */
793 			vklp->ve = NULL;
794 		}
795 		if ((ve = vklp->ve_cache) != NULL) {
796 			vklp->ve_cache = NULL;
797 			vmspace_entry_cache_drop(ve);
798 		}
799 
800 		lp->lwp_vkernel = NULL;
801 		kfree(vklp, M_VKERNEL);
802 	}
803 }
804 
805 /*
806  * A VM space under virtual kernel control trapped out or made a system call
807  * or otherwise needs to return control to the virtual kernel context.
808  *
809  * No requirements.
810  */
811 void
812 vkernel_trap(struct lwp *lp, struct trapframe *frame)
813 {
814 	struct proc *p = lp->lwp_proc;
815 	struct vmspace_entry *ve;
816 	struct vkernel_lwp *vklp;
817 	int error;
818 
819 	/*
820 	 * Which vmspace entry was running?
821 	 */
822 	vklp = lp->lwp_vkernel;
823 	KKASSERT(vklp);
824 
825 	/* If it's a VMM thread just set the vkernel CR3 back */
826 	if (curthread->td_vmm == NULL) {
827 		ve = vklp->ve;
828 		KKASSERT(ve != NULL);
829 
830 		/*
831 		 * Switch the LWP vmspace back to the virtual kernel's VM space.
832 		 */
833 		vklp->ve = NULL;
834 		pmap_setlwpvm(lp, p->p_vmspace);
835 		KKASSERT(ve->refs > 0);
836 		vmspace_entry_drop(ve);
837 		/* ve is invalid once we kill our ref */
838 	} else {
839 		vklp->ve = NULL;
840 		vmm_vm_set_guest_cr3(p->p_vkernel->vkernel_cr3);
841 	}
842 
843 	/*
844 	 * Copy the emulated process frame to the virtual kernel process.
845 	 * The emulated process cannot change TLS descriptors so don't
846  * bother saving them; we already have a copy.
847 	 *
848 	 * Restore the virtual kernel's saved context so the virtual kernel
849 	 * process can resume.
850 	 */
851 	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
852 	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
853 	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
854 	      sizeof(vklp->save_vextframe.vx_tls));
855 	set_user_TLS();
856 	cpu_vkernel_trap(frame, error);
857 }
858