xref: /dragonfly/sys/vm/vm_vmspace.c (revision c9c5aa9e)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 #include <sys/param.h>
38 #include <sys/kernel.h>
39 #include <sys/systm.h>
40 #include <sys/sysmsg.h>
41 #include <sys/kern_syscall.h>
42 #include <sys/mman.h>
43 #include <sys/thread.h>
44 #include <sys/proc.h>
45 #include <sys/malloc.h>
46 #include <sys/sysctl.h>
47 #include <sys/vkernel.h>
48 #include <sys/vmspace.h>
49 
50 #include <vm/vm_extern.h>
51 #include <vm/pmap.h>
52 
53 #include <machine/vmparam.h>
54 #include <machine/vmm.h>
55 
56 static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
57 						  void *id, int excl);
58 static int vmspace_entry_delete(struct vmspace_entry *ve,
59 				 struct vkernel_proc *vkp, int refs);
60 static void vmspace_entry_cache_ref(struct vmspace_entry *ve);
61 static void vmspace_entry_cache_drop(struct vmspace_entry *ve);
62 static void vmspace_entry_drop(struct vmspace_entry *ve);
63 
64 static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");
65 
66 /*
67  * vmspace_create (void *id, int type, void *data)
68  *
69  * Create a VMSPACE under the control of the caller with the specified id.
70  * An id of NULL cannot be used.  The type and data fields must currently
71  * be 0.
72  *
73  * The vmspace starts out completely empty.  Memory may be mapped into the
74  * VMSPACE with vmspace_mmap(), and MAP_VPAGETABLE sections can be
75  * controlled with vmspace_mcontrol().
76  *
77  * No requirements.
78  */
79 int
80 sys_vmspace_create(struct sysmsg *sysmsg,
81 		   const struct vmspace_create_args *uap)
82 {
83 	struct vmspace_entry *ve;
84 	struct vkernel_proc *vkp;
85 	struct proc *p = curproc;
86 	int error;
87 
88 	if (vkernel_enable == 0)
89 		return (EOPNOTSUPP);
90 
91 	/*
92 	 * Create a virtual kernel side-structure for the process if one
93 	 * does not exist.
94 	 *
95 	 * Implement a simple resolution for SMP races (alloc, recheck, free).
96 	 */
97 	if ((vkp = p->p_vkernel) == NULL) {
98 		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
99 		lwkt_gettoken(&p->p_token);
100 		if (p->p_vkernel == NULL) {
101 			vkp->refs = 1;
102 			lwkt_token_init(&vkp->token, "vkernel");
103 			RB_INIT(&vkp->root);
104 			p->p_vkernel = vkp;
105 		} else {
106 			kfree(vkp, M_VKERNEL);
107 			vkp = p->p_vkernel;
108 		}
109 		lwkt_reltoken(&p->p_token);
110 	}
111 
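	/*
	 * NOTE: If this thread runs under a hardware VMM, the guest manages
	 *	 its own address spaces (vmspace_ctl() keys them by id/CR3),
	 *	 so no vmspace_entry is created here.
	 */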
112 	if (curthread->td_vmm)
113 		return 0;
114 
115 	/*
116 	 * Create a new VMSPACE, disallow conflicting ids
117 	 */
118 	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
119 	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
120 	ve->id = uap->id;
121 	ve->refs = 0;		/* active refs (none) */
122 	ve->cache_refs = 1;	/* on-tree, not deleted (prevent kfree) */
123 	pmap_pinit2(vmspace_pmap(ve->vmspace));
124 
125 	lwkt_gettoken(&vkp->token);
126 	if (RB_INSERT(vmspace_rb_tree, &vkp->root, ve)) {
127 		vmspace_rel(ve->vmspace);
128 		ve->vmspace = NULL; /* safety */
129 		kfree(ve, M_VKERNEL);
130 		error = EEXIST;
131 	} else {
132 		error = 0;
133 	}
134 	lwkt_reltoken(&vkp->token);
135 
136 	return (error);
137 }
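/*
 * Illustrative userland usage (a sketch only, not part of this file),
 * assuming the usual libc stubs for these vkernel system calls:
 *
 *	void *id = some_unique_token;		(any non-NULL value)
 *	if (vmspace_create(id, 0, NULL) < 0)
 *		err(1, "vmspace_create");
 *	vmspace_mmap(id, addr, len, prot, flags, fd, offset);
 *	vmspace_ctl(id, VMSPACE_CTL_RUN, &tframe, &vextframe);
 */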
138 
139 /*
140  * Destroy a VMSPACE given its identifier.
141  *
142  * No requirements.
143  */
144 int
145 sys_vmspace_destroy(struct sysmsg *sysmsg,
146 		    const struct vmspace_destroy_args *uap)
147 {
148 	struct vkernel_proc *vkp;
149 	struct vmspace_entry *ve;
150 	int error;
151 
152 	if ((vkp = curproc->p_vkernel) == NULL)
153 		return EINVAL;
154 
155 	/*
156 	 * vkp->token protects the deletion against a new RB tree search.
157 	 */
158 	lwkt_gettoken(&vkp->token);
159 	error = ENOENT;
160 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 1)) != NULL) {
161 		error = vmspace_entry_delete(ve, vkp, 1);
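		/*
		 * On success we inherited the final cache ref that kept the
		 * entry live on the RB tree; drop it.  The ve is kfree()d
		 * once the last cached reference goes away.
		 */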
162 		if (error == 0)
163 			vmspace_entry_cache_drop(ve);
164 	}
165 	lwkt_reltoken(&vkp->token);
166 
167 	return(error);
168 }
169 
170 /*
171  * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
172  *		struct vextframe *vframe);
173  *
174  * Transfer control to a VMSPACE.  Control is returned to the caller when
175  * a page fault, signal, trap, or system call occurs in the target context.
176  * The emulated context is updated as appropriate.
177  *
178  * No requirements.
179  */
180 int
181 sys_vmspace_ctl(struct sysmsg *sysmsg,
182 		const struct vmspace_ctl_args *uap)
183 {
184 	struct vmspace_ctl_args ua = *uap;
185 	struct vkernel_proc *vkp;
186 	struct vkernel_lwp *vklp;
187 	struct vmspace_entry *ve = NULL;
188 	struct lwp *lp;
189 	struct proc *p;
190 	int framesz;
191 	int error;
192 
193 	lp = curthread->td_lwp;
194 	p = lp->lwp_proc;
195 
196 	if ((vkp = p->p_vkernel) == NULL)
197 		return (EINVAL);
198 
199 	/*
200 	 * ve only matters when VMM is not used.
201 	 *
202 	 * NOTE: We have to copy *uap into ua because uap is an aliased
203 	 *	 pointer into the sysframe, which we are replacing.
204 	 */
205 	if (curthread->td_vmm == NULL) {
206 		if ((ve = vkernel_find_vmspace(vkp, ua.id, 0)) == NULL) {
207 			error = ENOENT;
208 			goto done;
209 		}
210 	}
211 
212 	switch(ua.cmd) {
213 	case VMSPACE_CTL_RUN:
214 		/*
215 		 * Save the caller's register context, swap VM spaces, and
216 		 * install the passed register context.  Return with
217 		 * EJUSTRETURN so the syscall code doesn't adjust the context.
218 		 */
219 		framesz = sizeof(struct trapframe);
220 		if ((vklp = lp->lwp_vkernel) == NULL) {
221 			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
222 				       M_WAITOK|M_ZERO);
223 			lp->lwp_vkernel = vklp;
224 		}
225 		if (ve && vklp->ve_cache != ve) {
226 			vmspace_entry_cache_ref(ve);
227 			if (vklp->ve_cache)
228 				vmspace_entry_cache_drop(vklp->ve_cache);
229 			vklp->ve_cache = ve;
230 		}
231 		vklp->user_trapframe = ua.tframe;
232 		vklp->user_vextframe = ua.vframe;
233 		bcopy(sysmsg->sysmsg_frame, &vklp->save_trapframe, framesz);
234 		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
235 		      sizeof(vklp->save_vextframe.vx_tls));
236 		error = copyin(ua.tframe, sysmsg->sysmsg_frame, framesz);
237 		if (error == 0) {
238 			error = copyin(&ua.vframe->vx_tls,
239 				       &curthread->td_tls,
240 				       sizeof(struct savetls));
241 		}
242 		if (error == 0)
243 			error = cpu_sanitize_frame(sysmsg->sysmsg_frame);
244 		if (error == 0)
245 			error = cpu_sanitize_tls(&curthread->td_tls);
246 		if (error) {
247 			bcopy(&vklp->save_trapframe, sysmsg->sysmsg_frame,
248 			      framesz);
249 			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
250 			      sizeof(vklp->save_vextframe.vx_tls));
251 			set_user_TLS();
252 		} else {
253 			/*
254 			 * If it's a VMM thread just set the CR3.  We also
255 			 * set vklp->ve to a non-NULL key so we can tell
256 			 * whether a vkernel user process is currently
257 			 * running (vklp->ve is NULL when it is not).
258 			 */
259 			if (curthread->td_vmm == NULL) {
260 				vklp->ve = ve;
261 				atomic_add_int(&ve->refs, 1);
262 				pmap_setlwpvm(lp, ve->vmspace);
263 			} else {
264 				vklp->ve = ua.id;
265 				vmm_vm_set_guest_cr3((register_t)ua.id);
266 			}
267 			set_user_TLS();
268 			set_vkernel_fp(sysmsg->sysmsg_frame);
269 			error = EJUSTRETURN;
270 		}
271 		break;
272 	default:
273 		error = EOPNOTSUPP;
274 		break;
275 	}
276 done:
277 	if (ve)
278 		vmspace_entry_drop(ve);
279 
280 	return(error);
281 }
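/*
 * Round-trip summary (derived from the code above and vkernel_trap() below):
 * VMSPACE_CTL_RUN saves the vkernel's trapframe/TLS in vklp, installs the
 * guest context, and returns EJUSTRETURN.  The guest runs until a fault,
 * signal, trap, or syscall, at which point vkernel_trap() copies the guest
 * frame back out to vklp->user_trapframe and restores the saved context.
 */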
282 
283 /*
284  * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
285  *
286  * Map memory within a VMSPACE.  This function is just like a normal mmap()
287  * but operates on the vmspace's memory map.  Most callers use this to create
288  * a MAP_VPAGETABLE mapping.
289  *
290  * No requirements.
291  */
292 int
293 sys_vmspace_mmap(struct sysmsg *sysmsg,
294 		 const struct vmspace_mmap_args *uap)
295 {
296 	struct vkernel_proc *vkp;
297 	struct vmspace_entry *ve;
298 	int error;
299 
300 	if ((vkp = curproc->p_vkernel) == NULL) {
301 		error = EINVAL;
302 		goto done2;
303 	}
304 
305 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
306 		error = ENOENT;
307 		goto done2;
308 	}
309 
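	/*
	 * Hand the request to the generic mmap path, but target the guest's
	 * vmspace rather than the current process's map.
	 */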
310 	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
311 			  uap->prot, uap->flags,
312 			  uap->fd, uap->offset, &sysmsg->sysmsg_resultp);
313 
314 	vmspace_entry_drop(ve);
315 done2:
316 	return (error);
317 }
318 
319 /*
320  * vmspace_munmap(id, addr, len)
321  *
322  * Unmap memory within a VMSPACE.
323  *
324  * No requirements.
325  */
326 int
327 sys_vmspace_munmap(struct sysmsg *sysmsg,
328 		   const struct vmspace_munmap_args *uap)
329 {
330 	struct vkernel_proc *vkp;
331 	struct vmspace_entry *ve;
332 	vm_offset_t addr;
333 	vm_offset_t tmpaddr;
334 	vm_size_t size, pageoff;
335 	vm_map_t map;
336 	int error;
337 
338 	if ((vkp = curproc->p_vkernel) == NULL) {
339 		error = EINVAL;
340 		goto done2;
341 	}
342 
343 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
344 		error = ENOENT;
345 		goto done2;
346 	}
347 
348 	/*
349 	 * NOTE: The map removal below can block, so the ref on ve->refs
350 	 *	 obtained by vkernel_find_vmspace() must be held across it.
351 	 */
352 
353 	/*
354 	 * Copied from sys_munmap()
355 	 */
356 	addr = (vm_offset_t)uap->addr;
357 	size = uap->len;
358 
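	/*
	 * Page-align the range: round addr down to a page boundary and grow
	 * size to still cover the original request, checking for wrap.
	 */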
359 	pageoff = (addr & PAGE_MASK);
360 	addr -= pageoff;
361 	size += pageoff;
362 	size = (vm_size_t)round_page(size);
363 	if (size < uap->len) {		/* wrap */
364 		error = EINVAL;
365 		goto done1;
366 	}
367 	tmpaddr = addr + size;		/* workaround gcc4 opt */
368 	if (tmpaddr < addr) {		/* wrap */
369 		error = EINVAL;
370 		goto done1;
371 	}
372 	if (size == 0) {
373 		error = 0;
374 		goto done1;
375 	}
376 
377 	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
378 		error = EINVAL;
379 		goto done1;
380 	}
381 	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
382 		error = EINVAL;
383 		goto done1;
384 	}
385 	map = &ve->vmspace->vm_map;
386 	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
387 		error = EINVAL;
388 		goto done1;
389 	}
390 	vm_map_remove(map, addr, addr + size);
391 	error = 0;
392 done1:
393 	vmspace_entry_drop(ve);
394 done2:
395 	return (error);
396 }
397 
398 /*
399  * vmspace_pread(id, buf, nbyte, flags, offset)
400  *
401  * Read data from a vmspace.  The number of bytes read is returned or
402  * -1 if an unrecoverable error occurred.  If the number of bytes read is
403  * less than the request size, a page fault occurred in the VMSPACE which
404  * the caller must resolve in order to proceed.
405  *
406  * (not implemented yet)
407  * No requirements.
408  */
409 int
410 sys_vmspace_pread(struct sysmsg *sysmsg,
411 		  const struct vmspace_pread_args *uap)
412 {
413 	struct vkernel_proc *vkp;
414 	struct vmspace_entry *ve;
415 	int error;
416 
417 	if ((vkp = curproc->p_vkernel) == NULL) {
418 		error = EINVAL;
419 		goto done3;
420 	}
421 
422 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
423 		error = ENOENT;
424 		goto done3;
425 	}
426 	vmspace_entry_drop(ve);
427 	error = EINVAL;
428 done3:
429 	return (error);
430 }
431 
432 /*
433  * vmspace_pwrite(id, buf, nbyte, flags, offset)
434  *
435  * Write data to a vmspace.  The number of bytes written is returned or
436  * -1 if an unrecoverable error occurred.  If the number of bytes written is
437  * less than the request size, a page fault occurred in the VMSPACE which
438  * the caller must resolve in order to proceed.
439  *
440  * (not implemented yet)
441  * No requirements.
442  */
443 int
444 sys_vmspace_pwrite(struct sysmsg *sysmsg,
445 		   const struct vmspace_pwrite_args *uap)
446 {
447 	struct vkernel_proc *vkp;
448 	struct vmspace_entry *ve;
449 	int error;
450 
451 	if ((vkp = curproc->p_vkernel) == NULL) {
452 		error = EINVAL;
453 		goto done3;
454 	}
455 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
456 		error = ENOENT;
457 		goto done3;
458 	}
459 	vmspace_entry_drop(ve);
460 	error = EINVAL;
461 done3:
462 	return (error);
463 }
464 
465 /*
466  * vmspace_mcontrol(id, addr, len, behav, value)
467  *
468  * madvise/mcontrol support for a vmspace.
469  *
470  * No requirements.
471  */
472 int
473 sys_vmspace_mcontrol(struct sysmsg *sysmsg,
474 		     const struct vmspace_mcontrol_args *uap)
475 {
476 	struct vkernel_proc *vkp;
477 	struct vmspace_entry *ve;
478 	struct lwp *lp;
479 	vm_offset_t start, end;
480 	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
481 	int error;
482 
483 	lp = curthread->td_lwp;
484 	if ((vkp = curproc->p_vkernel) == NULL) {
485 		error = EINVAL;
486 		goto done3;
487 	}
488 
489 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
490 		error = ENOENT;
491 		goto done3;
492 	}
493 
494 	/*
495 	 * This code is basically copied from sys_mcontrol()
496 	 */
497 	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
498 		error = EINVAL;
499 		goto done1;
500 	}
501 
502 	if (tmpaddr < (vm_offset_t)uap->addr) {
503 		error = EINVAL;
504 		goto done1;
505 	}
506 	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
507 		error = EINVAL;
508 		goto done1;
509 	}
510 	if (VM_MIN_USER_ADDRESS > 0 && (vm_offset_t)uap->addr < VM_MIN_USER_ADDRESS) {
511 		error = EINVAL;
512 		goto done1;
513 	}
514 
515 	start = trunc_page((vm_offset_t) uap->addr);
516 	end = round_page(tmpaddr);
517 
518 	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
519 				uap->behav, uap->value);
520 done1:
521 	vmspace_entry_drop(ve);
522 done3:
523 	return (error);
524 }
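/*
 * Hypothetical usage sketch (not part of this file): after modifying a guest
 * page-table entry backing a MAP_VPAGETABLE mapping, a vkernel would
 * typically invalidate the affected range, e.g. using the MADV_INVAL
 * behavior described in madvise(2):
 *
 *	vmspace_mcontrol(id, va, PAGE_SIZE, MADV_INVAL, 0);
 */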
525 
526 /*
527  * Red black tree functions
528  */
529 static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
530 RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);
531 
532 /*
533  * RB tree comparison function, keyed on ve->id.  Only the id field of a
534  * stack-based search key has to be initialized.
535  *
536  * The caller must hold vkp->token.
537  */
538 static int
539 rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
540 {
541 	if ((char *)a->id < (char *)b->id)
542 		return(-1);
543 	else if ((char *)a->id > (char *)b->id)
544 		return(1);
545 	return(0);
546 }
547 
548 /*
549  * The caller must hold vkp->token.
550  */
551 static
552 int
553 rb_vmspace_delete(struct vmspace_entry *ve, void *data)
554 {
555 	struct vkernel_proc *vkp = data;
556 
557 	if (vmspace_entry_delete(ve, vkp, 0) == 0)
558 		vmspace_entry_cache_drop(ve);
559 	else
560 		panic("rb_vmspace_delete: invalid refs %d", ve->refs);
561 	return(0);
562 }
563 
564 /*
565  * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
566  * up the pmap, the vm_map, then destroy the vmspace.  We gain control of
567  * the associated cache_refs ref, which the caller will drop for us.
568  *
569  * The ve must not have any active references other than those from the
570  * caller.  If it does, EBUSY is returned.  The ve may still maintain
571  * any number of cache references which will drop as the related LWPs
572  * execute vmspace operations or exit.
573  *
574  * 0 is returned on success, EBUSY on failure.  On success the caller must
575  * drop the last cache_refs.  We have dropped the caller's active refs.
576  *
577  * The caller must hold vkp->token.
578  */
579 static
580 int
581 vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp,
582 		     int refs)
583 {
584 	/*
585 	 * Interlocked by vkp->token.
586 	 *
587 	 * Drop the caller's refs and set VKE_REF_DELETED atomically, if
588 	 * the remaining refs match exactly.  Dropping refs and setting
589 	 * the DELETED flag atomically protects other threads from trying
590 	 * to use the ve.
591 	 *
592 	 * The caller now owns the final cache_ref that was previously
593 	 * associated with the live state of the ve.
594 	 */
595 	if (atomic_cmpset_int(&ve->refs, refs, VKE_REF_DELETED) == 0) {
596 		KKASSERT(ve->refs >= refs);
597 		return EBUSY;
598 	}
599 	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);
600 
601 	pmap_remove_pages(vmspace_pmap(ve->vmspace),
602 			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
603 	vm_map_remove(&ve->vmspace->vm_map,
604 			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
605 	vmspace_rel(ve->vmspace);
606 	ve->vmspace = NULL; /* safety */
607 
608 	return 0;
609 }
610 
611 /*
612  * Ref a ve for cache purposes
613  */
614 static
615 void
616 vmspace_entry_cache_ref(struct vmspace_entry *ve)
617 {
618 	atomic_add_int(&ve->cache_refs, 1);
619 }
620 
621 /*
622  * The ve cache_drop is the final word for a ve.  cache_refs carries an
623  * extra ref representing the ve being on the RB tree and not yet deleted.
624  * Removal from the RB tree and deletion manipulate this ref.  The last
625  * drop will thus include full deletion of the ve in addition to the last
626  * cached user going away.
627  */
628 static
629 void
630 vmspace_entry_cache_drop(struct vmspace_entry *ve)
631 {
632 	if (atomic_fetchadd_int(&ve->cache_refs, -1) == 1) {
633 		KKASSERT(ve->refs & VKE_REF_DELETED);
634 		kfree(ve, M_VKERNEL);
635 	}
636 }
637 
638 /*
639  * Drop primary reference.  The ve cannot be freed on the 1->0 transition.
640  * Instead, ve deletion interlocks the final kfree() via cache_refs.
641  */
642 static
643 void
644 vmspace_entry_drop(struct vmspace_entry *ve)
645 {
646 	atomic_fetchadd_int(&ve->refs, -1);
647 }
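/*
 * Reference model recap (summarizing the comments above): ve->refs counts
 * transient active users (in-progress vmspace syscalls, an LWP currently
 * running the space), while ve->cache_refs counts long-lived cached pointers
 * plus one ref for being live on the RB tree.  vmspace_entry_delete()
 * converts the active count into VKE_REF_DELETED; the kfree() happens only
 * when the last cache ref is dropped in vmspace_entry_cache_drop().
 */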
648 
649 /*
650  * Locate the ve for (id) and return it, or NULL.  If found, this function
651  * bumps ve->refs, which prevents the ve from being immediately destroyed
652  * (but it can still be removed from the RB tree).
653  *
654  * The cache can potentially contain a stale ve, detected via VKE_REF_DELETED.
655  *
656  * The caller must hold vkp->token if excl is non-zero.
657  */
658 static
659 struct vmspace_entry *
660 vkernel_find_vmspace(struct vkernel_proc *vkp, void *id, int excl)
661 {
662 	struct vmspace_entry *ve;
663 	struct vmspace_entry key;
664 	struct vkernel_lwp *vklp;
665 	struct lwp *lp = curthread->td_lwp;
666 
667 	/*
668 	 * Cache check.  Since we already hold a ref on the cache entry
669 	 * the ve cannot be ripped out from under us while we cycle
670 	 * ve->refs.
671 	 */
672 	if ((vklp = lp->lwp_vkernel) != NULL) {
673 		ve = vklp->ve_cache;
674 		if (ve && ve->id == id) {
675 			uint32_t n;
676 
677 			/*
678 			 * Bump active refs, check to see if the cache
679 			 * entry is stale.  If not, we are good.
680 			 */
681 			n = atomic_fetchadd_int(&ve->refs, 1);
682 			if ((n & VKE_REF_DELETED) == 0) {
683 				KKASSERT(ve->vmspace);
684 				return ve;
685 			}
686 
687 			/*
688 			 * Cache is stale: undo our refs bump, drop the
689 			 * cached ref, and fall through to a normal search.
690 			 */
691 			vklp->ve_cache = NULL;
692 			vmspace_entry_drop(ve);
693 			vmspace_entry_cache_drop(ve);
694 		}
695 	}
696 
697 	/*
698 	 * Normal search protected by vkp->token.  No new ve's can be marked
699 	 * DELETED while we hold the token so we are safe.
700 	 */
701 	if (excl == 0)
702 		lwkt_gettoken_shared(&vkp->token);
703 	key.id = id;
704 	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
705 	if (ve) {
706 		if (atomic_fetchadd_int(&ve->refs, 1) & VKE_REF_DELETED) {
707 			vmspace_entry_drop(ve);
708 			ve = NULL;
709 		}
710 	}
711 	if (excl == 0)
712 		lwkt_reltoken(&vkp->token);
713 	return (ve);
714 }
715 
716 /*
717  * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
718  * a vkernel process.
719  *
720  * No requirements.
721  */
722 void
723 vkernel_inherit(struct proc *p1, struct proc *p2)
724 {
725 	struct vkernel_proc *vkp;
726 
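	/*
	 * fork(): the child shares the parent's vkernel_proc; just add a
	 * reference to the common structure.
	 */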
727 	vkp = p1->p_vkernel;
728 	KKASSERT(vkp->refs > 0);
729 	atomic_add_int(&vkp->refs, 1);
730 	p2->p_vkernel = vkp;
731 }
732 
733 /*
734  * No requirements.
735  */
736 void
737 vkernel_exit(struct proc *p)
738 {
739 	struct vkernel_proc *vkp;
740 	struct lwp *lp;
741 
742 	vkp = p->p_vkernel;
743 
744 	/*
745 	 * Restore the original VM context if we are killed while running
746 	 * a different one.
747 	 *
748 	 * This isn't supposed to happen.  What is supposed to happen is
749 	 * that the process should enter vkernel_trap() before handling
750 	 * the signal.
751 	 */
752 	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
753 		vkernel_lwp_exit(lp);
754 	}
755 
756 	/*
757 	 * Dereference the common area
758 	 */
759 	p->p_vkernel = NULL;
760 	KKASSERT(vkp->refs > 0);
761 
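	/*
	 * On the last reference, tear down every vmspace_entry still
	 * registered on the RB tree and free the shared structure.
	 */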
762 	if (atomic_fetchadd_int(&vkp->refs, -1) == 1) {
763 		lwkt_gettoken(&vkp->token);
764 		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
765 			rb_vmspace_delete, vkp);
766 		lwkt_reltoken(&vkp->token);
767 		kfree(vkp, M_VKERNEL);
768 	}
769 }
770 
771 /*
772  * No requirements.
773  */
774 void
775 vkernel_lwp_exit(struct lwp *lp)
776 {
777 	struct vkernel_lwp *vklp;
778 	struct vmspace_entry *ve;
779 
780 	if ((vklp = lp->lwp_vkernel) != NULL) {
781 		if (lp->lwp_thread->td_vmm == NULL) {
782 			/*
783 			 * vkernel thread
784 			 */
785 			if ((ve = vklp->ve) != NULL) {
786 				kprintf("Warning, pid %d killed with "
787 					"active VC!\n", lp->lwp_proc->p_pid);
788 				pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
789 				vklp->ve = NULL;
790 				KKASSERT(ve->refs > 0);
791 				vmspace_entry_drop(ve);
792 			}
793 		} else {
794 			/*
795 			 * guest thread
796 			 */
797 			vklp->ve = NULL;
798 		}
799 		if ((ve = vklp->ve_cache) != NULL) {
800 			vklp->ve_cache = NULL;
801 			vmspace_entry_cache_drop(ve);
802 		}
803 
804 		lp->lwp_vkernel = NULL;
805 		kfree(vklp, M_VKERNEL);
806 	}
807 }
808 
809 /*
810  * A VM space under virtual kernel control trapped out or made a system call
811  * or otherwise needs to return control to the virtual kernel context.
812  *
813  * No requirements.
814  */
815 void
816 vkernel_trap(struct lwp *lp, struct trapframe *frame)
817 {
818 	struct proc *p = lp->lwp_proc;
819 	struct vmspace_entry *ve;
820 	struct vkernel_lwp *vklp;
821 	int error;
822 
823 	/*
824 	 * Which vmspace entry was running?
825 	 */
826 	vklp = lp->lwp_vkernel;
827 	KKASSERT(vklp);
828 
829 	/* If it's a VMM thread just set the vkernel CR3 back */
830 	if (curthread->td_vmm == NULL) {
831 		ve = vklp->ve;
832 		KKASSERT(ve != NULL);
833 
834 		/*
835 		 * Switch the LWP vmspace back to the virtual kernel's VM space.
836 		 */
837 		vklp->ve = NULL;
838 		pmap_setlwpvm(lp, p->p_vmspace);
839 		KKASSERT(ve->refs > 0);
840 		vmspace_entry_drop(ve);
841 		/* ve is invalid once we kill our ref */
842 	} else {
843 		vklp->ve = NULL;
844 		vmm_vm_set_guest_cr3(p->p_vkernel->vkernel_cr3);
845 	}
846 
847 	/*
848 	 * Copy the emulated process frame to the virtual kernel process.
849 	 * The emulated process cannot change TLS descriptors so don't
850 	 * bother saving them, we already have a copy.
851 	 *
852 	 * Restore the virtual kernel's saved context so the virtual kernel
853 	 * process can resume.
854 	 */
855 	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
856 	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
857 	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
858 	      sizeof(vklp->save_vextframe.vx_tls));
859 	set_user_TLS();
860 	cpu_vkernel_trap(frame, error);
861 }
862