xref: /dragonfly/sys/vm/vm_vmspace.c (revision d8d5b238)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 #include <sys/param.h>
38 #include <sys/kernel.h>
39 #include <sys/systm.h>
40 #include <sys/sysproto.h>
41 #include <sys/kern_syscall.h>
42 #include <sys/mman.h>
43 #include <sys/thread.h>
44 #include <sys/proc.h>
45 #include <sys/malloc.h>
46 #include <sys/sysctl.h>
47 #include <sys/vkernel.h>
48 #include <sys/vmspace.h>
49 
50 #include <vm/vm_extern.h>
51 #include <vm/pmap.h>
52 
53 #include <machine/vmparam.h>
54 #include <machine/vmm.h>
55 
56 static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
57 						  void *id, int havetoken);
58 static int vmspace_entry_delete(struct vmspace_entry *ve,
59 				 struct vkernel_proc *vkp, int refs);
60 static void vmspace_entry_cache_ref(struct vmspace_entry *ve);
61 static void vmspace_entry_cache_drop(struct vmspace_entry *ve);
62 static void vmspace_entry_drop(struct vmspace_entry *ve);
63 
64 static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");
65 
66 /*
67  * vmspace_create (void *id, int type, void *data)
68  *
69  * Create a VMSPACE under the control of the caller with the specified id.
70  * An id of NULL cannot be used.  The type and data fields must currently
71  * be 0.
72  *
73  * The vmspace starts out completely empty.  Memory may be mapped into the
74  * VMSPACE with vmspace_mmap() and MAP_VPAGETABLE section(s) controlled
75  * with vmspace_mcontrol().
76  *
77  * No requirements.
78  */
79 int
80 sys_vmspace_create(struct vmspace_create_args *uap)
81 {
82 	struct vmspace_entry *ve;
83 	struct vkernel_proc *vkp;
84 	struct proc *p = curproc;
85 	int error;
86 
87 	if (vkernel_enable == 0)
88 		return (EOPNOTSUPP);
89 
90 	/*
91 	 * Create a virtual kernel side-structure for the process if one
92 	 * does not exist.
93 	 *
94 	 * Implement a simple resolution for SMP races.
95 	 */
96 	if ((vkp = p->p_vkernel) == NULL) {
97 		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
98 		lwkt_gettoken(&p->p_token);
99 		if (p->p_vkernel == NULL) {
100 			vkp->refs = 1;
101 			lwkt_token_init(&vkp->token, "vkernel");
102 			RB_INIT(&vkp->root);
103 			p->p_vkernel = vkp;
104 		} else {
105 			kfree(vkp, M_VKERNEL);
106 			vkp = p->p_vkernel;
107 		}
108 		lwkt_reltoken(&p->p_token);
109 	}
110 
111 	if (curthread->td_vmm)
112 		return 0;
113 
114 	/*
115 	 * Create a new VMSPACE, disallow conflicting ids
116 	 */
117 	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
118 	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
119 	ve->id = uap->id;
120 	ve->refs = 0;		/* active refs (none) */
121 	ve->cache_refs = 1;	/* on-tree, not deleted (prevent kfree) */
122 	pmap_pinit2(vmspace_pmap(ve->vmspace));
123 
124 	lwkt_gettoken(&vkp->token);
125 	if (RB_INSERT(vmspace_rb_tree, &vkp->root, ve)) {
126 		vmspace_rel(ve->vmspace);
127 		ve->vmspace = NULL; /* safety */
128 		kfree(ve, M_VKERNEL);
129 		error = EEXIST;
130 	} else {
131 		error = 0;
132 	}
133 	lwkt_reltoken(&vkp->token);
134 
135 	return (error);
136 }
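
/*
 * Illustrative userland sketch, not part of this file: the create/destroy
 * cycle as documented above.  It assumes the usual userland syscall stubs
 * vmspace_create()/vmspace_destroy() and that the vkernel_enable tunable
 * checked above is non-zero; error handling beyond the cases documented
 * here is omitted.
 *
 *	static char vmspace_id;			// any unique non-NULL pointer
 *
 *	// type and data must currently be 0/NULL
 *	if (vmspace_create(&vmspace_id, 0, NULL) < 0) {
 *		// EOPNOTSUPP: vkernel support disabled
 *		// EEXIST:     id already registered by this process
 *		err(1, "vmspace_create");
 *	}
 *	...
 *	vmspace_destroy(&vmspace_id);		// ENOENT if never created
 */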
137 
138 /*
139  * Destroy a VMSPACE given its identifier.
140  *
141  * No requirements.
142  */
143 int
144 sys_vmspace_destroy(struct vmspace_destroy_args *uap)
145 {
146 	struct vkernel_proc *vkp;
147 	struct vmspace_entry *ve;
148 	int error;
149 
150 	if ((vkp = curproc->p_vkernel) == NULL)
151 		return EINVAL;
152 
153 	/*
154 	 * vkp->token protects the deletion against a new RB tree search.
155 	 */
156 	lwkt_gettoken(&vkp->token);
157 	error = ENOENT;
158 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 1)) != NULL) {
159 		error = vmspace_entry_delete(ve, vkp, 1);
160 		if (error == 0)
161 			vmspace_entry_cache_drop(ve);
162 	}
163 	lwkt_reltoken(&vkp->token);
164 
165 	return(error);
166 }
167 
168 /*
169  * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
170  *		struct vextframe *vframe);
171  *
172  * Transfer control to a VMSPACE.  Control is returned after the specified
173  * number of microseconds or if a page fault, signal, trap, or system call
174  * occurs.  The context is updated as appropriate.
175  *
176  * No requirements.
177  */
178 int
179 sys_vmspace_ctl(struct vmspace_ctl_args *uap)
180 {
181 	struct vkernel_proc *vkp;
182 	struct vkernel_lwp *vklp;
183 	struct vmspace_entry *ve = NULL;
184 	struct lwp *lp;
185 	struct proc *p;
186 	int framesz;
187 	int error;
188 
189 	lp = curthread->td_lwp;
190 	p = lp->lwp_proc;
191 
192 	if ((vkp = p->p_vkernel) == NULL)
193 		return (EINVAL);
194 
195 	/*
196 	 * ve only matters when VMM is not used.
197 	 */
198 	if (curthread->td_vmm == NULL) {
199 		if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
200 			error = ENOENT;
201 			goto done;
202 		}
203 	}
204 
205 	switch(uap->cmd) {
206 	case VMSPACE_CTL_RUN:
207 		/*
208 		 * Save the caller's register context, swap VM spaces, and
209 		 * install the passed register context.  Return with
210 		 * EJUSTRETURN so the syscall code doesn't adjust the context.
211 		 */
212 		framesz = sizeof(struct trapframe);
213 		if ((vklp = lp->lwp_vkernel) == NULL) {
214 			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
215 				       M_WAITOK|M_ZERO);
216 			lp->lwp_vkernel = vklp;
217 		}
218 		if (ve && vklp->ve_cache != ve) {
219 			vmspace_entry_cache_ref(ve);
220 			if (vklp->ve_cache)
221 				vmspace_entry_cache_drop(vklp->ve_cache);
222 			vklp->ve_cache = ve;
223 		}
224 		vklp->user_trapframe = uap->tframe;
225 		vklp->user_vextframe = uap->vframe;
226 		bcopy(uap->sysmsg_frame, &vklp->save_trapframe, framesz);
227 		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
228 		      sizeof(vklp->save_vextframe.vx_tls));
229 		error = copyin(uap->tframe, uap->sysmsg_frame, framesz);
230 		if (error == 0) {
231 			error = copyin(&uap->vframe->vx_tls,
232 				       &curthread->td_tls,
233 				       sizeof(struct savetls));
234 		}
235 		if (error == 0)
236 			error = cpu_sanitize_frame(uap->sysmsg_frame);
237 		if (error == 0)
238 			error = cpu_sanitize_tls(&curthread->td_tls);
239 		if (error) {
240 			bcopy(&vklp->save_trapframe, uap->sysmsg_frame,
241 			      framesz);
242 			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
243 			      sizeof(vklp->save_vextframe.vx_tls));
244 			set_user_TLS();
245 		} else {
246 			/*
247 			 * If it's a VMM thread just set the CR3.  We also
248 			 * set vklp->ve to a non-NULL key so we can tell
249 			 * whether a vkernel user process is currently
250 			 * running (vklp->ve is NULL when it is not).
251 			 */
252 			if (curthread->td_vmm == NULL) {
253 				vklp->ve = ve;
254 				atomic_add_int(&ve->refs, 1);
255 				pmap_setlwpvm(lp, ve->vmspace);
256 			} else {
257 				vklp->ve = uap->id;
258 				vmm_vm_set_guest_cr3((register_t)uap->id);
259 			}
260 			set_user_TLS();
261 			set_vkernel_fp(uap->sysmsg_frame);
262 			error = EJUSTRETURN;
263 		}
264 		break;
265 	default:
266 		error = EOPNOTSUPP;
267 		break;
268 	}
269 done:
270 	if (ve)
271 		vmspace_entry_drop(ve);
272 
273 	return(error);
274 }
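
/*
 * Illustrative userland sketch, not part of this file: the VMSPACE_CTL_RUN
 * round trip as documented above.  The trapframe/vextframe contents are
 * machine dependent and the helpers named here are hypothetical.  On
 * success the syscall "returns" by way of vkernel_trap(): the next fault,
 * signal or system call taken inside the guest vmspace restores the saved
 * frame and comes back here with the guest's registers copied into tf.
 *
 *	struct trapframe tf;		// guest register context
 *	struct vextframe vxf;		// guest TLS context
 *
 *	setup_guest_context(&tf, &vxf);	// hypothetical helper
 *	for (;;) {
 *		vmspace_ctl(&vmspace_id, VMSPACE_CTL_RUN, &tf, &vxf);
 *		handle_guest_exit(&tf);	// hypothetical: dispatch the exit
 *	}
 */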
275 
276 /*
277  * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
278  *
279  * map memory within a VMSPACE.  This function is just like a normal mmap()
280  * but operates on the vmspace's memory map.  Most callers use this to create
281  * a MAP_VPAGETABLE mapping.
282  *
283  * No requirements.
284  */
285 int
286 sys_vmspace_mmap(struct vmspace_mmap_args *uap)
287 {
288 	struct vkernel_proc *vkp;
289 	struct vmspace_entry *ve;
290 	int error;
291 
292 	if ((vkp = curproc->p_vkernel) == NULL) {
293 		error = EINVAL;
294 		goto done2;
295 	}
296 
297 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
298 		error = ENOENT;
299 		goto done2;
300 	}
301 
302 	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
303 			  uap->prot, uap->flags,
304 			  uap->fd, uap->offset, &uap->sysmsg_resultp);
305 
306 	vmspace_entry_drop(ve);
307 done2:
308 	return (error);
309 }
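
/*
 * Illustrative userland sketch, not part of this file: giving a guest
 * vmspace its "physical" memory.  The return convention is assumed to
 * follow mmap(2); MAP_VPAGETABLE is the flag a virtual kernel typically
 * adds so the mapping can later be controlled via vmspace_mcontrol().
 *
 *	void *gpa = NULL;			// let the kernel pick a base
 *	size_t ram = 64UL * 1024 * 1024;	// example guest RAM size
 *
 *	gpa = vmspace_mmap(&vmspace_id, gpa, ram, PROT_READ | PROT_WRITE,
 *			   MAP_ANON | MAP_VPAGETABLE, -1, 0);
 *	if (gpa == MAP_FAILED)
 *		err(1, "vmspace_mmap");
 */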
310 
311 /*
312  * vmspace_munmap(id, addr, len)
313  *
314  * unmap memory within a VMSPACE.
315  *
316  * No requirements.
317  */
318 int
319 sys_vmspace_munmap(struct vmspace_munmap_args *uap)
320 {
321 	struct vkernel_proc *vkp;
322 	struct vmspace_entry *ve;
323 	vm_offset_t addr;
324 	vm_offset_t tmpaddr;
325 	vm_size_t size, pageoff;
326 	vm_map_t map;
327 	int error;
328 
329 	if ((vkp = curproc->p_vkernel) == NULL) {
330 		error = EINVAL;
331 		goto done2;
332 	}
333 
334 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
335 		error = ENOENT;
336 		goto done2;
337 	}
338 
339 	/*
340 	 * NOTE: The unmap below can block, so the ref on ve acquired by
341 	 *	 vkernel_find_vmspace() above must be held across it.
342 	 */
343 
344 	/*
345 	 * Copied from sys_munmap()
346 	 */
347 	addr = (vm_offset_t)uap->addr;
348 	size = uap->len;
349 
350 	pageoff = (addr & PAGE_MASK);
351 	addr -= pageoff;
352 	size += pageoff;
353 	size = (vm_size_t)round_page(size);
354 	if (size < uap->len) {		/* wrap */
355 		error = EINVAL;
356 		goto done1;
357 	}
358 	tmpaddr = addr + size;		/* workaround gcc4 opt */
359 	if (tmpaddr < addr) {		/* wrap */
360 		error = EINVAL;
361 		goto done1;
362 	}
363 	if (size == 0) {
364 		error = 0;
365 		goto done1;
366 	}
367 
368 	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
369 		error = EINVAL;
370 		goto done1;
371 	}
372 	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
373 		error = EINVAL;
374 		goto done1;
375 	}
376 	map = &ve->vmspace->vm_map;
377 	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
378 		error = EINVAL;
379 		goto done1;
380 	}
381 	vm_map_remove(map, addr, addr + size);
382 	error = 0;
383 done1:
384 	vmspace_entry_drop(ve);
385 done2:
386 	return (error);
387 }
388 
389 /*
390  * vmspace_pread(id, buf, nbyte, flags, offset)
391  *
392  * Read data from a vmspace.  The number of bytes read is returned or
393  * -1 if an unrecoverable error occured.  If the number of bytes read is
394  * less then the request size, a page fault occured in the VMSPACE which
395  * the caller must resolve in order to proceed.
396  *
397  * (not implemented yet)
398  * No requirements.
399  */
400 int
401 sys_vmspace_pread(struct vmspace_pread_args *uap)
402 {
403 	struct vkernel_proc *vkp;
404 	struct vmspace_entry *ve;
405 	int error;
406 
407 	if ((vkp = curproc->p_vkernel) == NULL) {
408 		error = EINVAL;
409 		goto done3;
410 	}
411 
412 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
413 		error = ENOENT;
414 		goto done3;
415 	}
416 	vmspace_entry_drop(ve);
417 	error = EINVAL;
418 done3:
419 	return (error);
420 }
421 
422 /*
423  * vmspace_pwrite(id, buf, nbyte, flags, offset)
424  *
425  * Write data to a vmspace.  The number of bytes written is returned or
426  * -1 if an unrecoverable error occured.  If the number of bytes written is
427  * less then the request size, a page fault occured in the VMSPACE which
428  * the caller must resolve in order to proceed.
429  *
430  * (not implemented yet)
431  * No requirements.
432  */
433 int
434 sys_vmspace_pwrite(struct vmspace_pwrite_args *uap)
435 {
436 	struct vkernel_proc *vkp;
437 	struct vmspace_entry *ve;
438 	int error;
439 
440 	if ((vkp = curproc->p_vkernel) == NULL) {
441 		error = EINVAL;
442 		goto done3;
443 	}
444 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
445 		error = ENOENT;
446 		goto done3;
447 	}
448 	vmspace_entry_drop(ve);
449 	error = EINVAL;
450 done3:
451 	return (error);
452 }
453 
454 /*
455  * vmspace_mcontrol(id, addr, len, behav, value)
456  *
457  * madvise/mcontrol support for a vmspace.
458  *
459  * No requirements.
460  */
461 int
462 sys_vmspace_mcontrol(struct vmspace_mcontrol_args *uap)
463 {
464 	struct vkernel_proc *vkp;
465 	struct vmspace_entry *ve;
466 	struct lwp *lp;
467 	vm_offset_t start, end;
468 	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
469 	int error;
470 
471 	lp = curthread->td_lwp;
472 	if ((vkp = curproc->p_vkernel) == NULL) {
473 		error = EINVAL;
474 		goto done3;
475 	}
476 
477 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
478 		error = ENOENT;
479 		goto done3;
480 	}
481 
482 	/*
483 	 * This code is basically copied from sys_mcontrol()
484 	 */
485 	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
486 		error = EINVAL;
487 		goto done1;
488 	}
489 
490 	if (tmpaddr < (vm_offset_t)uap->addr) {
491 		error = EINVAL;
492 		goto done1;
493 	}
494 	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
495 		error = EINVAL;
496 		goto done1;
497 	}
498 	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
499 		error = EINVAL;
500 		goto done1;
501 	}
502 
503 	start = trunc_page((vm_offset_t) uap->addr);
504 	end = round_page(tmpaddr);
505 
506 	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
507 				uap->behav, uap->value);
508 done1:
509 	vmspace_entry_drop(ve);
510 done3:
511 	return (error);
512 }
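
/*
 * Illustrative userland sketch, not part of this file: the main consumer
 * of this syscall is a virtual kernel pointing a MAP_VPAGETABLE mapping at
 * its emulated page table and invalidating it after guest page table
 * updates.  The page-directory value and VPTE bits are assumptions for the
 * sketch; the MADV_SETMAP/MADV_INVAL semantics follow mcontrol().
 *
 *	// Install the guest page table root for the whole mapping ...
 *	vmspace_mcontrol(&vmspace_id, gpa, ram, MADV_SETMAP,
 *			 guest_pdir | VPTE_V);		// hypothetical root
 *	// ... and invalidate cached translations after the guest edits it.
 *	vmspace_mcontrol(&vmspace_id, gpa, ram, MADV_INVAL, 0);
 */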
513 
514 /*
515  * Red black tree functions
516  */
517 static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
518 RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);
519 
520 /*
521  * Compare two vmspace entries by their user-supplied id.  Only the id
522  * field of the key entry needs to be initialized.
523  *
524  * The caller must hold vkp->token.
525  */
526 static int
527 rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
528 {
529         if ((char *)a->id < (char *)b->id)
530                 return(-1);
531         else if ((char *)a->id > (char *)b->id)
532                 return(1);
533         return(0);
534 }
535 
536 /*
537  * The caller must hold vkp->token.
538  */
539 static
540 int
541 rb_vmspace_delete(struct vmspace_entry *ve, void *data)
542 {
543 	struct vkernel_proc *vkp = data;
544 
545 	if (vmspace_entry_delete(ve, vkp, 0) == 0)
546 		vmspace_entry_cache_drop(ve);
547 	else
548 		panic("rb_vmspace_delete: invalid refs %d", ve->refs);
549 	return(0);
550 }
551 
552 /*
553  * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
554  * up the pmap, the vm_map, then destroy the vmspace.  We gain control of
555  * the associated cache_refs ref, which the caller will drop for us.
556  *
557  * The ve must not have any active references other than those from the
558  * caller.  If it does, EBUSY is returned.  The ve may still maintain
559  * any number of cache references which will drop as the related LWPs
560  * execute vmspace operations or exit.
561  *
562  * 0 is returned on success, EBUSY on failure.  On success the caller must
563  * drop the last cache_refs.  We have dropped the caller's active refs.
564  *
565  * The caller must hold vkp->token.
566  */
567 static
568 int
569 vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp,
570 		     int refs)
571 {
572 	/*
573 	 * Interlocked by vkp->token.
574 	 *
575 	 * Drop the caller's refs and set VKE_REF_DELETED atomically, if
576 	 * the remaining refs match exactly.  Dropping refs and setting
577 	 * the DELETED flag atomically protects other threads from trying
578 	 * to use the ve.
579 	 *
580 	 * The caller now owns the final cache_ref that was previously
581 	 * associated with the live state of the ve.
582 	 */
583 	if (atomic_cmpset_int(&ve->refs, refs, VKE_REF_DELETED) == 0) {
584 		KKASSERT(ve->refs >= refs);
585 		return EBUSY;
586 	}
587 	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);
588 
589 	pmap_remove_pages(vmspace_pmap(ve->vmspace),
590 			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
591 	vm_map_remove(&ve->vmspace->vm_map,
592 			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
593 	vmspace_rel(ve->vmspace);
594 	ve->vmspace = NULL; /* safety */
595 
596 	return 0;
597 }
598 
599 /*
600  * Ref a ve for cache purposes
601  */
602 static
603 void
604 vmspace_entry_cache_ref(struct vmspace_entry *ve)
605 {
606 	atomic_add_int(&ve->cache_refs, 1);
607 }
608 
609 /*
610  * The ve cache_drop is the final word for a ve.  cache_refs carries an
611  * extra ref representing the ve being on the RB tree and not being in a
612  * deleted state.  Removal from the RB tree and deletion manipulate this
613  * ref.  The last drop will thus include full deletion of the ve in
614  * addition to the last cached user going away.
615  */
616 static
617 void
618 vmspace_entry_cache_drop(struct vmspace_entry *ve)
619 {
620 	if (atomic_fetchadd_int(&ve->cache_refs, -1) == 1) {
621 		KKASSERT(ve->refs & VKE_REF_DELETED);
622 		kfree(ve, M_VKERNEL);
623 	}
624 }
625 
626 /*
627  * Drop primary reference.  The ve cannot be freed on the 1->0 transition.
628  * Instead, ve deletion interlocks the final kfree() via cache_refs.
629  */
630 static
631 void
632 vmspace_entry_drop(struct vmspace_entry *ve)
633 {
634 	atomic_fetchadd_int(&ve->refs, -1);
635 }
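
/*
 * Reference model recap for the two counters above: ve->refs counts active
 * users and carries VKE_REF_DELETED once vmspace_entry_delete() has run;
 * ve->cache_refs starts at 1 for "on the RB tree, not deleted" and gains
 * one ref per LWP ve_cache pointer.  Only the final cache_drop performs
 * the kfree(), so a cached but already-deleted ve stays valid until every
 * LWP drops it (vkernel_lwp_exit() or the stale-cache path in
 * vkernel_find_vmspace()).
 */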
636 
637 /*
638  * Locate the ve for (id), return the ve or NULL.  If found this function
639  * will bump ve->refs which prevents the ve from being immediately destroyed
640  * (but it can still be removed).
641  *
642  * The cache can potentially contain a stale ve, detected via VKE_REF_DELETED.
643  *
644  * The caller must hold vkp->token if excl is non-zero.
645  */
646 static
647 struct vmspace_entry *
648 vkernel_find_vmspace(struct vkernel_proc *vkp, void *id, int excl)
649 {
650 	struct vmspace_entry *ve;
651 	struct vmspace_entry key;
652 	struct vkernel_lwp *vklp;
653 	struct lwp *lp = curthread->td_lwp;
654 
655 	/*
656 	 * Cache check.  Since we already hold a ref on the cache entry
657 	 * the ve cannot be ripped out from under us while we cycle
658 	 * ve->refs.
659 	 */
660 	if ((vklp = lp->lwp_vkernel) != NULL) {
661 		ve = vklp->ve_cache;
662 		if (ve && ve->id == id) {
663 			uint32_t n;
664 
665 			/*
666 			 * Bump active refs, check to see if the cache
667 			 * entry is stale.  If not, we are good.
668 			 */
669 			n = atomic_fetchadd_int(&ve->refs, 1);
670 			if ((n & VKE_REF_DELETED) == 0) {
671 				KKASSERT(ve->vmspace);
672 				return ve;
673 			}
674 
675 			/*
676 			 * Cache is stale, clean it out and fall through
677 			 * to a normal search.
678 			 */
679 			vklp->ve_cache = NULL;
680 			vmspace_entry_drop(ve);
681 			vmspace_entry_cache_drop(ve);
682 		}
683 	}
684 
685 	/*
686 	 * Normal search protected by vkp->token.  No new ve's can be marked
687 	 * DELETED while we hold the token so we are safe.
688 	 */
689 	if (excl == 0)
690 		lwkt_gettoken_shared(&vkp->token);
691 	key.id = id;
692 	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
693 	if (ve) {
694 		if (atomic_fetchadd_int(&ve->refs, 1) & VKE_REF_DELETED) {
695 			vmspace_entry_drop(ve);
696 			ve = NULL;
697 		}
698 	}
699 	if (excl == 0)
700 		lwkt_reltoken(&vkp->token);
701 	return (ve);
702 }
703 
704 /*
705  * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
706  * a vkernel process.
707  *
708  * No requirements.
709  */
710 void
711 vkernel_inherit(struct proc *p1, struct proc *p2)
712 {
713 	struct vkernel_proc *vkp;
714 
715 	vkp = p1->p_vkernel;
716 	KKASSERT(vkp->refs > 0);
717 	atomic_add_int(&vkp->refs, 1);
718 	p2->p_vkernel = vkp;
719 }
720 
721 /*
722  * No requirements.
723  */
724 void
725 vkernel_exit(struct proc *p)
726 {
727 	struct vkernel_proc *vkp;
728 	struct lwp *lp;
729 
730 	vkp = p->p_vkernel;
731 
732 	/*
733 	 * Restore the original VM context if we are killed while running
734 	 * a different one.
735 	 *
736 	 * This isn't supposed to happen.  What is supposed to happen is
737 	 * that the process should enter vkernel_trap() before handling
738 	 * the signal.
739 	 */
740 	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
741 		vkernel_lwp_exit(lp);
742 	}
743 
744 	/*
745 	 * Dereference the common area
746 	 */
747 	p->p_vkernel = NULL;
748 	KKASSERT(vkp->refs > 0);
749 
750 	if (atomic_fetchadd_int(&vkp->refs, -1) == 1) {
751 		lwkt_gettoken(&vkp->token);
752 		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
753 			rb_vmspace_delete, vkp);
754 		lwkt_reltoken(&vkp->token);
755 		kfree(vkp, M_VKERNEL);
756 	}
757 }
758 
759 /*
760  * No requirements.
761  */
762 void
763 vkernel_lwp_exit(struct lwp *lp)
764 {
765 	struct vkernel_lwp *vklp;
766 	struct vmspace_entry *ve;
767 
768 	if ((vklp = lp->lwp_vkernel) != NULL) {
769 		if (lp->lwp_thread->td_vmm == NULL) {
770 			/*
771 			 * vkernel thread
772 			 */
773 			if ((ve = vklp->ve) != NULL) {
774 				kprintf("Warning, pid %d killed with "
775 					"active VC!\n", lp->lwp_proc->p_pid);
776 				pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
777 				vklp->ve = NULL;
778 				KKASSERT(ve->refs > 0);
779 				vmspace_entry_drop(ve);
780 			}
781 		} else {
782 			/*
783 			 * guest thread
784 			 */
785 			vklp->ve = NULL;
786 		}
787 		if ((ve = vklp->ve_cache) != NULL) {
788 			vklp->ve_cache = NULL;
789 			vmspace_entry_cache_drop(ve);
790 		}
791 
792 		lp->lwp_vkernel = NULL;
793 		kfree(vklp, M_VKERNEL);
794 	}
795 }
796 
797 /*
798  * A VM space under virtual kernel control trapped out or made a system call
799  * or otherwise needs to return control to the virtual kernel context.
800  *
801  * No requirements.
802  */
803 void
804 vkernel_trap(struct lwp *lp, struct trapframe *frame)
805 {
806 	struct proc *p = lp->lwp_proc;
807 	struct vmspace_entry *ve;
808 	struct vkernel_lwp *vklp;
809 	int error;
810 
811 	/*
812 	 * Which vmspace entry was running?
813 	 */
814 	vklp = lp->lwp_vkernel;
815 	KKASSERT(vklp);
816 
817 	/* If it's a VMM thread just set the vkernel CR3 back */
818 	if (curthread->td_vmm == NULL) {
819 		ve = vklp->ve;
820 		KKASSERT(ve != NULL);
821 
822 		/*
823 		 * Switch the LWP vmspace back to the virtual kernel's VM space.
824 		 */
825 		vklp->ve = NULL;
826 		pmap_setlwpvm(lp, p->p_vmspace);
827 		KKASSERT(ve->refs > 0);
828 		vmspace_entry_drop(ve);
829 		/* ve is invalid once we kill our ref */
830 	} else {
831 		vklp->ve = NULL;
832 		vmm_vm_set_guest_cr3(p->p_vkernel->vkernel_cr3);
833 	}
834 
835 	/*
836 	 * Copy the emulated process frame to the virtual kernel process.
837 	 * The emulated process cannot change TLS descriptors so don't
838 	 * bother saving them, we already have a copy.
839 	 *
840 	 * Restore the virtual kernel's saved context so the virtual kernel
841 	 * process can resume.
842 	 */
843 	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
844 	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
845 	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
846 	      sizeof(vklp->save_vextframe.vx_tls));
847 	set_user_TLS();
848 	cpu_vkernel_trap(frame, error);
849 }
850