xref: /dragonfly/sys/vm/vm_vmspace.c (revision 655933d6)
/*
 * (MPSAFE)
 *
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysmsg.h>
#include <sys/kern_syscall.h>
#include <sys/mman.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vkernel.h>
#include <sys/vmspace.h>

#include <vm/vm_extern.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>

static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
						  void *id, int havetoken);
static int vmspace_entry_delete(struct vmspace_entry *ve,
				 struct vkernel_proc *vkp, int refs);
static void vmspace_entry_cache_ref(struct vmspace_entry *ve);
static void vmspace_entry_cache_drop(struct vmspace_entry *ve);
static void vmspace_entry_drop(struct vmspace_entry *ve);

static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");

/*
 * vmspace_create (void *id, int type, void *data)
 *
 * Create a VMSPACE under the control of the caller with the specified id.
 * An id of NULL cannot be used.  The type and data fields must currently
 * be 0.
 *
 * The vmspace starts out completely empty.  Memory may be mapped into the
 * VMSPACE with vmspace_mmap().
 *
 * No requirements.
 */
int
sys_vmspace_create(struct sysmsg *sysmsg,
		   const struct vmspace_create_args *uap)
{
	struct vmspace_entry *ve;
	struct vkernel_proc *vkp;
	struct proc *p = curproc;
	int error;

	if (vkernel_enable == 0)
		return (EOPNOTSUPP);

	/*
	 * Create a virtual kernel side-structure for the process if one
	 * does not exist.
	 *
	 * Implement a simple resolution for SMP races.
	 */
	if ((vkp = p->p_vkernel) == NULL) {
		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
		lwkt_gettoken(&p->p_token);
		if (p->p_vkernel == NULL) {
			vkp->refs = 1;
			lwkt_token_init(&vkp->token, "vkernel");
			RB_INIT(&vkp->root);
			p->p_vkernel = vkp;
		} else {
			kfree(vkp, M_VKERNEL);
			vkp = p->p_vkernel;
		}
		lwkt_reltoken(&p->p_token);
	}

	/*
	 * Create a new VMSPACE, disallow conflicting ids
	 */
	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	ve->id = uap->id;
	ve->refs = 0;		/* active refs (none) */
	ve->cache_refs = 1;	/* on-tree, not deleted (prevent kfree) */
	pmap_pinit2(vmspace_pmap(ve->vmspace));

	lwkt_gettoken(&vkp->token);
	if (RB_INSERT(vmspace_rb_tree, &vkp->root, ve)) {
		vmspace_rel(ve->vmspace);
		ve->vmspace = NULL; /* safety */
		kfree(ve, M_VKERNEL);
		error = EEXIST;
	} else {
		error = 0;
	}
	lwkt_reltoken(&vkp->token);

	return (error);
}
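
/*
 * Illustrative userland sketch (not taken from this file, error handling
 * omitted): a virtual kernel typically creates a vmspace, populates it,
 * and then runs it.  The argument lists below are the ones documented in
 * the function headers in this file; id, addr, tframe, etc. are
 * placeholder variables.
 *
 *	vmspace_create(id, 0, NULL);
 *	vmspace_mmap(id, addr, len, prot, flags, fd, offset);
 *	vmspace_ctl(id, VMSPACE_CTL_RUN, &tframe, &vextframe);
 */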

/*
 * Destroy a VMSPACE given its identifier.
 *
 * No requirements.
 */
int
sys_vmspace_destroy(struct sysmsg *sysmsg,
		    const struct vmspace_destroy_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL)
		return EINVAL;

	/*
	 * vkp->token protects the deletion against a new RB tree search.
	 */
	lwkt_gettoken(&vkp->token);
	error = ENOENT;
	if ((ve = vkernel_find_vmspace(vkp, uap->id, 1)) != NULL) {
		error = vmspace_entry_delete(ve, vkp, 1);
		if (error == 0)
			vmspace_entry_cache_drop(ve);
	}
	lwkt_reltoken(&vkp->token);

	return(error);
}

/*
 * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
 *		struct vextframe *vframe);
 *
 * Transfer control to a VMSPACE.  Control is returned after the specified
 * number of microseconds or if a page fault, signal, trap, or system call
 * occurs.  The context is updated as appropriate.
 *
 * No requirements.
 */
int
sys_vmspace_ctl(struct sysmsg *sysmsg,
		const struct vmspace_ctl_args *uap)
{
	struct vmspace_ctl_args ua = *uap;
	struct vkernel_proc *vkp;
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve = NULL;
	struct lwp *lp;
	struct proc *p;
	int framesz;
	int error;

	lp = curthread->td_lwp;
	p = lp->lwp_proc;

	if ((vkp = p->p_vkernel) == NULL)
		return (EINVAL);

	/*
	 * NOTE: We have to copy *uap into ua because uap is an aliased
	 *	 pointer into the sysframe, which we are replacing.
	 */
	if ((ve = vkernel_find_vmspace(vkp, ua.id, 0)) == NULL) {
		error = ENOENT;
		goto done;
	}

	switch(ua.cmd) {
	case VMSPACE_CTL_RUN:
		/*
		 * Save the caller's register context, swap VM spaces, and
		 * install the passed register context.  Return with
		 * EJUSTRETURN so the syscall code doesn't adjust the context.
		 */
		framesz = sizeof(struct trapframe);
		if ((vklp = lp->lwp_vkernel) == NULL) {
			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
				       M_WAITOK|M_ZERO);
			lp->lwp_vkernel = vklp;
		}
		if (ve && vklp->ve_cache != ve) {
			vmspace_entry_cache_ref(ve);
			if (vklp->ve_cache)
				vmspace_entry_cache_drop(vklp->ve_cache);
			vklp->ve_cache = ve;
		}
		vklp->user_trapframe = ua.tframe;
		vklp->user_vextframe = ua.vframe;
		bcopy(sysmsg->sysmsg_frame, &vklp->save_trapframe, framesz);
		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
		      sizeof(vklp->save_vextframe.vx_tls));
		error = copyin(ua.tframe, sysmsg->sysmsg_frame, framesz);
		if (error == 0) {
			error = copyin(&ua.vframe->vx_tls,
				       &curthread->td_tls,
				       sizeof(struct savetls));
		}
		if (error == 0)
			error = cpu_sanitize_frame(sysmsg->sysmsg_frame);
		if (error == 0)
			error = cpu_sanitize_tls(&curthread->td_tls);
		if (error) {
			bcopy(&vklp->save_trapframe, sysmsg->sysmsg_frame,
			      framesz);
			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
			      sizeof(vklp->save_vextframe.vx_tls));
			set_user_TLS();
		} else {
			vklp->ve = ve;
			atomic_add_int(&ve->refs, 1);
			pmap_setlwpvm(lp, ve->vmspace);
			set_user_TLS();
			set_vkernel_fp(sysmsg->sysmsg_frame);
			error = EJUSTRETURN;
		}
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
done:
	if (ve)
		vmspace_entry_drop(ve);

	return(error);
}
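
/*
 * Note: on a successful VMSPACE_CTL_RUN the lwp records the entry in
 * vklp->ve, gains an active ref on it, and is switched onto ve->vmspace
 * via pmap_setlwpvm().  That ref is held for as long as the emulated
 * context runs and is dropped again in vkernel_trap() (or in
 * vkernel_lwp_exit()) when control returns to the virtual kernel.
 */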

/*
 * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
 *
 * Map memory within a VMSPACE.  This function is just like a normal mmap()
 * but operates on the vmspace's memory map.
 *
 * No requirements.
 */
int
sys_vmspace_mmap(struct sysmsg *sysmsg,
		 const struct vmspace_mmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done2;
	}

	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done2;
	}

	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->offset, &sysmsg->sysmsg_resultp);

	vmspace_entry_drop(ve);
done2:
	return (error);
}

/*
 * vmspace_munmap(id, addr, len)
 *
 * Unmap memory within a VMSPACE.
 *
 * No requirements.
 */
int
sys_vmspace_munmap(struct sysmsg *sysmsg,
		   const struct vmspace_munmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done2;
	}

	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done2;
	}

	/*
	 * NOTE: kern_munmap() can block so we need to temporarily
	 *	 ref ve->refs.
	 */

	/*
	 * Copied from sys_munmap()
	 */
	addr = (vm_offset_t)uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t)round_page(size);
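	/*
	 * Worked example of the rounding above, assuming 4KiB pages:
	 * addr 0x1234, len 0x100 yields pageoff 0x234, addr 0x1000 and
	 * size round_page(0x334) = 0x1000, i.e. the request is widened to
	 * whole pages.  The checks below reject length/address overflow.
	 */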
	if (size < uap->len) {		/* wrap */
		error = EINVAL;
		goto done1;
	}
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr) {		/* wrap */
		error = EINVAL;
		goto done1;
	}
	if (size == 0) {
		error = 0;
		goto done1;
	}

	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}
	map = &ve->vmspace->vm_map;
	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
		error = EINVAL;
		goto done1;
	}
	vm_map_remove(map, addr, addr + size);
	error = 0;
done1:
	vmspace_entry_drop(ve);
done2:
	return (error);
}

/*
 * vmspace_pread(id, buf, nbyte, flags, offset)
 *
 * Read data from a vmspace.  The number of bytes read is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes read is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 * No requirements.
 */
int
sys_vmspace_pread(struct sysmsg *sysmsg,
		  const struct vmspace_pread_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}

	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done3;
	}
	vmspace_entry_drop(ve);
	error = EINVAL;
done3:
	return (error);
}

/*
 * vmspace_pwrite(id, buf, nbyte, flags, offset)
 *
 * Write data to a vmspace.  The number of bytes written is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes written is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 * No requirements.
 */
int
sys_vmspace_pwrite(struct sysmsg *sysmsg,
		   const struct vmspace_pwrite_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done3;
	}
	vmspace_entry_drop(ve);
	error = EINVAL;
done3:
	return (error);
}

/*
 * vmspace_mcontrol(id, addr, len, behav, value)
 *
 * madvise/mcontrol support for a vmspace.
 *
 * No requirements.
 */
int
sys_vmspace_mcontrol(struct sysmsg *sysmsg,
		     const struct vmspace_mcontrol_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	struct lwp *lp;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	lp = curthread->td_lwp;
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}

	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done3;
	}

	/*
	 * This code is basically copied from sys_mcontrol()
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
		error = EINVAL;
		goto done1;
	}

	if (tmpaddr < (vm_offset_t)uap->addr) {
		error = EINVAL;
		goto done1;
	}
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}

	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
				uap->behav, uap->value);
done1:
	vmspace_entry_drop(ve);
done3:
	return (error);
}

/*
 * Red black tree functions
 */
static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);

/*
 * a->id is the address being compared and is the only field that must be
 * initialized in the lookup key.
 *
 * The caller must hold vkp->token.
 */
static int
rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
{
	if ((char *)a->id < (char *)b->id)
		return(-1);
	else if ((char *)a->id > (char *)b->id)
		return(1);
	return(0);
}

/*
 * The caller must hold vkp->token.
 */
static
int
rb_vmspace_delete(struct vmspace_entry *ve, void *data)
{
	struct vkernel_proc *vkp = data;

	if (vmspace_entry_delete(ve, vkp, 0) == 0)
		vmspace_entry_cache_drop(ve);
	else
		panic("rb_vmspace_delete: invalid refs %d", ve->refs);
	return(0);
}

/*
 * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
 * up the pmap, the vm_map, then destroy the vmspace.  We gain control of
 * the associated cache_refs ref, which the caller will drop for us.
 *
 * The ve must not have any active references other than those from the
 * caller.  If it does, EBUSY is returned.  The ve may still maintain
 * any number of cache references which will drop as the related LWPs
 * execute vmspace operations or exit.
 *
 * 0 is returned on success, EBUSY on failure.  On success the caller must
 * drop the last cache_refs.  We have dropped the caller's active refs.
 *
 * The caller must hold vkp->token.
 */
static
int
vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp,
		     int refs)
{
	/*
	 * Interlocked by vkp->token.
	 *
	 * Drop the caller's refs and set VKE_REF_DELETED atomically, if
	 * the remaining refs match exactly.  Dropping refs and setting
	 * the DELETED flag atomically protects other threads from trying
	 * to use the ve.
	 *
	 * The caller now owns the final cache_ref that was previously
	 * associated with the live state of the ve.
	 */
	if (atomic_cmpset_int(&ve->refs, refs, VKE_REF_DELETED) == 0) {
		KKASSERT(ve->refs >= refs);
		return EBUSY;
	}
	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);

	pmap_remove_pages(vmspace_pmap(ve->vmspace),
			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	vm_map_remove(&ve->vmspace->vm_map,
			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	vmspace_rel(ve->vmspace);
	ve->vmspace = NULL; /* safety */

	return 0;
}

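/*
 * Summary of the two reference counters (see vmspace_entry_delete() above
 * and the helpers below for the authoritative rules):
 *
 *	ve->refs	- active users currently operating on the vmspace.
 *			  vmspace_entry_delete() folds the VKE_REF_DELETED
 *			  flag into this field to mark the entry dead.
 *
 *	ve->cache_refs	- one ref per lwp caching the entry in ve_cache,
 *			  plus one ref representing a live, on-tree entry.
 *			  The final cache_drop performs the kfree().
 */
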
/*
 * Ref a ve for cache purposes
 */
static
void
vmspace_entry_cache_ref(struct vmspace_entry *ve)
{
	atomic_add_int(&ve->cache_refs, 1);
}

/*
 * The ve cache_drop is the final word for a ve.  It gains an extra ref
 * representing it being on the RB tree and not being in a deleted state.
 * Removal from the RB tree and deletion manipulate this ref.  The last
 * drop will thus include full deletion of the ve in addition to the last
 * cached user going away.
 */
static
void
vmspace_entry_cache_drop(struct vmspace_entry *ve)
{
	if (atomic_fetchadd_int(&ve->cache_refs, -1) == 1) {
		KKASSERT(ve->refs & VKE_REF_DELETED);
		kfree(ve, M_VKERNEL);
	}
}

/*
 * Drop primary reference.  The ve cannot be freed on the 1->0 transition.
 * Instead, ve deletion interlocks the final kfree() via cache_refs.
 */
static
void
vmspace_entry_drop(struct vmspace_entry *ve)
{
	atomic_fetchadd_int(&ve->refs, -1);
}

/*
 * Locate the ve for (id), return the ve or NULL.  If found this function
 * will bump ve->refs which prevents the ve from being immediately destroyed
 * (but it can still be removed).
 *
 * The cache can potentially contain a stale ve, check by testing ve->vmspace.
 *
 * The caller must hold vkp->token if excl is non-zero.
 */
static
struct vmspace_entry *
vkernel_find_vmspace(struct vkernel_proc *vkp, void *id, int excl)
{
	struct vmspace_entry *ve;
	struct vmspace_entry key;
	struct vkernel_lwp *vklp;
	struct lwp *lp = curthread->td_lwp;

	/*
	 * Cache check.  Since we already hold a ref on the cache entry
	 * the ve cannot be ripped out from under us while we cycle
	 * ve->refs.
	 */
	if ((vklp = lp->lwp_vkernel) != NULL) {
		ve = vklp->ve_cache;
		if (ve && ve->id == id) {
			uint32_t n;

			/*
			 * Bump active refs, check to see if the cache
			 * entry is stale.  If not, we are good.
			 */
			n = atomic_fetchadd_int(&ve->refs, 1);
			if ((n & VKE_REF_DELETED) == 0) {
				KKASSERT(ve->vmspace);
				return ve;
			}

			/*
			 * Cache is stale, clean it out and fall through
			 * to a normal search.
			 */
			vklp->ve_cache = NULL;
			vmspace_entry_drop(ve);
			vmspace_entry_cache_drop(ve);
		}
	}

	/*
	 * Normal search protected by vkp->token.  No new ve's can be marked
	 * DELETED while we hold the token so we are safe.
	 */
	if (excl == 0)
		lwkt_gettoken_shared(&vkp->token);
	key.id = id;
	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
	if (ve) {
		if (atomic_fetchadd_int(&ve->refs, 1) & VKE_REF_DELETED) {
			vmspace_entry_drop(ve);
			ve = NULL;
		}
	}
	if (excl == 0)
		lwkt_reltoken(&vkp->token);
	return (ve);
}

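/*
 * Note: the per-lwp ve_cache fast path above avoids acquiring vkp->token
 * when the same id is looked up repeatedly, e.g. for back-to-back
 * VMSPACE_CTL_RUN calls.  The cached cache_refs hold keeps the ve memory
 * valid, and VKE_REF_DELETED exposes stale entries so they can be lazily
 * discarded.
 */
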
/*
 * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
 * a vkernel process.
 *
 * No requirements.
 */
void
vkernel_inherit(struct proc *p1, struct proc *p2)
{
	struct vkernel_proc *vkp;

	vkp = p1->p_vkernel;
	KKASSERT(vkp->refs > 0);
	atomic_add_int(&vkp->refs, 1);
	p2->p_vkernel = vkp;
}

713  * No requirements.
714  */
715 void
716 vkernel_exit(struct proc *p)
717 {
718 	struct vkernel_proc *vkp;
719 	struct lwp *lp;
720 
721 	vkp = p->p_vkernel;
722 
723 	/*
724 	 * Restore the original VM context if we are killed while running
725 	 * a different one.
726 	 *
727 	 * This isn't supposed to happen.  What is supposed to happen is
728 	 * that the process should enter vkernel_trap() before the handling
729 	 * the signal.
730 	 */
731 	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
732 		vkernel_lwp_exit(lp);
733 	}
734 
735 	/*
736 	 * Dereference the common area
737 	 */
738 	p->p_vkernel = NULL;
739 	KKASSERT(vkp->refs > 0);
740 
741 	if (atomic_fetchadd_int(&vkp->refs, -1) == 1) {
742 		lwkt_gettoken(&vkp->token);
743 		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
744 			rb_vmspace_delete, vkp);
745 		lwkt_reltoken(&vkp->token);
746 		kfree(vkp, M_VKERNEL);
747 	}
748 }
749 
/*
 * No requirements.
 */
void
vkernel_lwp_exit(struct lwp *lp)
{
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;

	if ((vklp = lp->lwp_vkernel) != NULL) {
		/*
		 * vkernel thread
		 */
		if ((ve = vklp->ve) != NULL) {
			kprintf("Warning, pid %d killed with "
			    "active VC!\n", lp->lwp_proc->p_pid);
			pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
			vklp->ve = NULL;
			KKASSERT(ve->refs > 0);
			vmspace_entry_drop(ve);
		}
		if ((ve = vklp->ve_cache) != NULL) {
			vklp->ve_cache = NULL;
			vmspace_entry_cache_drop(ve);
		}

		lp->lwp_vkernel = NULL;
		kfree(vklp, M_VKERNEL);
	}
}

/*
 * A VM space under virtual kernel control trapped out or made a system call
 * or otherwise needs to return control to the virtual kernel context.
 *
 * No requirements.
 */
void
vkernel_trap(struct lwp *lp, struct trapframe *frame)
{
	struct proc *p = lp->lwp_proc;
	struct vmspace_entry *ve;
	struct vkernel_lwp *vklp;
	int error;

	/*
	 * Which vmspace entry was running?
	 */
	vklp = lp->lwp_vkernel;
	KKASSERT(vklp);

	ve = vklp->ve;
	KKASSERT(ve != NULL);

	/*
	 * Switch the LWP vmspace back to the virtual kernel's VM space.
	 */
	vklp->ve = NULL;
	pmap_setlwpvm(lp, p->p_vmspace);
	KKASSERT(ve->refs > 0);
	vmspace_entry_drop(ve);
	/* ve is invalid once we kill our ref */

	/*
	 * Copy the emulated process frame to the virtual kernel process.
	 * The emulated process cannot change TLS descriptors so don't
	 * bother saving them, we already have a copy.
	 *
	 * Restore the virtual kernel's saved context so the virtual kernel
	 * process can resume.
	 */
	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
	      sizeof(vklp->save_vextframe.vx_tls));
	set_user_TLS();
	cpu_vkernel_trap(frame, error);
}