xref: /dragonfly/sys/vm/vm_vmspace.c (revision e4adeac1)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 #include <sys/param.h>
38 #include <sys/kernel.h>
39 #include <sys/systm.h>
40 #include <sys/sysmsg.h>
41 #include <sys/kern_syscall.h>
42 #include <sys/mman.h>
43 #include <sys/thread.h>
44 #include <sys/proc.h>
45 #include <sys/malloc.h>
46 #include <sys/sysctl.h>
47 #include <sys/vkernel.h>
48 #include <sys/vmspace.h>
49 
50 #include <vm/vm_extern.h>
51 #include <vm/pmap.h>
52 
53 #include <machine/vmparam.h>
54 #include <machine/vmm.h>
55 
56 static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
57 						  void *id, int havetoken);
58 static int vmspace_entry_delete(struct vmspace_entry *ve,
59 				 struct vkernel_proc *vkp, int refs);
60 static void vmspace_entry_cache_ref(struct vmspace_entry *ve);
61 static void vmspace_entry_cache_drop(struct vmspace_entry *ve);
62 static void vmspace_entry_drop(struct vmspace_entry *ve);
63 
64 static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");
65 
66 /*
67  * vmspace_create (void *id, int type, void *data)
68  *
69  * Create a VMSPACE under the control of the caller with the specified id.
70  * An id of NULL cannot be used.  The type and data fields must currently
71  * be 0.
72  *
73  * The vmspace starts out completely empty.  Memory may be mapped into the
74  * VMSPACE with vmspace_mmap().
75  *
76  * No requirements.
77  */
78 int
79 sys_vmspace_create(struct sysmsg *sysmsg,
80 		   const struct vmspace_create_args *uap)
81 {
82 	struct vmspace_entry *ve;
83 	struct vkernel_proc *vkp;
84 	struct proc *p = curproc;
85 	int error;
86 
87 	if (vkernel_enable == 0)
88 		return (EOPNOTSUPP);
89 
90 	/*
91 	 * Create a virtual kernel side-structure for the process if one
92 	 * does not exist.
93 	 *
94 	 * Implement a simple resolution for SMP races.
95 	 */
96 	if ((vkp = p->p_vkernel) == NULL) {
97 		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
98 		lwkt_gettoken(&p->p_token);
99 		if (p->p_vkernel == NULL) {
100 			vkp->refs = 1;
101 			lwkt_token_init(&vkp->token, "vkernel");
102 			RB_INIT(&vkp->root);
103 			p->p_vkernel = vkp;
104 		} else {
105 			kfree(vkp, M_VKERNEL);
106 			vkp = p->p_vkernel;
107 		}
108 		lwkt_reltoken(&p->p_token);
109 	}
110 
111 	if (curthread->td_vmm)
112 		return 0;
113 
114 	/*
115 	 * Create a new VMSPACE, disallow conflicting ids
116 	 */
117 	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
118 	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
119 	ve->id = uap->id;
120 	ve->refs = 0;		/* active refs (none) */
121 	ve->cache_refs = 1;	/* on-tree, not deleted (prevent kfree) */
122 	pmap_pinit2(vmspace_pmap(ve->vmspace));
123 
124 	lwkt_gettoken(&vkp->token);
125 	if (RB_INSERT(vmspace_rb_tree, &vkp->root, ve)) {
126 		vmspace_rel(ve->vmspace);
127 		ve->vmspace = NULL; /* safety */
128 		kfree(ve, M_VKERNEL);
129 		error = EEXIST;
130 	} else {
131 		error = 0;
132 	}
133 	lwkt_reltoken(&vkp->token);
134 
135 	return (error);
136 }
137 
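/*
 * A minimal usage sketch, assuming userland wrappers that follow the
 * prototypes documented in this file; the id value and error handling
 * are illustrative only.
 *
 *	void *id = some_guest_context;			// any non-NULL key
 *
 *	if (vmspace_create(id, 0, NULL) < 0)		// type/data must be 0
 *		err(1, "vmspace_create");
 *	...
 *	if (vmspace_destroy(id) < 0)
 *		err(1, "vmspace_destroy");
 */
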
138 /*
139  * Destroy a VMSPACE given its identifier.
140  *
141  * No requirements.
142  */
143 int
144 sys_vmspace_destroy(struct sysmsg *sysmsg,
145 		    const struct vmspace_destroy_args *uap)
146 {
147 	struct vkernel_proc *vkp;
148 	struct vmspace_entry *ve;
149 	int error;
150 
151 	if ((vkp = curproc->p_vkernel) == NULL)
152 		return EINVAL;
153 
154 	/*
155 	 * vkp->token protects the deletion against a new RB tree search.
156 	 */
157 	lwkt_gettoken(&vkp->token);
158 	error = ENOENT;
159 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 1)) != NULL) {
160 		error = vmspace_entry_delete(ve, vkp, 1);
161 		if (error == 0)
162 			vmspace_entry_cache_drop(ve);
163 	}
164 	lwkt_reltoken(&vkp->token);
165 
166 	return(error);
167 }
168 
169 /*
170  * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
171  *		struct vextframe *vframe);
172  *
173  * Transfer control to a VMSPACE.  Control is returned after the specified
174  * number of microseconds or if a page fault, signal, trap, or system call
175  * occurs.  The context is updated as appropriate.
176  *
177  * No requirements.
178  */
179 int
180 sys_vmspace_ctl(struct sysmsg *sysmsg,
181 		const struct vmspace_ctl_args *uap)
182 {
183 	struct vmspace_ctl_args ua = *uap;
184 	struct vkernel_proc *vkp;
185 	struct vkernel_lwp *vklp;
186 	struct vmspace_entry *ve = NULL;
187 	struct lwp *lp;
188 	struct proc *p;
189 	int framesz;
190 	int error;
191 
192 	lp = curthread->td_lwp;
193 	p = lp->lwp_proc;
194 
195 	if ((vkp = p->p_vkernel) == NULL)
196 		return (EINVAL);
197 
198 	/*
199 	 * ve only matters when VMM is not used.
200 	 *
201 	 * NOTE: We have to copy *uap into ua because uap is an aliased
202 	 *	 pointer into the sysframe, which we are replacing.
203 	 */
204 	if (curthread->td_vmm == NULL) {
205 		if ((ve = vkernel_find_vmspace(vkp, ua.id, 0)) == NULL) {
206 			error = ENOENT;
207 			goto done;
208 		}
209 	}
210 
211 	switch(ua.cmd) {
212 	case VMSPACE_CTL_RUN:
213 		/*
214 		 * Save the caller's register context, swap VM spaces, and
215 		 * install the passed register context.  Return with
216 		 * EJUSTRETURN so the syscall code doesn't adjust the context.
217 		 */
218 		framesz = sizeof(struct trapframe);
219 		if ((vklp = lp->lwp_vkernel) == NULL) {
220 			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
221 				       M_WAITOK|M_ZERO);
222 			lp->lwp_vkernel = vklp;
223 		}
224 		if (ve && vklp->ve_cache != ve) {
225 			vmspace_entry_cache_ref(ve);
226 			if (vklp->ve_cache)
227 				vmspace_entry_cache_drop(vklp->ve_cache);
228 			vklp->ve_cache = ve;
229 		}
230 		vklp->user_trapframe = ua.tframe;
231 		vklp->user_vextframe = ua.vframe;
232 		bcopy(sysmsg->sysmsg_frame, &vklp->save_trapframe, framesz);
233 		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
234 		      sizeof(vklp->save_vextframe.vx_tls));
235 		error = copyin(ua.tframe, sysmsg->sysmsg_frame, framesz);
236 		if (error == 0) {
237 			error = copyin(&ua.vframe->vx_tls,
238 				       &curthread->td_tls,
239 				       sizeof(struct savetls));
240 		}
241 		if (error == 0)
242 			error = cpu_sanitize_frame(sysmsg->sysmsg_frame);
243 		if (error == 0)
244 			error = cpu_sanitize_tls(&curthread->td_tls);
245 		if (error) {
246 			bcopy(&vklp->save_trapframe, sysmsg->sysmsg_frame,
247 			      framesz);
248 			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
249 			      sizeof(vklp->save_vextframe.vx_tls));
250 			set_user_TLS();
251 		} else {
252 			/*
253 		 * If this is a VMM thread we just set the guest CR3.  In
254 		 * either case vklp->ve is set to a non-NULL key so we can
255 		 * distinguish when a vkernel user process is running
256 		 * (vklp->ve is NULL when one is not).
257 			 */
258 			if (curthread->td_vmm == NULL) {
259 				vklp->ve = ve;
260 				atomic_add_int(&ve->refs, 1);
261 				pmap_setlwpvm(lp, ve->vmspace);
262 			} else {
263 				vklp->ve = ua.id;
264 				vmm_vm_set_guest_cr3((register_t)ua.id);
265 			}
266 			set_user_TLS();
267 			set_vkernel_fp(sysmsg->sysmsg_frame);
268 			error = EJUSTRETURN;
269 		}
270 		break;
271 	default:
272 		error = EOPNOTSUPP;
273 		break;
274 	}
275 done:
276 	if (ve)
277 		vmspace_entry_drop(ve);
278 
279 	return(error);
280 }
281 
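/*
 * A minimal run-loop sketch, assuming a userland wrapper that follows the
 * prototype above; how the vkernel decodes the reason for returning is
 * simplified and illustrative.
 *
 *	struct trapframe tf;	// guest register state to install
 *	struct vextframe vf;	// guest TLS state
 *
 *	for (;;) {
 *		// Returns when the guest faults, traps, makes a syscall
 *		// or takes a signal; the guest's final register state is
 *		// copied back out through the tframe pointer (see
 *		// vkernel_trap() below).
 *		vmspace_ctl(id, VMSPACE_CTL_RUN, &tf, &vf);
 *		// ...inspect tf, emulate the event, loop...
 *	}
 */
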
282 /*
283  * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
284  *
285  * map memory within a VMSPACE.  This function is just like a normal mmap()
286  * but operates on the vmspace's memory map.
287  *
288  * No requirements.
289  */
290 int
291 sys_vmspace_mmap(struct sysmsg *sysmsg,
292 		 const struct vmspace_mmap_args *uap)
293 {
294 	struct vkernel_proc *vkp;
295 	struct vmspace_entry *ve;
296 	int error;
297 
298 	if ((vkp = curproc->p_vkernel) == NULL) {
299 		error = EINVAL;
300 		goto done2;
301 	}
302 
303 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
304 		error = ENOENT;
305 		goto done2;
306 	}
307 
308 	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
309 			  uap->prot, uap->flags,
310 			  uap->fd, uap->offset, &sysmsg->sysmsg_resultp);
311 
312 	vmspace_entry_drop(ve);
313 done2:
314 	return (error);
315 }
316 
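/*
 * A minimal usage sketch, assuming a userland wrapper with the prototype
 * above and an mmap()-style return convention; the flags shown are
 * illustrative.
 *
 *	if (vmspace_mmap(id, guest_addr, len,
 *			 PROT_READ | PROT_WRITE,
 *			 MAP_ANON | MAP_FIXED, -1, 0) == MAP_FAILED)
 *		err(1, "vmspace_mmap");
 */
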
317 /*
318  * vmspace_munmap(id, addr, len)
319  *
320  * unmap memory within a VMSPACE.
321  *
322  * No requirements.
323  */
324 int
325 sys_vmspace_munmap(struct sysmsg *sysmsg,
326 		   const struct vmspace_munmap_args *uap)
327 {
328 	struct vkernel_proc *vkp;
329 	struct vmspace_entry *ve;
330 	vm_offset_t addr;
331 	vm_offset_t tmpaddr;
332 	vm_size_t size, pageoff;
333 	vm_map_t map;
334 	int error;
335 
336 	if ((vkp = curproc->p_vkernel) == NULL) {
337 		error = EINVAL;
338 		goto done2;
339 	}
340 
341 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
342 		error = ENOENT;
343 		goto done2;
344 	}
345 
346 	/*
347 	 * NOTE: The unmap below can block, so we need to hold a temporary
348 	 *	 ref on ve->refs across it.
349 	 */
350 
351 	/*
352 	 * Copied from sys_munmap()
353 	 */
354 	addr = (vm_offset_t)uap->addr;
355 	size = uap->len;
356 
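	/*
	 * Example of the rounding below, assuming 4KB pages:
	 * addr = 0x12345, len = 0x1000 gives pageoff = 0x345,
	 * addr = 0x12000 and size = 0x1345, which round_page()
	 * turns into 0x2000, so the unmap covers every page the
	 * original request touched.
	 */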
357 	pageoff = (addr & PAGE_MASK);
358 	addr -= pageoff;
359 	size += pageoff;
360 	size = (vm_size_t)round_page(size);
361 	if (size < uap->len) {		/* wrap */
362 		error = EINVAL;
363 		goto done1;
364 	}
365 	tmpaddr = addr + size;		/* workaround gcc4 opt */
366 	if (tmpaddr < addr) {		/* wrap */
367 		error = EINVAL;
368 		goto done1;
369 	}
370 	if (size == 0) {
371 		error = 0;
372 		goto done1;
373 	}
374 
375 	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
376 		error = EINVAL;
377 		goto done1;
378 	}
379 	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
380 		error = EINVAL;
381 		goto done1;
382 	}
383 	map = &ve->vmspace->vm_map;
384 	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
385 		error = EINVAL;
386 		goto done1;
387 	}
388 	vm_map_remove(map, addr, addr + size);
389 	error = 0;
390 done1:
391 	vmspace_entry_drop(ve);
392 done2:
393 	return (error);
394 }
395 
396 /*
397  * vmspace_pread(id, buf, nbyte, flags, offset)
398  *
399  * Read data from a vmspace.  The number of bytes read is returned or
400  * -1 if an unrecoverable error occured.  If the number of bytes read is
401  * less then the request size, a page fault occured in the VMSPACE which
402  * the caller must resolve in order to proceed.
403  *
404  * (not implemented yet)
405  * No requirements.
406  */
407 int
408 sys_vmspace_pread(struct sysmsg *sysmsg,
409 		  const struct vmspace_pread_args *uap)
410 {
411 	struct vkernel_proc *vkp;
412 	struct vmspace_entry *ve;
413 	int error;
414 
415 	if ((vkp = curproc->p_vkernel) == NULL) {
416 		error = EINVAL;
417 		goto done3;
418 	}
419 
420 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
421 		error = ENOENT;
422 		goto done3;
423 	}
424 	vmspace_entry_drop(ve);
425 	error = EINVAL;
426 done3:
427 	return (error);
428 }
429 
430 /*
431  * vmspace_pwrite(id, buf, nbyte, flags, offset)
432  *
433  * Write data to a vmspace.  The number of bytes written is returned or
434  * -1 if an unrecoverable error occurred.  If the number of bytes written is
435  * less than the request size, a page fault occurred in the VMSPACE which
436  * the caller must resolve in order to proceed.
437  *
438  * (not implemented yet)
439  * No requirements.
440  */
441 int
442 sys_vmspace_pwrite(struct sysmsg *sysmsg,
443 		   const struct vmspace_pwrite_args *uap)
444 {
445 	struct vkernel_proc *vkp;
446 	struct vmspace_entry *ve;
447 	int error;
448 
449 	if ((vkp = curproc->p_vkernel) == NULL) {
450 		error = EINVAL;
451 		goto done3;
452 	}
453 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
454 		error = ENOENT;
455 		goto done3;
456 	}
457 	vmspace_entry_drop(ve);
458 	error = EINVAL;
459 done3:
460 	return (error);
461 }
462 
463 /*
464  * vmspace_mcontrol(id, addr, len, behav, value)
465  *
466  * madvise/mcontrol support for a vmspace.
467  *
468  * No requirements.
469  */
470 int
471 sys_vmspace_mcontrol(struct sysmsg *sysmsg,
472 		     const struct vmspace_mcontrol_args *uap)
473 {
474 	struct vkernel_proc *vkp;
475 	struct vmspace_entry *ve;
476 	struct lwp *lp;
477 	vm_offset_t start, end;
478 	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
479 	int error;
480 
481 	lp = curthread->td_lwp;
482 	if ((vkp = curproc->p_vkernel) == NULL) {
483 		error = EINVAL;
484 		goto done3;
485 	}
486 
487 	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
488 		error = ENOENT;
489 		goto done3;
490 	}
491 
492 	/*
493 	 * This code is basically copied from sys_mcontrol()
494 	 */
495 	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
496 		error = EINVAL;
497 		goto done1;
498 	}
499 
500 	if (tmpaddr < (vm_offset_t)uap->addr) {
501 		error = EINVAL;
502 		goto done1;
503 	}
504 	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
505 		error = EINVAL;
506 		goto done1;
507 	}
508         if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
509 		error = EINVAL;
510 		goto done1;
511 	}
512 
513 	start = trunc_page((vm_offset_t) uap->addr);
514 	end = round_page(tmpaddr);
515 
516 	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
517 				uap->behav, uap->value);
518 done1:
519 	vmspace_entry_drop(ve);
520 done3:
521 	return (error);
522 }
523 
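/*
 * A minimal usage sketch, assuming a userland wrapper with the prototype
 * above; MADV_INVAL (invalidate mappings) is shown as a typical behavior
 * value, purely for illustration.
 *
 *	if (vmspace_mcontrol(id, guest_addr, len, MADV_INVAL, 0) < 0)
 *		err(1, "vmspace_mcontrol");
 */
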
524 /*
525  * Red black tree functions
526  */
527 static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
528 RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);
529 
530 /*
531  * a->id is the address key and is the only field that needs to be
532  * initialized for an RB tree lookup.
533  *
534  * The caller must hold vkp->token.
535  */
536 static int
537 rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
538 {
539         if ((char *)a->id < (char *)b->id)
540                 return(-1);
541         else if ((char *)a->id > (char *)b->id)
542                 return(1);
543         return(0);
544 }
545 
546 /*
547  * The caller must hold vkp->token.
548  */
549 static
550 int
551 rb_vmspace_delete(struct vmspace_entry *ve, void *data)
552 {
553 	struct vkernel_proc *vkp = data;
554 
555 	if (vmspace_entry_delete(ve, vkp, 0) == 0)
556 		vmspace_entry_cache_drop(ve);
557 	else
558 		panic("rb_vmspace_delete: invalid refs %d", ve->refs);
559 	return(0);
560 }
561 
562 /*
563  * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
564  * up the pmap, the vm_map, then destroy the vmspace.  We gain control of
565  * the associated cache_refs ref, which the caller will drop for us.
566  *
567  * The ve must not have any active references other than those from the
568  * caller.  If it does, EBUSY is returned.  The ve may still maintain
569  * any number of cache references which will drop as the related LWPs
570  * execute vmspace operations or exit.
571  *
572  * 0 is returned on success, EBUSY on failure.  On success the caller must
573  * drop the last cache_refs.  We have dropped the callers active refs.
574  *
575  * The caller must hold vkp->token.
576  */
577 static
578 int
579 vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp,
580 		     int refs)
581 {
582 	/*
583 	 * Interlocked by vkp->token.
584 	 *
585 	 * Drop the callers refs and set VKE_REF_DELETED atomically, if
586 	 * the remaining refs match exactly.  Dropping refs and setting
587 	 * the DELETED flag atomically protects other threads from trying
588 	 * to use the ve.
589 	 *
590 	 * The caller now owns the final cache_ref that was previously
591 	 * associated with the live state of the ve.
592 	 */
593 	if (atomic_cmpset_int(&ve->refs, refs, VKE_REF_DELETED) == 0) {
594 		KKASSERT(ve->refs >= refs);
595 		return EBUSY;
596 	}
597 	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);
598 
599 	pmap_remove_pages(vmspace_pmap(ve->vmspace),
600 			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
601 	vm_map_remove(&ve->vmspace->vm_map,
602 			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
603 	vmspace_rel(ve->vmspace);
604 	ve->vmspace = NULL; /* safety */
605 
606 	return 0;
607 }
608 
609 /*
610  * Ref a ve for cache purposes
611  */
612 static
613 void
614 vmspace_entry_cache_ref(struct vmspace_entry *ve)
615 {
616 	atomic_add_int(&ve->cache_refs, 1);
617 }
618 
619 /*
620  * The ve cache_drop is the final word for a ve.  It gains an extra ref
621  * representing it being on the RB tree and not being in a deleted state.
622  * Removal from the RB tree and deletion manipulate this ref.  The last
623  * drop will thus include full deletion of the ve in addition to the last
624  * cached user going away.
625  */
626 static
627 void
628 vmspace_entry_cache_drop(struct vmspace_entry *ve)
629 {
630 	if (atomic_fetchadd_int(&ve->cache_refs, -1) == 1) {
631 		KKASSERT(ve->refs & VKE_REF_DELETED);
632 		kfree(ve, M_VKERNEL);
633 	}
634 }
635 
636 /*
637  * Drop primary reference.  The ve cannot be freed on the 1->0 transition.
638  * Instead, ve deletion interlocks the final kfree() via cache_refs.
639  */
640 static
641 void
642 vmspace_entry_drop(struct vmspace_entry *ve)
643 {
644 	atomic_fetchadd_int(&ve->refs, -1);
645 }
646 
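/*
 * Summary of the two counters manipulated by the helpers above, as
 * described in their comments:
 *
 *	ve->refs	- active references from lookups and running LWPs;
 *			  vmspace_entry_delete() replaces the count with
 *			  VKE_REF_DELETED when it disconnects the ve from
 *			  its vmspace.
 *	ve->cache_refs	- one reference for being on the RB tree plus one
 *			  per lwp ve_cache pointer; the final
 *			  vmspace_entry_cache_drop() does the kfree().
 */
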
647 /*
648  * Locate the ve for (id), return the ve or NULL.  If found this function
649  * will bump ve->refs which prevents the ve from being immediately destroyed
650  * (but it can still be removed).
651  *
652  * The cache can potentially contain a stale ve, check by testing ve->vmspace.
653  *
654  * The caller must hold vkp->token if excl is non-zero.
655  */
656 static
657 struct vmspace_entry *
658 vkernel_find_vmspace(struct vkernel_proc *vkp, void *id, int excl)
659 {
660 	struct vmspace_entry *ve;
661 	struct vmspace_entry key;
662 	struct vkernel_lwp *vklp;
663 	struct lwp *lp = curthread->td_lwp;
664 
665 	/*
666 	 * Cache check.  Since we already hold a ref on the cache entry
667 	 * the ve cannot be ripped out from under us while we cycle
668 	 * ve->refs.
669 	 */
670 	if ((vklp = lp->lwp_vkernel) != NULL) {
671 		ve = vklp->ve_cache;
672 		if (ve && ve->id == id) {
673 			uint32_t n;
674 
675 			/*
676 			 * Bump active refs, check to see if the cache
677 			 * entry is stale.  If not, we are good.
678 			 */
679 			n = atomic_fetchadd_int(&ve->refs, 1);
680 			if ((n & VKE_REF_DELETED) == 0) {
681 				KKASSERT(ve->vmspace);
682 				return ve;
683 			}
684 
685 			/*
686 			 * Cache is stale, clean it out and fall through
687 			 * to a normal search.
688 			 */
689 			vklp->ve_cache = NULL;
690 			vmspace_entry_drop(ve);
691 			vmspace_entry_cache_drop(ve);
692 		}
693 	}
694 
695 	/*
696 	 * Normal search protected by vkp->token.  No new ve's can be marked
697 	 * DELETED while we hold the token so we are safe.
698 	 */
699 	if (excl == 0)
700 		lwkt_gettoken_shared(&vkp->token);
701 	key.id = id;
702 	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
703 	if (ve) {
704 		if (atomic_fetchadd_int(&ve->refs, 1) & VKE_REF_DELETED) {
705 			vmspace_entry_drop(ve);
706 			ve = NULL;
707 		}
708 	}
709 	if (excl == 0)
710 		lwkt_reltoken(&vkp->token);
711 	return (ve);
712 }
713 
714 /*
715  * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
716  * a vkernel process.
717  *
718  * No requirements.
719  */
720 void
721 vkernel_inherit(struct proc *p1, struct proc *p2)
722 {
723 	struct vkernel_proc *vkp;
724 
725 	vkp = p1->p_vkernel;
726 	KKASSERT(vkp->refs > 0);
727 	atomic_add_int(&vkp->refs, 1);
728 	p2->p_vkernel = vkp;
729 }
730 
731 /*
732  * No requirements.
733  */
734 void
735 vkernel_exit(struct proc *p)
736 {
737 	struct vkernel_proc *vkp;
738 	struct lwp *lp;
739 
740 	vkp = p->p_vkernel;
741 
742 	/*
743 	 * Restore the original VM context if we are killed while running
744 	 * a different one.
745 	 *
746 	 * This isn't supposed to happen.  What is supposed to happen is
747 	 * that the process should enter vkernel_trap() before handling
748 	 * the signal.
749 	 */
750 	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
751 		vkernel_lwp_exit(lp);
752 	}
753 
754 	/*
755 	 * Dereference the common area
756 	 */
757 	p->p_vkernel = NULL;
758 	KKASSERT(vkp->refs > 0);
759 
760 	if (atomic_fetchadd_int(&vkp->refs, -1) == 1) {
761 		lwkt_gettoken(&vkp->token);
762 		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
763 			rb_vmspace_delete, vkp);
764 		lwkt_reltoken(&vkp->token);
765 		kfree(vkp, M_VKERNEL);
766 	}
767 }
768 
769 /*
770  * No requirements.
771  */
772 void
773 vkernel_lwp_exit(struct lwp *lp)
774 {
775 	struct vkernel_lwp *vklp;
776 	struct vmspace_entry *ve;
777 
778 	if ((vklp = lp->lwp_vkernel) != NULL) {
779 		if (lp->lwp_thread->td_vmm == NULL) {
780 			/*
781 			 * vkernel thread
782 			 */
783 			if ((ve = vklp->ve) != NULL) {
784 				kprintf("Warning, pid %d killed with "
785 					"active VC!\n", lp->lwp_proc->p_pid);
786 				pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
787 				vklp->ve = NULL;
788 				KKASSERT(ve->refs > 0);
789 				vmspace_entry_drop(ve);
790 			}
791 		} else {
792 			/*
793 			 * guest thread
794 			 */
795 			vklp->ve = NULL;
796 		}
797 		if ((ve = vklp->ve_cache) != NULL) {
798 			vklp->ve_cache = NULL;
799 			vmspace_entry_cache_drop(ve);
800 		}
801 
802 		lp->lwp_vkernel = NULL;
803 		kfree(vklp, M_VKERNEL);
804 	}
805 }
806 
807 /*
808  * A VM space under virtual kernel control trapped out or made a system call
809  * or otherwise needs to return control to the virtual kernel context.
810  *
811  * No requirements.
812  */
813 void
814 vkernel_trap(struct lwp *lp, struct trapframe *frame)
815 {
816 	struct proc *p = lp->lwp_proc;
817 	struct vmspace_entry *ve;
818 	struct vkernel_lwp *vklp;
819 	int error;
820 
821 	/*
822 	 * Which vmspace entry was running?
823 	 */
824 	vklp = lp->lwp_vkernel;
825 	KKASSERT(vklp);
826 
827 	/* VMM thread: just restore the vkernel CR3.  Otherwise switch vmspaces */
828 	if (curthread->td_vmm == NULL) {
829 		ve = vklp->ve;
830 		KKASSERT(ve != NULL);
831 
832 		/*
833 		 * Switch the LWP vmspace back to the virtual kernel's VM space.
834 		 */
835 		vklp->ve = NULL;
836 		pmap_setlwpvm(lp, p->p_vmspace);
837 		KKASSERT(ve->refs > 0);
838 		vmspace_entry_drop(ve);
839 		/* ve is invalid once we kill our ref */
840 	} else {
841 		vklp->ve = NULL;
842 		vmm_vm_set_guest_cr3(p->p_vkernel->vkernel_cr3);
843 	}
844 
845 	/*
846 	 * Copy the emulated process frame to the virtual kernel process.
847 	 * The emulated process cannot change TLS descriptors so don't
848 	 * bother saving them, we already have a copy.
849 	 *
850 	 * Restore the virtual kernel's saved context so the virtual kernel
851 	 * process can resume.
852 	 */
853 	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
854 	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
855 	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
856 	      sizeof(vklp->save_vextframe.vx_tls));
857 	set_user_TLS();
858 	cpu_vkernel_trap(frame, error);
859 }
860