xref: /dragonfly/sys/vm/vm_vmspace.c (revision 59b0b316)
/*
 * (MPSAFE)
 *
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kern_syscall.h>
#include <sys/mman.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vkernel.h>
#include <sys/vmspace.h>

#include <vm/vm_extern.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>

#include <sys/sysref2.h>

static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
						  void *id, int havetoken);
static int vmspace_entry_delete(struct vmspace_entry *ve,
				 struct vkernel_proc *vkp, int refs);
static void vmspace_entry_cache_ref(struct vmspace_entry *ve);
static void vmspace_entry_cache_drop(struct vmspace_entry *ve);
static void vmspace_entry_drop(struct vmspace_entry *ve);

static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");

/*
 * vmspace_create (void *id, int type, void *data)
 *
 * Create a VMSPACE under the control of the caller with the specified id.
 * An id of NULL cannot be used.  The type and data fields must currently
 * be 0.
 *
 * The vmspace starts out completely empty.  Memory may be mapped into the
 * VMSPACE with vmspace_mmap() and MAP_VPAGETABLE section(s) controlled
 * with vmspace_mcontrol().
 *
 * No requirements.
 */
int
sys_vmspace_create(struct vmspace_create_args *uap)
{
	struct vmspace_entry *ve;
	struct vkernel_proc *vkp;
	struct proc *p = curproc;
	int error;

	if (vkernel_enable == 0)
		return (EOPNOTSUPP);

	/*
	 * Create a virtual kernel side-structure for the process if one
	 * does not exist.
	 *
	 * Implement a simple resolution for SMP races.
	 */
	if ((vkp = p->p_vkernel) == NULL) {
		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
		lwkt_gettoken(&p->p_token);
		if (p->p_vkernel == NULL) {
			vkp->refs = 1;
			lwkt_token_init(&vkp->token, "vkernel");
			RB_INIT(&vkp->root);
			p->p_vkernel = vkp;
		} else {
			kfree(vkp, M_VKERNEL);
			vkp = p->p_vkernel;
		}
		lwkt_reltoken(&p->p_token);
	}

	if (curthread->td_vmm)
		return 0;

	/*
	 * Create a new VMSPACE, disallow conflicting ids
	 */
	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	ve->id = uap->id;
	ve->refs = 0;		/* active refs (none) */
	ve->cache_refs = 1;	/* on-tree, not deleted (prevent kfree) */
	pmap_pinit2(vmspace_pmap(ve->vmspace));

	lwkt_gettoken(&vkp->token);
	if (RB_INSERT(vmspace_rb_tree, &vkp->root, ve)) {
		vmspace_rel(ve->vmspace);
		ve->vmspace = NULL; /* safety */
		kfree(ve, M_VKERNEL);
		error = EEXIST;
	} else {
		error = 0;
	}
	lwkt_reltoken(&vkp->token);

	return (error);
}
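
/*
 * Illustrative userland sketch (not part of this file): a virtual kernel
 * typically creates a guest VMSPACE keyed by an arbitrary non-NULL id and
 * later tears it down with vmspace_destroy().  This assumes the vmspace_*()
 * wrappers declared in <sys/vmspace.h>; the id value and the err(3) error
 * handling are hypothetical.
 *
 *	void *id = (void *)1;			// caller-chosen, non-NULL
 *
 *	if (vmspace_create(id, 0, NULL) < 0)	// type and data must be 0
 *		err(1, "vmspace_create");
 *	...
 *	if (vmspace_destroy(id) < 0)
 *		err(1, "vmspace_destroy");
 */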

/*
 * Destroy a VMSPACE given its identifier.
 *
 * No requirements.
 */
int
sys_vmspace_destroy(struct vmspace_destroy_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL)
		return EINVAL;

	/*
	 * vkp->token protects the deletion against a new RB tree search.
	 */
	lwkt_gettoken(&vkp->token);
	error = ENOENT;
	if ((ve = vkernel_find_vmspace(vkp, uap->id, 1)) != NULL) {
		error = vmspace_entry_delete(ve, vkp, 1);
		if (error == 0)
			vmspace_entry_cache_drop(ve);
	}
	lwkt_reltoken(&vkp->token);

	return(error);
}

/*
 * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
 *		struct vextframe *vframe);
 *
 * Transfer control to a VMSPACE.  Control is returned after the specified
 * number of microseconds or if a page fault, signal, trap, or system call
 * occurs.  The context is updated as appropriate.
 *
 * No requirements.
 */
int
sys_vmspace_ctl(struct vmspace_ctl_args *uap)
{
	struct vkernel_proc *vkp;
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve = NULL;
	struct lwp *lp;
	struct proc *p;
	int framesz;
	int error;

	lp = curthread->td_lwp;
	p = lp->lwp_proc;

	if ((vkp = p->p_vkernel) == NULL)
		return (EINVAL);

	/*
	 * ve only matters when VMM is not used.
	 */
	if (curthread->td_vmm == NULL) {
		if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
			error = ENOENT;
			goto done;
		}
	}

	switch(uap->cmd) {
	case VMSPACE_CTL_RUN:
		/*
		 * Save the caller's register context, swap VM spaces, and
		 * install the passed register context.  Return with
		 * EJUSTRETURN so the syscall code doesn't adjust the context.
		 */
		framesz = sizeof(struct trapframe);
		if ((vklp = lp->lwp_vkernel) == NULL) {
			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
				       M_WAITOK|M_ZERO);
			lp->lwp_vkernel = vklp;
		}
		if (ve && vklp->ve_cache != ve) {
			vmspace_entry_cache_ref(ve);
			if (vklp->ve_cache)
				vmspace_entry_cache_drop(vklp->ve_cache);
			vklp->ve_cache = ve;
		}
		vklp->user_trapframe = uap->tframe;
		vklp->user_vextframe = uap->vframe;
		bcopy(uap->sysmsg_frame, &vklp->save_trapframe, framesz);
		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
		      sizeof(vklp->save_vextframe.vx_tls));
		error = copyin(uap->tframe, uap->sysmsg_frame, framesz);
		if (error == 0) {
			error = copyin(&uap->vframe->vx_tls,
				       &curthread->td_tls,
				       sizeof(struct savetls));
		}
		if (error == 0)
			error = cpu_sanitize_frame(uap->sysmsg_frame);
		if (error == 0)
			error = cpu_sanitize_tls(&curthread->td_tls);
		if (error) {
			bcopy(&vklp->save_trapframe, uap->sysmsg_frame,
			      framesz);
			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
			      sizeof(vklp->save_vextframe.vx_tls));
			set_user_TLS();
		} else {
			/*
			 * If this is a VMM thread we only need to load the
			 * guest CR3.  Either way vklp->ve is set to a
			 * non-NULL key so we can tell whether a vkernel
			 * user process is currently running (it is NULL
			 * when one is not).
			 */
			if (curthread->td_vmm == NULL) {
				vklp->ve = ve;
				atomic_add_int(&ve->refs, 1);
				pmap_setlwpvm(lp, ve->vmspace);
			} else {
				vklp->ve = uap->id;
				vmm_vm_set_guest_cr3((register_t)uap->id);
			}
			set_user_TLS();
			set_vkernel_fp(uap->sysmsg_frame);
			error = EJUSTRETURN;
		}
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
done:
	if (ve)
		vmspace_entry_drop(ve);

	return(error);
}
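
/*
 * Illustrative userland sketch (not part of this file) of how a virtual
 * kernel drives a guest context with VMSPACE_CTL_RUN, assuming the
 * vmspace_ctl() wrapper declared in <sys/vmspace.h>.  The trapframe and
 * vextframe contents are machine-dependent; this only shows the control
 * flow: the call returns when the guest faults, traps, receives a signal,
 * or makes a system call, with the caller's tf/vf updated to the guest
 * state.
 *
 *	struct trapframe tf;	// guest register state set up by the vkernel
 *	struct vextframe vf;	// guest TLS state
 *
 *	for (;;) {
 *		if (vmspace_ctl(id, VMSPACE_CTL_RUN, &tf, &vf) < 0)
 *			err(1, "vmspace_ctl");
 *		// inspect tf to dispatch the fault/trap/syscall, then loop
 *	}
 */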

/*
 * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
 *
 * Map memory within a VMSPACE.  This function works just like a normal
 * mmap() but operates on the vmspace's memory map.  Most callers use it
 * to create a MAP_VPAGETABLE mapping.
 *
 * No requirements.
 */
int
sys_vmspace_mmap(struct vmspace_mmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done2;
	}

	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done2;
	}

	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->offset, &uap->sysmsg_resultp);

	vmspace_entry_drop(ve);
done2:
	return (error);
}
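
/*
 * Illustrative userland sketch (not part of this file): backing part of a
 * guest VMSPACE with anonymous memory governed by a virtual page table,
 * assuming the vmspace_mmap() wrapper declared in <sys/vmspace.h>.  The
 * guest address and length are hypothetical; per the comment above the
 * call behaves like a normal mmap(), so error handling is elided here.
 *
 *	vmspace_mmap(id, (void *)0x100000, 0x10000000UL,
 *		     PROT_READ | PROT_WRITE | PROT_EXEC,
 *		     MAP_FIXED | MAP_ANON | MAP_VPAGETABLE,
 *		     -1, 0);
 */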

/*
 * vmspace_munmap(id, addr, len)
 *
 * Unmap memory within a VMSPACE.
 *
 * No requirements.
 */
int
sys_vmspace_munmap(struct vmspace_munmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done2;
	}

	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done2;
	}

	/*
	 * NOTE: kern_munmap() can block so we need to temporarily
	 *	 ref ve->refs.
	 */

	/*
	 * Copied from sys_munmap()
	 */
	addr = (vm_offset_t)uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t)round_page(size);
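	/*
	 * Worked example of the rounding above (hypothetical values,
	 * assuming PAGE_SIZE is 4096): addr = 0x1234, len = 0x100 yields
	 * pageoff = 0x234, addr = 0x1000 and size = 0x334 rounded up to
	 * 0x1000, i.e. exactly the page containing the requested range.
	 */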
	if (size < uap->len) {		/* wrap */
		error = EINVAL;
		goto done1;
	}
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr) {		/* wrap */
		error = EINVAL;
		goto done1;
	}
	if (size == 0) {
		error = 0;
		goto done1;
	}

	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}
	map = &ve->vmspace->vm_map;
	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
		error = EINVAL;
		goto done1;
	}
	vm_map_remove(map, addr, addr + size);
	error = 0;
done1:
	vmspace_entry_drop(ve);
done2:
	return (error);
}

/*
 * vmspace_pread(id, buf, nbyte, flags, offset)
 *
 * Read data from a vmspace.  The number of bytes read is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes read is
 * less than the requested size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 * No requirements.
 */
int
sys_vmspace_pread(struct vmspace_pread_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}

	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done3;
	}
	vmspace_entry_drop(ve);
	error = EINVAL;
done3:
	return (error);
}

/*
 * vmspace_pwrite(id, buf, nbyte, flags, offset)
 *
 * Write data to a vmspace.  The number of bytes written is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes written is
 * less than the requested size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 * No requirements.
 */
int
sys_vmspace_pwrite(struct vmspace_pwrite_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done3;
	}
	vmspace_entry_drop(ve);
	error = EINVAL;
done3:
	return (error);
}

/*
 * vmspace_mcontrol(id, addr, len, behav, value)
 *
 * madvise/mcontrol support for a vmspace.
 *
 * No requirements.
 */
int
sys_vmspace_mcontrol(struct vmspace_mcontrol_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	struct lwp *lp;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	lp = curthread->td_lwp;
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}

	if ((ve = vkernel_find_vmspace(vkp, uap->id, 0)) == NULL) {
		error = ENOENT;
		goto done3;
	}

	/*
	 * This code is basically copied from sys_mcontrol()
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
		error = EINVAL;
		goto done1;
	}

	if (tmpaddr < (vm_offset_t)uap->addr) {
		error = EINVAL;
		goto done1;
	}
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}

	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
				uap->behav, uap->value);
done1:
	vmspace_entry_drop(ve);
done3:
	return (error);
}
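
/*
 * Illustrative userland sketch (not part of this file): a virtual kernel
 * might point a MAP_VPAGETABLE region of a guest VMSPACE at its top-level
 * virtual page table using MADV_SETMAP, assuming the vmspace_mcontrol()
 * wrapper declared in <sys/vmspace.h>.  The region address/length and the
 * 'root' value (the page table root the region should use) are
 * hypothetical.
 *
 *	if (vmspace_mcontrol(id, (void *)0x100000, 0x10000000UL,
 *			     MADV_SETMAP, root) < 0)
 *		err(1, "vmspace_mcontrol");
 */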

/*
 * Red black tree functions
 */
static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);

/*
 * Compare two entries by id; only the ->id field of the key needs to be
 * initialized.
 *
 * The caller must hold vkp->token.
 */
static int
rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
{
	if ((char *)a->id < (char *)b->id)
		return(-1);
	else if ((char *)a->id > (char *)b->id)
		return(1);
	return(0);
}

/*
 * The caller must hold vkp->token.
 */
static
int
rb_vmspace_delete(struct vmspace_entry *ve, void *data)
{
	struct vkernel_proc *vkp = data;

	if (vmspace_entry_delete(ve, vkp, 0) == 0)
		vmspace_entry_cache_drop(ve);
	else
		panic("rb_vmspace_delete: invalid refs %d", ve->refs);
	return(0);
}

/*
 * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
 * up the pmap, the vm_map, then destroy the vmspace.  We gain control of
 * the associated cache_refs ref, which the caller will drop for us.
 *
 * The ve must not have any active references other than those from the
 * caller.  If it does, EBUSY is returned.  The ve may still maintain
 * any number of cache references which will drop as the related LWPs
 * execute vmspace operations or exit.
 *
 * 0 is returned on success, EBUSY on failure.  On success the caller must
 * drop the last cache_refs.  We have dropped the caller's active refs.
 *
 * The caller must hold vkp->token.
 */
static
int
vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp,
		     int refs)
{
	/*
	 * Interlocked by vkp->token.
	 *
	 * Drop the caller's refs and set VKE_REF_DELETED atomically, if
	 * the remaining refs match exactly.  Dropping refs and setting
	 * the DELETED flag atomically protects other threads from trying
	 * to use the ve.
	 *
	 * The caller now owns the final cache_ref that was previously
	 * associated with the live state of the ve.
	 */
	if (atomic_cmpset_int(&ve->refs, refs, VKE_REF_DELETED) == 0) {
		KKASSERT(ve->refs >= refs);
		return EBUSY;
	}
	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);

	pmap_remove_pages(vmspace_pmap(ve->vmspace),
			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	vm_map_remove(&ve->vmspace->vm_map,
			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	vmspace_rel(ve->vmspace);
	ve->vmspace = NULL; /* safety */

	return 0;
}
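
/*
 * Illustrative sketch (not kernel code) of the deletion interlock above,
 * using C11 atomics in place of atomic_cmpset_int().  The hypothetical
 * try_delete() succeeds only when the live ref count equals exactly the
 * refs the caller already holds, atomically replacing them with the
 * DELETED flag so no other thread can gain a new active reference:
 *
 *	#include <stdatomic.h>
 *
 *	#define REF_DELETED	0x80000000U	// stand-in for VKE_REF_DELETED
 *
 *	static int
 *	try_delete(_Atomic unsigned int *refs, unsigned int held)
 *	{
 *		unsigned int expect = held;
 *
 *		if (!atomic_compare_exchange_strong(refs, &expect,
 *						    REF_DELETED))
 *			return 0;	// EBUSY: other refs still active
 *		return 1;		// caller now owns final teardown
 *	}
 */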

/*
 * Ref a ve for cache purposes
 */
static
void
vmspace_entry_cache_ref(struct vmspace_entry *ve)
{
	atomic_add_int(&ve->cache_refs, 1);
}

/*
 * The ve cache_drop is the final word for a ve.  It gains an extra ref
 * representing it being on the RB tree and not being in a deleted state.
 * Removal from the RB tree and deletion manipulate this ref.  The last
 * drop will thus include full deletion of the ve in addition to the last
 * cached user going away.
 */
static
void
vmspace_entry_cache_drop(struct vmspace_entry *ve)
{
	if (atomic_fetchadd_int(&ve->cache_refs, -1) == 1) {
		KKASSERT(ve->refs & VKE_REF_DELETED);
		kfree(ve, M_VKERNEL);
	}
}

/*
 * Drop primary reference.  The ve cannot be freed on the 1->0 transition.
 * Instead, ve deletion interlocks the final kfree() via cache_refs.
 */
static
void
vmspace_entry_drop(struct vmspace_entry *ve)
{
	atomic_fetchadd_int(&ve->refs, -1);
}

/*
 * Locate the ve for (id), return the ve or NULL.  If found this function
 * will bump ve->refs which prevents the ve from being immediately destroyed
 * (but it can still be removed).
 *
 * The cache can potentially contain a stale ve, check by testing ve->vmspace.
 *
 * The caller must hold vkp->token if excl is non-zero.
 */
static
struct vmspace_entry *
vkernel_find_vmspace(struct vkernel_proc *vkp, void *id, int excl)
{
	struct vmspace_entry *ve;
	struct vmspace_entry key;
	struct vkernel_lwp *vklp;
	struct lwp *lp = curthread->td_lwp;

	/*
	 * Cache check.  Since we already hold a ref on the cache entry
	 * the ve cannot be ripped out from under us while we cycle
	 * ve->refs.
	 */
	if ((vklp = lp->lwp_vkernel) != NULL) {
		ve = vklp->ve_cache;
		if (ve && ve->id == id) {
			uint32_t n;

			/*
			 * Bump active refs, check to see if the cache
			 * entry is stale.  If not, we are good.
			 */
			n = atomic_fetchadd_int(&ve->refs, 1);
			if ((n & VKE_REF_DELETED) == 0) {
				KKASSERT(ve->vmspace);
				return ve;
			}

			/*
			 * Cache is stale, clean it out and fall through
			 * to a normal search.
			 */
			vklp->ve_cache = NULL;
			vmspace_entry_drop(ve);
			vmspace_entry_cache_drop(ve);
		}
	}

	/*
	 * Normal search protected by vkp->token.  No new ve's can be marked
	 * DELETED while we hold the token so we are safe.
	 */
	if (excl == 0)
		lwkt_gettoken_shared(&vkp->token);
	key.id = id;
	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
	if (ve) {
		if (atomic_fetchadd_int(&ve->refs, 1) & VKE_REF_DELETED) {
			vmspace_entry_drop(ve);
			ve = NULL;
		}
	}
	if (excl == 0)
		lwkt_reltoken(&vkp->token);
	return (ve);
}

/*
 * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
 * a vkernel process.
 *
 * No requirements.
 */
void
vkernel_inherit(struct proc *p1, struct proc *p2)
{
	struct vkernel_proc *vkp;

	vkp = p1->p_vkernel;
	KKASSERT(vkp->refs > 0);
	atomic_add_int(&vkp->refs, 1);
	p2->p_vkernel = vkp;
}

/*
 * No requirements.
 */
void
vkernel_exit(struct proc *p)
{
	struct vkernel_proc *vkp;
	struct lwp *lp;

	vkp = p->p_vkernel;

	/*
	 * Restore the original VM context if we are killed while running
	 * a different one.
	 *
	 * This isn't supposed to happen.  What is supposed to happen is
	 * that the process should enter vkernel_trap() before handling
	 * the signal.
	 */
	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
		vkernel_lwp_exit(lp);
	}

	/*
	 * Dereference the common area
	 */
	p->p_vkernel = NULL;
	KKASSERT(vkp->refs > 0);

	if (atomic_fetchadd_int(&vkp->refs, -1) == 1) {
		lwkt_gettoken(&vkp->token);
		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
			rb_vmspace_delete, vkp);
		lwkt_reltoken(&vkp->token);
		kfree(vkp, M_VKERNEL);
	}
}

/*
 * No requirements.
 */
void
vkernel_lwp_exit(struct lwp *lp)
{
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;

	if ((vklp = lp->lwp_vkernel) != NULL) {
		if (lp->lwp_thread->td_vmm == NULL) {
			/*
			 * vkernel thread
			 */
			if ((ve = vklp->ve) != NULL) {
				kprintf("Warning, pid %d killed with "
					"active VC!\n", lp->lwp_proc->p_pid);
				pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
				vklp->ve = NULL;
				KKASSERT(ve->refs > 0);
				vmspace_entry_drop(ve);
			}
		} else {
			/*
			 * guest thread
			 */
			vklp->ve = NULL;
		}
		if ((ve = vklp->ve_cache) != NULL) {
			vklp->ve_cache = NULL;
			vmspace_entry_cache_drop(ve);
		}

		lp->lwp_vkernel = NULL;
		kfree(vklp, M_VKERNEL);
	}
}

/*
 * A VM space under virtual kernel control trapped out or made a system call
 * or otherwise needs to return control to the virtual kernel context.
 *
 * No requirements.
 */
void
vkernel_trap(struct lwp *lp, struct trapframe *frame)
{
	struct proc *p = lp->lwp_proc;
	struct vmspace_entry *ve;
	struct vkernel_lwp *vklp;
	int error;

	/*
	 * Which vmspace entry was running?
	 */
	vklp = lp->lwp_vkernel;
	KKASSERT(vklp);

	/* If it's a VMM thread just set the vkernel CR3 back */
	if (curthread->td_vmm == NULL) {
		ve = vklp->ve;
		KKASSERT(ve != NULL);

		/*
		 * Switch the LWP vmspace back to the virtual kernel's VM space.
		 */
		vklp->ve = NULL;
		pmap_setlwpvm(lp, p->p_vmspace);
		KKASSERT(ve->refs > 0);
		vmspace_entry_drop(ve);
		/* ve is invalid once we kill our ref */
	} else {
		vklp->ve = NULL;
		vmm_vm_set_guest_cr3(p->p_vkernel->vkernel_cr3);
	}

	/*
	 * Copy the emulated process frame to the virtual kernel process.
	 * The emulated process cannot change TLS descriptors so don't
	 * bother saving them, we already have a copy.
	 *
	 * Restore the virtual kernel's saved context so the virtual kernel
	 * process can resume.
	 */
	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
	      sizeof(vklp->save_vextframe.vx_tls));
	set_user_TLS();
	cpu_vkernel_trap(frame, error);
}
852