/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vm/vm_vmspace.c,v 1.14 2007/08/15 03:15:07 dillon Exp $
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kern_syscall.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vkernel.h>
#include <sys/vmspace.h>

#include <vm/vm_extern.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>

#include <sys/spinlock2.h>
#include <sys/sysref2.h>

static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
						  void *id);
static void vmspace_entry_delete(struct vmspace_entry *ve,
				 struct vkernel_proc *vkp);

static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");

/*
 * vmspace_create (void *id, int type, void *data)
 *
 * Create a VMSPACE under the control of the caller with the specified id.
 * An id of NULL cannot be used.  The type and data fields must currently
 * be 0.
 *
 * The vmspace starts out completely empty.  Memory may be mapped into the
 * VMSPACE with vmspace_mmap() and MAP_VPAGETABLE section(s) controlled
 * with vmspace_mcontrol().
 */
int
sys_vmspace_create(struct vmspace_create_args *uap)
{
	struct vmspace_entry *ve;
	struct vkernel_proc *vkp;

	if (vkernel_enable == 0)
		return (EOPNOTSUPP);

	/*
	 * Create a virtual kernel side-structure for the process if one
	 * does not exist.
	 */
	if ((vkp = curproc->p_vkernel) == NULL) {
		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
		vkp->refs = 1;
		spin_init(&vkp->spin);
		RB_INIT(&vkp->root);
		curproc->p_vkernel = vkp;
	}

	/*
	 * Create a new VMSPACE
	 */
	if (vkernel_find_vmspace(vkp, uap->id))
		return (EEXIST);
	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	ve->id = uap->id;
	pmap_pinit2(vmspace_pmap(ve->vmspace));
	RB_INSERT(vmspace_rb_tree, &vkp->root, ve);
	return (0);
}
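
/*
 * Illustrative userland sketch (not part of this file's logic): a virtual
 * kernel typically creates one vmspace per guest context and destroys it
 * when the guest exits.  The calls mirror the syscall prototypes documented
 * above; the "guest" token used as the id is a hypothetical example value.
 *
 *	void *id = guest;			// any non-NULL token
 *	if (vmspace_create(id, 0, NULL) < 0)	// type and data must be 0
 *		err(1, "vmspace_create");
 *	...
 *	if (vmspace_destroy(id) < 0)		// EBUSY while still in use
 *		err(1, "vmspace_destroy");
 */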

/*
 * vmspace_destroy (void *id)
 *
 * Destroy a VMSPACE.
 */
int
sys_vmspace_destroy(struct vmspace_destroy_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;

	if ((vkp = curproc->p_vkernel) == NULL)
		return (EINVAL);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL)
		return (ENOENT);
	if (ve->refs)
		return (EBUSY);
	vmspace_entry_delete(ve, vkp);
	return(0);
}

/*
 * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
 *		struct vextframe *vframe);
 *
 * Transfer control to a VMSPACE.  Control is returned after the specified
 * number of microseconds or if a page fault, signal, trap, or system call
 * occurs.  The context is updated as appropriate.
 */
int
sys_vmspace_ctl(struct vmspace_ctl_args *uap)
{
	struct vkernel_proc *vkp;
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;
	struct lwp *lp;
	struct proc *p;
	int framesz;
	int error;

	lp = curthread->td_lwp;
	p = lp->lwp_proc;

	if ((vkp = p->p_vkernel) == NULL)
		return (EINVAL);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL)
		return (ENOENT);

	/*
	 * Signal mailbox interlock
	 */
	if (p->p_flag & P_MAILBOX) {
		p->p_flag &= ~P_MAILBOX;
		return (EINTR);
	}

	switch(uap->cmd) {
	case VMSPACE_CTL_RUN:
		/*
		 * Save the caller's register context, swap VM spaces, and
		 * install the passed register context.  Return with
		 * EJUSTRETURN so the syscall code doesn't adjust the context.
		 */
		atomic_add_int(&ve->refs, 1);
		framesz = sizeof(struct trapframe);
		if ((vklp = lp->lwp_vkernel) == NULL) {
			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
				       M_WAITOK|M_ZERO);
			lp->lwp_vkernel = vklp;
		}
		vklp->user_trapframe = uap->tframe;
		vklp->user_vextframe = uap->vframe;
		bcopy(uap->sysmsg_frame, &vklp->save_trapframe, framesz);
		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
		      sizeof(vklp->save_vextframe.vx_tls));
		error = copyin(uap->tframe, uap->sysmsg_frame, framesz);
		if (error == 0)
			error = copyin(&uap->vframe->vx_tls,
				       &curthread->td_tls,
				       sizeof(struct savetls));
		if (error == 0)
			error = cpu_sanitize_frame(uap->sysmsg_frame);
		if (error == 0)
			error = cpu_sanitize_tls(&curthread->td_tls);
		if (error) {
			bcopy(&vklp->save_trapframe, uap->sysmsg_frame,
			      framesz);
			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
			      sizeof(vklp->save_vextframe.vx_tls));
			set_user_TLS();
			atomic_subtract_int(&ve->refs, 1);
		} else {
			vklp->ve = ve;
			pmap_setlwpvm(lp, ve->vmspace);
			set_user_TLS();
			set_vkernel_fp(uap->sysmsg_frame);
			error = EJUSTRETURN;
		}
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return(error);
}
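
/*
 * Illustrative sketch (assumptions noted): a virtual kernel usually drives
 * a guest context in a loop around VMSPACE_CTL_RUN.  "gtf" and "gvf" are
 * hypothetical guest register/TLS images maintained by the caller; the
 * syscall returns when the guest faults, traps, or issues a system call,
 * with the guest's updated context copied back out for the caller to decode.
 *
 *	struct trapframe gtf;
 *	struct vextframe gvf;
 *
 *	for (;;) {
 *		vmspace_ctl(id, VMSPACE_CTL_RUN, &gtf, &gvf);
 *		// inspect gtf: page fault, trap, or guest syscall,
 *		// service it, then loop to resume the guest
 *	}
 */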

/*
 * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
 *
 * Map memory within a VMSPACE.  This function is just like a normal mmap()
 * but operates on the vmspace's memory map.  Most callers use this to create
 * a MAP_VPAGETABLE mapping.
 */
int
sys_vmspace_mmap(struct vmspace_mmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	if ((vkp = curproc->p_vkernel) == NULL)
		return (EINVAL);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL)
		return (ENOENT);
	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->offset, &uap->sysmsg_resultp);
	return (error);
}
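
/*
 * Illustrative sketch (assumptions noted): a virtual kernel typically backs
 * guest "physical" memory with a file descriptor and maps the guest range
 * into the VMSPACE with MAP_VPAGETABLE so that guest page tables are
 * honored.  "ram_fd" and GUEST_VA_SIZE are hypothetical example values.
 *
 *	vmspace_mmap(id, 0, GUEST_VA_SIZE,
 *		     PROT_READ|PROT_WRITE|PROT_EXEC,
 *		     MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED, ram_fd, 0);
 */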

/*
 * vmspace_munmap(id, addr, len)
 *
 * Unmap memory within a VMSPACE.
 */
int
sys_vmspace_munmap(struct vmspace_munmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;

	if ((vkp = curproc->p_vkernel) == NULL)
		return (EINVAL);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL)
		return (ENOENT);

	/*
	 * Copied from sys_munmap()
	 */
	addr = (vm_offset_t)uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t)round_page(size);
	if (size < uap->len)		/* wrap */
		return (EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return (EINVAL);
	if (size == 0)
		return (0);

	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);
	map = &ve->vmspace->vm_map;
	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE))
		return (EINVAL);
	vm_map_remove(map, addr, addr + size);
	return (0);
}
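
/*
 * Worked example of the rounding logic above, assuming a 4KB page size:
 * with addr = 0x1234 and len = 0x100, pageoff is 0x234, so addr becomes
 * 0x1000 and size becomes 0x334, which round_page() expands to 0x1000.
 * The call therefore removes the whole page range [0x1000, 0x2000).  The
 * "size < uap->len" and "tmpaddr < addr" checks reject arguments whose
 * rounded range wraps past the end of the address space.
 */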

/*
 * vmspace_pread(id, buf, nbyte, flags, offset)
 *
 * Read data from a vmspace.  The number of bytes read is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes read is
 * less than the requested size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (Not yet implemented; after validating the id this currently returns
 * EINVAL.)
 */
int
sys_vmspace_pread(struct vmspace_pread_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;

	if ((vkp = curproc->p_vkernel) == NULL)
		return (EINVAL);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL)
		return (ENOENT);
	return (EINVAL);
}

/*
 * vmspace_pwrite(id, buf, nbyte, flags, offset)
 *
 * Write data to a vmspace.  The number of bytes written is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes written is
 * less than the requested size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (Not yet implemented; after validating the id this currently returns
 * EINVAL.)
 */
int
sys_vmspace_pwrite(struct vmspace_pwrite_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;

	if ((vkp = curproc->p_vkernel) == NULL)
		return (EINVAL);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL)
		return (ENOENT);
	return (EINVAL);
}

/*
 * vmspace_mcontrol(id, addr, len, behav, value)
 *
 * madvise/mcontrol support for a vmspace.
 */
int
sys_vmspace_mcontrol(struct vmspace_mcontrol_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;

	if ((vkp = curproc->p_vkernel) == NULL)
		return (EINVAL);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL)
		return (ENOENT);

	/*
	 * This code is basically copied from sys_mcontrol()
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END)
		return (EINVAL);

	if (tmpaddr < (vm_offset_t)uap->addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page(tmpaddr);

	return (vm_map_madvise(&ve->vmspace->vm_map, start, end,
				uap->behav, uap->value));
}
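
/*
 * Illustrative sketch (assumptions noted): on a MAP_VPAGETABLE mapping the
 * virtual kernel typically points the emulated page table at its root
 * directory and invalidates ranges after editing guest page table entries.
 * The control advisories in the MADV_CONTROL range are intended for this;
 * "root_pte" is a hypothetical root page-table entry value.
 *
 *	vmspace_mcontrol(id, NULL, GUEST_VA_SIZE, MADV_SETMAP, root_pte);
 *	...
 *	vmspace_mcontrol(id, va, PAGE_SIZE, MADV_INVAL, 0);
 */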

/*
 * Red black tree functions
 */
static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);

/*
 * a->id is the lookup key and is the only field that has to be initialized
 * in a dummy entry used for RB_FIND().
 */
static int
rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
{
	if ((char *)a->id < (char *)b->id)
		return(-1);
	else if ((char *)a->id > (char *)b->id)
		return(1);
	return(0);
}

static
int
rb_vmspace_delete(struct vmspace_entry *ve, void *data)
{
	struct vkernel_proc *vkp = data;

	KKASSERT(ve->refs == 0);
	vmspace_entry_delete(ve, vkp);
	return(0);
}

/*
 * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
 * up the pmap, the vm_map, then destroy the vmspace.
 */
static
void
vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp)
{
	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);

	pmap_remove_pages(vmspace_pmap(ve->vmspace),
			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	vm_map_remove(&ve->vmspace->vm_map,
		      VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	sysref_put(&ve->vmspace->vm_sysref);
	kfree(ve, M_VKERNEL);
}

static
struct vmspace_entry *
vkernel_find_vmspace(struct vkernel_proc *vkp, void *id)
{
	struct vmspace_entry *ve;
	struct vmspace_entry key;

	key.id = id;
	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
	return (ve);
}

/*
 * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
 * a vkernel process.
 */
void
vkernel_inherit(struct proc *p1, struct proc *p2)
{
	struct vkernel_proc *vkp;

	vkp = p1->p_vkernel;
	KKASSERT(vkp->refs > 0);
	atomic_add_int(&vkp->refs, 1);
	p2->p_vkernel = vkp;
}

void
vkernel_exit(struct proc *p)
{
	struct vkernel_proc *vkp;
	struct lwp *lp;
	int freeme = 0;

	vkp = p->p_vkernel;

	/*
	 * Restore the original VM context if we are killed while running
	 * a different one.
	 *
	 * This isn't supposed to happen.  What is supposed to happen is
	 * that the process should enter vkernel_trap() before handling
	 * the signal.
	 */
	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
		vkernel_lwp_exit(lp);
	}

	/*
	 * Dereference the common area
	 */
	p->p_vkernel = NULL;
	KKASSERT(vkp->refs > 0);
	spin_lock_wr(&vkp->spin);
	if (--vkp->refs == 0)
		freeme = 1;
	spin_unlock_wr(&vkp->spin);

	if (freeme) {
		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
			rb_vmspace_delete, vkp);
		kfree(vkp, M_VKERNEL);
	}
}

void
vkernel_lwp_exit(struct lwp *lp)
{
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;

	if ((vklp = lp->lwp_vkernel) != NULL) {
		if ((ve = vklp->ve) != NULL) {
			kprintf("Warning, pid %d killed with "
				"active VC!\n", lp->lwp_proc->p_pid);
			print_backtrace();
			pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
			vklp->ve = NULL;
			KKASSERT(ve->refs > 0);
			atomic_subtract_int(&ve->refs, 1);
		}
		lp->lwp_vkernel = NULL;
		kfree(vklp, M_VKERNEL);
	}
}

/*
 * A VM space under virtual kernel control trapped out or made a system call
 * or otherwise needs to return control to the virtual kernel context.
 */
int
vkernel_trap(struct lwp *lp, struct trapframe *frame)
{
	struct proc *p = lp->lwp_proc;
	struct vmspace_entry *ve;
	struct vkernel_lwp *vklp;
	int error;

	/*
	 * Which vmspace entry was running?
	 */
	vklp = lp->lwp_vkernel;
	KKASSERT(vklp);
	ve = vklp->ve;
	KKASSERT(ve != NULL);

	/*
	 * Switch the LWP vmspace back to the virtual kernel's VM space.
	 */
	vklp->ve = NULL;
	pmap_setlwpvm(lp, p->p_vmspace);
	KKASSERT(ve->refs > 0);
	atomic_subtract_int(&ve->refs, 1);

	/*
	 * Copy the emulated process frame to the virtual kernel process.
	 * The emulated process cannot change TLS descriptors so don't
	 * bother saving them, we already have a copy.
	 *
	 * Restore the virtual kernel's saved context so the virtual kernel
	 * process can resume.
	 */
	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
	      sizeof(vklp->save_vextframe.vx_tls));
	set_user_TLS();
	return(error);
}