xref: /openbsd/sys/uvm/uvm_mmap.c (revision db3296cf)
1 /*	$OpenBSD: uvm_mmap.c,v 1.49 2003/07/21 22:52:19 tedu Exp $	*/
2 /*	$NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $	*/
3 
4 /*
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * Copyright (c) 1991, 1993 The Regents of the University of California.
7  * Copyright (c) 1988 University of Utah.
8  *
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * the Systems Programming Group of the University of Utah Computer
13  * Science Department.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. All advertising materials mentioning features or use of this software
24  *    must display the following acknowledgement:
25  *      This product includes software developed by Charles D. Cranor,
26  *	Washington University, University of California, Berkeley and
27  *	its contributors.
28  * 4. Neither the name of the University nor the names of its contributors
29  *    may be used to endorse or promote products derived from this software
30  *    without specific prior written permission.
31  *
32  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
33  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
36  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
38  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
39  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42  * SUCH DAMAGE.
43  *
44  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
45  *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
46  * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
47  */
48 
49 /*
50  * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
51  * function.
52  */
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/file.h>
56 #include <sys/filedesc.h>
57 #include <sys/resourcevar.h>
58 #include <sys/mman.h>
59 #include <sys/mount.h>
60 #include <sys/proc.h>
61 #include <sys/malloc.h>
62 #include <sys/vnode.h>
63 #include <sys/conf.h>
64 #include <sys/stat.h>
65 
66 #include <miscfs/specfs/specdev.h>
67 
68 #include <sys/syscallargs.h>
69 
70 #include <uvm/uvm.h>
71 #include <uvm/uvm_device.h>
72 #include <uvm/uvm_vnode.h>
73 
74 
75 /*
76  * unimplemented VM system calls:
77  */
78 
79 /*
80  * sys_sbrk: sbrk system call.
81  */
82 
83 /* ARGSUSED */
84 int
85 sys_sbrk(p, v, retval)
86 	struct proc *p;
87 	void *v;
88 	register_t *retval;
89 {
90 #if 0
91 	struct sys_sbrk_args /* {
92 		syscallarg(intptr_t) incr;
93 	} */ *uap = v;
94 #endif
95 
96 	return (ENOSYS);
97 }
98 
99 /*
100  * sys_sstk: sstk system call.
101  */
102 
103 /* ARGSUSED */
104 int
105 sys_sstk(p, v, retval)
106 	struct proc *p;
107 	void *v;
108 	register_t *retval;
109 {
110 #if 0
111 	struct sys_sstk_args /* {
112 		syscallarg(int) incr;
113 	} */ *uap = v;
114 #endif
115 
116 	return (ENOSYS);
117 }
118 
119 /*
120  * sys_mquery: provide mapping hints to applications that do fixed mappings
121  *
122  * flags: 0 or MAP_FIXED (MAP_FIXED means that we insist on this addr and
123  *	don't care about PMAP_PREFER or such)
124  * addr: hint where we'd like to place the mapping.
125  * size: size of the mapping
126  * fd: fd of the file we want to map
127  * off: offset within the file
128  */
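
/*
 * A userland usage sketch (assuming the mquery(2) wrapper mirrors the
 * syscall arguments above, minus the pad, and returns MAP_FAILED on
 * failure like mmap(2); fd and len are set up by the caller):
 *
 *	void *want, *where;
 *
 *	want = (void *)0x40000000;
 *	where = mquery(want, len, PROT_READ, 0, fd, 0);
 *	if (where != MAP_FAILED)
 *		where = mmap(where, len, PROT_READ,
 *		    MAP_FIXED | MAP_PRIVATE, fd, 0);
 *
 * i.e. ask the kernel where such a mapping could go, then commit to that
 * address with MAP_FIXED.
 */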
129 
130 int
131 sys_mquery(p, v, retval)
132 	struct proc *p;
133 	void *v;
134 	register_t *retval;
135 {
136 	struct sys_mquery_args /* {
137 		syscallarg(caddr_t) addr;
138 		syscallarg(size_t) len;
139 		syscallarg(int) prot;
140 		syscallarg(int) flags;
141 		syscallarg(int) fd;
142 		syscallarg(long) pad;
143 		syscallarg(off_t) pos;
144 	} */ *uap = v;
145 	struct file *fp;
146 	struct uvm_object *uobj;
147 	voff_t uoff;
148 	int error;
149 	vaddr_t vaddr;
150 	int flags = 0;
151 	vsize_t size;
152 	vm_prot_t prot;
153 	int fd;
154 
155 	vaddr = (vaddr_t) SCARG(uap, addr);
156 	prot = SCARG(uap, prot);
157 	size = (vsize_t) SCARG(uap, len);
158 	fd = SCARG(uap, fd);
159 
160 	if ((prot & VM_PROT_ALL) != prot)
161 		return (EINVAL);
162 
163 	if (SCARG(uap, flags) & MAP_FIXED)
164 		flags |= UVM_FLAG_FIXED;
165 
166 	if (fd >= 0) {
167 		if ((error = getvnode(p->p_fd, fd, &fp)) != 0)
168 			return (error);
169 		uobj = &((struct vnode *)fp->f_data)->v_uvm.u_obj;
170 		uoff = SCARG(uap, pos);
171 	} else {
172 		fp = NULL;
173 		uobj = NULL;
174 		uoff = 0;
175 	}
176 
177 	if (vaddr == 0)
178 		vaddr = uvm_map_hint(p, prot);
179 
180 	/* prevent a user requested address from falling in heap space */
181 	if ((vaddr + size > (vaddr_t)p->p_vmspace->vm_daddr) &&
182 	    (vaddr < (vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ)) {
183 		if (flags & UVM_FLAG_FIXED) {
184 			error = EINVAL;
185 			goto done;
186 		}
187 		vaddr = round_page((vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ);
188 	}
189 again:
190 
191 	if (uvm_map_findspace(&p->p_vmspace->vm_map, vaddr, size,
192 	    &vaddr, uobj, uoff, 0, flags) == NULL) {
193 		if (flags & UVM_FLAG_FIXED)
194 			error = EINVAL;
195 		else
196 			error = ENOMEM;
197 	} else {
198 		/* prevent a returned address from falling in heap space */
199 		if ((vaddr + size > (vaddr_t)p->p_vmspace->vm_daddr)
200 		    && (vaddr < (vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ)) {
201 			vaddr = round_page((vaddr_t)p->p_vmspace->vm_daddr +
202 			    MAXDSIZ);
203 			goto again;
204 		}
205 		error = 0;
206 		*retval = (register_t)(vaddr);
207 	}
208 done:
209 	if (fp != NULL)
210 		FRELE(fp);
211 	return (error);
212 }
213 
214 /*
215  * sys_mincore: determine if pages are in core or not.
216  */
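
/*
 * Caller-side sizing sketch (standard mincore(2) usage assumed; addr, len
 * and a zeroed resident counter are set up by the caller): the status
 * vector needs one byte per page of the rounded-up range, which matches
 * the npgs computation below.
 *
 *	long pgsz = sysconf(_SC_PAGESIZE);
 *	size_t i, npgs = (len + pgsz - 1) / pgsz;
 *	char *vec = malloc(npgs);
 *
 *	if (vec != NULL && mincore(addr, len, vec) == 0)
 *		for (i = 0; i < npgs; i++)
 *			resident += (vec[i] != 0);
 */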
217 
218 /* ARGSUSED */
219 int
220 sys_mincore(p, v, retval)
221 	struct proc *p;
222 	void *v;
223 	register_t *retval;
224 {
225 	struct sys_mincore_args /* {
226 		syscallarg(void *) addr;
227 		syscallarg(size_t) len;
228 		syscallarg(char *) vec;
229 	} */ *uap = v;
230 	vm_page_t m;
231 	char *vec, pgi;
232 	struct uvm_object *uobj;
233 	struct vm_amap *amap;
234 	struct vm_anon *anon;
235 	vm_map_entry_t entry;
236 	vaddr_t start, end, lim;
237 	vm_map_t map;
238 	vsize_t len;
239 	int error = 0, npgs;
240 
241 	map = &p->p_vmspace->vm_map;
242 
243 	start = (vaddr_t)SCARG(uap, addr);
244 	len = SCARG(uap, len);
245 	vec = SCARG(uap, vec);
246 
247 	if (start & PAGE_MASK)
248 		return (EINVAL);
249 	len = round_page(len);
250 	end = start + len;
251 	if (end <= start)
252 		return (EINVAL);
253 
254 	npgs = len >> PAGE_SHIFT;
255 
256 	/*
257 	 * Lock down vec, so that storing a status byte cannot fault and
258 	 * leave the status bytes we have already copied out outdated.
259 	 */
260 	if ((error = uvm_vslock(p, vec, npgs, VM_PROT_WRITE)) != 0)
261 		return (error);
262 
263 	vm_map_lock_read(map);
264 
265 	if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
266 		error = ENOMEM;
267 		goto out;
268 	}
269 
270 	for (/* nothing */;
271 	     entry != &map->header && entry->start < end;
272 	     entry = entry->next) {
273 		KASSERT(!UVM_ET_ISSUBMAP(entry));
274 		KASSERT(start >= entry->start);
275 
276 		/* Make sure there are no holes. */
277 		if (entry->end < end &&
278 		     (entry->next == &map->header ||
279 		      entry->next->start > entry->end)) {
280 			error = ENOMEM;
281 			goto out;
282 		}
283 
284 		lim = end < entry->end ? end : entry->end;
285 
286 		/*
287 		 * Special case for objects with no "real" pages.  Those
288 		 * are always considered resident (mapped devices).
289 		 */
290 		if (UVM_ET_ISOBJ(entry)) {
291 			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
292 			if (entry->object.uvm_obj->pgops->pgo_releasepg
293 			    == NULL) {
294 				pgi = 1;
295 				for (/* nothing */; start < lim;
296 				     start += PAGE_SIZE, vec++)
297 					copyout(&pgi, vec, sizeof(char));
298 				continue;
299 			}
300 		}
301 
302 		amap = entry->aref.ar_amap;	/* top layer */
303 		uobj = entry->object.uvm_obj;	/* bottom layer */
304 
305 		if (amap != NULL)
306 			amap_lock(amap);
307 		if (uobj != NULL)
308 			simple_lock(&uobj->vmobjlock);
309 
310 		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
311 			pgi = 0;
312 			if (amap != NULL) {
313 				/* Check the top layer first. */
314 				anon = amap_lookup(&entry->aref,
315 				    start - entry->start);
316 				/* Don't need to lock anon here. */
317 				if (anon != NULL && anon->u.an_page != NULL) {
318 					/*
319 					 * Anon has the page for this entry
320 					 * offset.
321 					 */
322 					pgi = 1;
323 				}
324 			}
325 
326 			if (uobj != NULL && pgi == 0) {
327 				/* Check the bottom layer. */
328 				m = uvm_pagelookup(uobj,
329 				    entry->offset + (start - entry->start));
330 				if (m != NULL) {
331 					/*
332 					 * Object has the page for this entry
333 					 * offset.
334 					 */
335 					pgi = 1;
336 				}
337 			}
338 
339 			copyout(&pgi, vec, sizeof(char));
340 		}
341 
342 		if (uobj != NULL)
343 			simple_unlock(&uobj->vmobjlock);
344 		if (amap != NULL)
345 			amap_unlock(amap);
346 	}
347 
348  out:
349 	vm_map_unlock_read(map);
350 	uvm_vsunlock(p, SCARG(uap, vec), npgs);
351 	return (error);
352 }
353 
354 /*
355  * sys_mmap: mmap system call.
356  *
357  * => file offset and address may not be page aligned
358  *    - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
359  *    - if address isn't page aligned the mapping starts at trunc_page(addr)
360  *      and the return value is adjusted up by the page offset.
361  */
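
/*
 * A worked example of that adjustment, assuming 4KB pages
 * (PAGE_MASK == 0xfff) and a caller-supplied pos of 0x12345:
 *
 *	pageoff = pos & PAGE_MASK	= 0x345
 *	pos -= pageoff			-> 0x12000 (offset actually mapped)
 *	size += pageoff, then round_page(size)
 *	*retval = addr + pageoff	(points at file offset 0x12345)
 *
 * so the pointer handed back still refers to the byte at the offset the
 * caller asked for, even though the mapping itself is page aligned.
 */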
362 
363 int
364 sys_mmap(p, v, retval)
365 	struct proc *p;
366 	void *v;
367 	register_t *retval;
368 {
369 	struct sys_mmap_args /* {
370 		syscallarg(caddr_t) addr;
371 		syscallarg(size_t) len;
372 		syscallarg(int) prot;
373 		syscallarg(int) flags;
374 		syscallarg(int) fd;
375 		syscallarg(long) pad;
376 		syscallarg(off_t) pos;
377 	} */ *uap = v;
378 	vaddr_t addr;
379 	struct vattr va;
380 	off_t pos;
381 	vsize_t size, pageoff;
382 	vm_prot_t prot, maxprot;
383 	int flags, fd;
384 	vaddr_t vm_min_address = VM_MIN_ADDRESS;
385 	struct filedesc *fdp = p->p_fd;
386 	struct file *fp = NULL;
387 	struct vnode *vp;
388 	caddr_t handle;
389 	int error;
390 
391 	/*
392 	 * first, extract syscall args from the uap.
393 	 */
394 
395 	addr = (vaddr_t) SCARG(uap, addr);
396 	size = (vsize_t) SCARG(uap, len);
397 	prot = SCARG(uap, prot);
398 	flags = SCARG(uap, flags);
399 	fd = SCARG(uap, fd);
400 	pos = SCARG(uap, pos);
401 
402 	/*
403 	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
404 	 * validate the flags.
405 	 */
406 	if ((prot & VM_PROT_ALL) != prot)
407 		return (EINVAL);
408 	if ((flags & MAP_FLAGMASK) != flags)
409 		return (EINVAL);
410 	if (flags & MAP_COPY)
411 		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
412 	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
413 		return (EINVAL);
414 
415 	/*
416 	 * align file position and save offset.  adjust size.
417 	 */
418 
419 	pageoff = (pos & PAGE_MASK);
420 	pos  -= pageoff;
421 	size += pageoff;			/* add offset */
422 	size = (vsize_t) round_page(size);	/* round up */
423 	if ((ssize_t) size < 0)
424 		return (EINVAL);			/* don't allow wrap */
425 
426 	/*
427 	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
428 	 */
429 
430 	if (flags & MAP_FIXED) {
431 
432 		/* ensure address and file offset are aligned properly */
433 		addr -= pageoff;
434 		if (addr & PAGE_MASK)
435 			return (EINVAL);
436 
437 		if (VM_MAXUSER_ADDRESS > 0 &&
438 		    (addr + size) > VM_MAXUSER_ADDRESS)
439 			return (EINVAL);
440 		if (vm_min_address > 0 && addr < vm_min_address)
441 			return (EINVAL);
442 		if (addr > addr + size)
443 			return (EINVAL);		/* no wrapping! */
444 
445 	} else {
446 
447 		/*
448 		 * not fixed: make sure we skip over the largest possible heap.
449 		 * we will refine our guess later (e.g. to account for VAC, etc)
450 		 */
451 		if (addr == 0)
452 			addr = uvm_map_hint(p, prot);
453 		else if (!(flags & MAP_TRYFIXED) &&
454 		    addr < uvm_map_hint(p, prot))
455 			addr = uvm_map_hint(p, prot);
456 	}
457 
458 	/*
459 	 * check for file mappings (i.e. not anonymous) and verify file.
460 	 */
461 	if ((flags & MAP_ANON) == 0) {
462 
463 		if ((fp = fd_getfile(fdp, fd)) == NULL)
464 			return (EBADF);
465 
466 		FREF(fp);
467 
468 		if (fp->f_type != DTYPE_VNODE) {
469 			error = ENODEV;		/* only mmap vnodes! */
470 			goto out;
471 		}
472 		vp = (struct vnode *)fp->f_data;	/* convert to vnode */
473 
474 		if (vp->v_type != VREG && vp->v_type != VCHR &&
475 		    vp->v_type != VBLK) {
476 			error = ENODEV; /* only REG/CHR/BLK support mmap */
477 			goto out;
478 		}
479 
480 		if (vp->v_type == VREG && (pos + size) < pos) {
481 			error = EINVAL;		/* no offset wrapping */
482 			goto out;
483 		}
484 
485 		/* special case: catch SunOS style /dev/zero */
486 		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
487 			flags |= MAP_ANON;
488 			FRELE(fp);
489 			fp = NULL;
490 			goto is_anon;
491 		}
492 
493 		/*
494 		 * Old programs may not select a specific sharing type, so
495 		 * default to an appropriate one.
496 		 *
497 		 * XXX: how does MAP_ANON fit in the picture?
498 		 */
499 		if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
500 #if defined(DEBUG)
501 			printf("WARNING: defaulted mmap() share type to "
502 			   "%s (pid %d comm %s)\n", vp->v_type == VCHR ?
503 			   "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
504 			    p->p_comm);
505 #endif
506 			if (vp->v_type == VCHR)
507 				flags |= MAP_SHARED;	/* for a device */
508 			else
509 				flags |= MAP_PRIVATE;	/* for a file */
510 		}
511 
512 		/*
513 		 * MAP_PRIVATE device mappings don't make sense (and aren't
514 		 * supported anyway).  However, some programs rely on this,
515 		 * so just change it to MAP_SHARED.
516 		 */
517 		if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
518 			flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
519 		}
520 
521 		/*
522 		 * now check protection
523 		 */
524 
525 		maxprot = VM_PROT_EXECUTE;
526 
527 		/* check read access */
528 		if (fp->f_flag & FREAD)
529 			maxprot |= VM_PROT_READ;
530 		else if (prot & PROT_READ) {
531 			error = EACCES;
532 			goto out;
533 		}
534 
535 		/* check write access, shared case first */
536 		if (flags & MAP_SHARED) {
537 			/*
538 			 * if the file is writable, only add PROT_WRITE to
539 			 * maxprot if the file is not immutable or append-only.
540 			 * otherwise, if we have asked for PROT_WRITE, return
541 			 * EPERM.
542 			 */
543 			if (fp->f_flag & FWRITE) {
544 				if ((error =
545 				    VOP_GETATTR(vp, &va, p->p_ucred, p)))
546 					goto out;
547 				if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
548 					maxprot |= VM_PROT_WRITE;
549 				else if (prot & PROT_WRITE) {
550 					error = EPERM;
551 					goto out;
552 				}
553 			} else if (prot & PROT_WRITE) {
554 				error = EACCES;
555 				goto out;
556 			}
557 		} else {
558 			/* MAP_PRIVATE mappings can always be written to */
559 			maxprot |= VM_PROT_WRITE;
560 		}
561 
562 		/*
563 		 * set handle to vnode
564 		 */
565 
566 		handle = (caddr_t)vp;
567 
568 	} else {		/* MAP_ANON case */
569 		/*
570 		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
571 		 */
572 		if (fd != -1) {
573 			error = EINVAL;
574 			goto out;
575 		}
576 
577  is_anon:		/* label for SunOS style /dev/zero */
578 		handle = NULL;
579 		maxprot = VM_PROT_ALL;
580 		pos = 0;
581 	}
582 
583 	/*
584 	 * XXX (in)sanity check.  We don't do proper datasize checking
585 	 * XXX for anonymous (or private writable) mmap().  However,
586 	 * XXX for anonymous (or private writable) mmap().  However, we
587 	 * XXX know that if we're trying to allocate more than the amount
588 	 * XXX be disallowed.
589 	 */
590 	if ((flags & MAP_ANON) != 0 ||
591 	    ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
592 		if (size >
593 		    (p->p_rlimit[RLIMIT_DATA].rlim_cur - ctob(p->p_vmspace->vm_dsize))) {
594 			error = ENOMEM;
595 			goto out;
596 		}
597 	}
598 
599 	/*
600 	 * now let kernel internal function uvm_mmap do the work.
601 	 */
602 
603 	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
604 	    flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
605 
606 	if (error == 0)
607 		/* remember to add offset */
608 		*retval = (register_t)(addr + pageoff);
609 
610 out:
611 	if (fp)
612 		FRELE(fp);
613 	return (error);
614 }
615 
616 /*
617  * sys_msync: the msync system call (a front-end for flush)
618  */
619 
620 int
621 sys_msync(p, v, retval)
622 	struct proc *p;
623 	void *v;
624 	register_t *retval;
625 {
626 	struct sys_msync_args /* {
627 		syscallarg(caddr_t) addr;
628 		syscallarg(size_t) len;
629 		syscallarg(int) flags;
630 	} */ *uap = v;
631 	vaddr_t addr;
632 	vsize_t size, pageoff;
633 	vm_map_t map;
634 	int rv, flags, uvmflags;
635 
636 	/*
637 	 * extract syscall args from the uap
638 	 */
639 
640 	addr = (vaddr_t)SCARG(uap, addr);
641 	size = (vsize_t)SCARG(uap, len);
642 	flags = SCARG(uap, flags);
643 
644 	/* sanity check flags */
645 	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
646 			(flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
647 			(flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
648 		return (EINVAL);
649 	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
650 		flags |= MS_SYNC;
651 
652 	/*
653 	 * align the address to a page boundary, and adjust the size accordingly
654 	 */
655 
656 	pageoff = (addr & PAGE_MASK);
657 	addr -= pageoff;
658 	size += pageoff;
659 	size = (vsize_t) round_page(size);
660 
661 	/* disallow wrap-around. */
662 	if (addr + (ssize_t)size < addr)
663 		return (EINVAL);
664 
665 	/*
666 	 * get map
667 	 */
668 
669 	map = &p->p_vmspace->vm_map;
670 
671 	/*
672 	 * XXXCDC: do we really need this semantic?
673 	 *
674 	 * XXX Gak!  If size is zero we are supposed to sync "all modified
675 	 * pages with the region containing addr".  Unfortunately, we
676 	 * pages within the region containing addr".  Unfortunately, we
677 	 * by flushing the range of the map entry containing addr.
678 	 * This can be incorrect if the region splits or is coalesced
679 	 * with a neighbor.
680 	 */
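	/*
	 * For example, a caller doing
	 *
	 *	msync(p, 0, MS_SYNC);
	 *
	 * gets the whole map entry containing p flushed, which may cover
	 * more or less than the region originally mmap()ed at p.
	 */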
681 	if (size == 0) {
682 		vm_map_entry_t entry;
683 
684 		vm_map_lock_read(map);
685 		rv = uvm_map_lookup_entry(map, addr, &entry);
686 		if (rv == TRUE) {
687 			addr = entry->start;
688 			size = entry->end - entry->start;
689 		}
690 		vm_map_unlock_read(map);
691 		if (rv == FALSE)
692 			return (EINVAL);
693 	}
694 
695 	/*
696 	 * translate MS_ flags into PGO_ flags
697 	 */
698 	uvmflags = PGO_CLEANIT;
699 	if (flags & MS_INVALIDATE)
700 		uvmflags |= PGO_FREE;
701 	if (flags & MS_SYNC)
702 		uvmflags |= PGO_SYNCIO;
703 	else
704 		uvmflags |= PGO_SYNCIO;	 /* XXXCDC: force sync for now! */
705 
706 	/*
707 	 * doit!
708 	 */
709 	rv = uvm_map_clean(map, addr, addr+size, uvmflags);
710 
711 	/*
712 	 * and return...
713 	 */
714 	return (rv);
715 }
716 
717 /*
718  * sys_munmap: unmap a user's memory
719  */
720 
721 int
722 sys_munmap(p, v, retval)
723 	struct proc *p;
724 	void *v;
725 	register_t *retval;
726 {
727 	struct sys_munmap_args /* {
728 		syscallarg(caddr_t) addr;
729 		syscallarg(size_t) len;
730 	} */ *uap = v;
731 	vaddr_t addr;
732 	vsize_t size, pageoff;
733 	vm_map_t map;
734 	vaddr_t vm_min_address = VM_MIN_ADDRESS;
735 	struct vm_map_entry *dead_entries;
736 
737 	/*
738 	 * get syscall args...
739 	 */
740 
741 	addr = (vaddr_t) SCARG(uap, addr);
742 	size = (vsize_t) SCARG(uap, len);
743 
744 	/*
745 	 * align the address to a page boundary, and adjust the size accordingly
746 	 */
747 
748 	pageoff = (addr & PAGE_MASK);
749 	addr -= pageoff;
750 	size += pageoff;
751 	size = (vsize_t) round_page(size);
752 
753 	if ((ssize_t)size < 0)
754 		return (EINVAL);
755 	if (size == 0)
756 		return (0);
757 
758 	/*
759 	 * Check for illegal addresses.  Watch out for address wrap...
760 	 * Note that VM_*_ADDRESS are not constants due to casts (argh).
761 	 */
762 	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
763 		return (EINVAL);
764 	if (vm_min_address > 0 && addr < vm_min_address)
765 		return (EINVAL);
766 	if (addr > addr + size)
767 		return (EINVAL);
768 	map = &p->p_vmspace->vm_map;
769 
770 
771 	vm_map_lock(map);	/* lock map so we can checkprot */
772 
773 	/*
774 	 * interesting system call semantic: make sure entire range is
775 	 * allocated before allowing an unmap.
776 	 */
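	/*
	 * For example, if only a single page is mapped at addr, a
	 *
	 *	munmap(addr, 2 * PAGE_SIZE);
	 *
	 * from userland fails with EINVAL here instead of silently
	 * ignoring the hole.
	 */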
777 
778 	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
779 		vm_map_unlock(map);
780 		return (EINVAL);
781 	}
782 
783 	/*
784 	 * doit!
785 	 */
786 	uvm_unmap_remove(map, addr, addr + size, &dead_entries);
787 
788 	vm_map_unlock(map);	/* and unlock */
789 
790 	if (dead_entries != NULL)
791 		uvm_unmap_detach(dead_entries, 0);
792 
793 	return (0);
794 }
795 
796 /*
797  * sys_mprotect: the mprotect system call
798  */
799 
800 int
801 sys_mprotect(p, v, retval)
802 	struct proc *p;
803 	void *v;
804 	register_t *retval;
805 {
806 	struct sys_mprotect_args /* {
807 		syscallarg(caddr_t) addr;
808 		syscallarg(int) len;
809 		syscallarg(int) prot;
810 	} */ *uap = v;
811 	vaddr_t addr;
812 	vsize_t size, pageoff;
813 	vm_prot_t prot;
814 	int rv;
815 
816 	/*
817 	 * extract syscall args from uap
818 	 */
819 
820 	addr = (vaddr_t)SCARG(uap, addr);
821 	size = (vsize_t)SCARG(uap, len);
822 	prot = SCARG(uap, prot);
823 
824 	if ((prot & VM_PROT_ALL) != prot)
825 		return (EINVAL);
826 
827 	/*
828 	 * align the address to a page boundary, and adjust the size accordingly
829 	 */
830 	pageoff = (addr & PAGE_MASK);
831 	addr -= pageoff;
832 	size += pageoff;
833 	size = (vsize_t) round_page(size);
834 	if ((ssize_t)size < 0)
835 		return (EINVAL);
836 
837 	/*
838 	 * doit
839 	 */
840 
841 	rv = uvm_map_protect(&p->p_vmspace->vm_map,
842 			   addr, addr+size, prot, FALSE);
843 
844 	if (rv == KERN_SUCCESS)
845 		return (0);
846 	if (rv == KERN_PROTECTION_FAILURE)
847 		return (EACCES);
848 	return (EINVAL);
849 }
850 
851 /*
852  * sys_minherit: the minherit system call
853  */
854 
855 int
856 sys_minherit(p, v, retval)
857 	struct proc *p;
858 	void *v;
859 	register_t *retval;
860 {
861 	struct sys_minherit_args /* {
862 		syscallarg(caddr_t) addr;
863 		syscallarg(int) len;
864 		syscallarg(int) inherit;
865 	} */ *uap = v;
866 	vaddr_t addr;
867 	vsize_t size, pageoff;
868 	vm_inherit_t inherit;
869 
870 	addr = (vaddr_t)SCARG(uap, addr);
871 	size = (vsize_t)SCARG(uap, len);
872 	inherit = SCARG(uap, inherit);
873 	/*
874 	 * align the address to a page boundary, and adjust the size accordingly
875 	 */
876 
877 	pageoff = (addr & PAGE_MASK);
878 	addr -= pageoff;
879 	size += pageoff;
880 	size = (vsize_t) round_page(size);
881 
882 	if ((ssize_t)size < 0)
883 		return (EINVAL);
884 
885 	switch (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
886 			 inherit)) {
887 	case KERN_SUCCESS:
888 		return (0);
889 	case KERN_PROTECTION_FAILURE:
890 		return (EACCES);
891 	}
892 	return (EINVAL);
893 }
894 
895 /*
896  * sys_madvise: give advice about memory usage.
897  */
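
/*
 * The behaviours below differ mainly in what happens to the page
 * contents; from userland, for example:
 *
 *	madvise(buf, len, MADV_DONTNEED);	pages may be deactivated,
 *						but their contents stay
 *	madvise(buf, len, MADV_FREE);		contents (and any swap
 *						backing them) may be tossed
 */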
898 
899 /* ARGSUSED */
900 int
901 sys_madvise(p, v, retval)
902 	struct proc *p;
903 	void *v;
904 	register_t *retval;
905 {
906 	struct sys_madvise_args /* {
907 		syscallarg(caddr_t) addr;
908 		syscallarg(size_t) len;
909 		syscallarg(int) behav;
910 	} */ *uap = v;
911 	vaddr_t addr;
912 	vsize_t size, pageoff;
913 	int advice, rv;
914 
915 	addr = (vaddr_t)SCARG(uap, addr);
916 	size = (vsize_t)SCARG(uap, len);
917 	advice = SCARG(uap, behav);
918 
919 	/*
920 	 * align the address to a page boundary, and adjust the size accordingly
921 	 */
922 	pageoff = (addr & PAGE_MASK);
923 	addr -= pageoff;
924 	size += pageoff;
925 	size = (vsize_t) round_page(size);
926 
927 	if ((ssize_t)size <= 0)
928 		return (EINVAL);
929 
930 	switch (advice) {
931 	case MADV_NORMAL:
932 	case MADV_RANDOM:
933 	case MADV_SEQUENTIAL:
934 		rv = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
935 		    advice);
936 		break;
937 
938 	case MADV_WILLNEED:
939 		/*
940 		 * Activate all these pages, pre-faulting them in if
941 		 * necessary.
942 		 */
943 		/*
944 		 * XXX IMPLEMENT ME.
945 		 * Should invent a "weak" mode for uvm_fault()
946 		 * which would only do the PGO_LOCKED pgo_get().
947 		 */
948 		return (0);
949 
950 	case MADV_DONTNEED:
951 		/*
952 		 * Deactivate all these pages.  We don't need them
953 		 * any more.  We don't, however, toss the data in
954 		 * the pages.
955 		 */
956 		rv = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
957 		    PGO_DEACTIVATE);
958 		break;
959 
960 	case MADV_FREE:
961 		/*
962 		 * These pages contain no valid data, and may be
963 		 * garbage-collected.  Toss all resources, including
964 		 * any swap space in use.
965 		 */
966 		rv = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
967 		    PGO_FREE);
968 		break;
969 
970 	case MADV_SPACEAVAIL:
971 		/*
972 		 * XXXMRG What is this?  I think it's:
973 		 *
974 		 *	Ensure that we have allocated backing-store
975 		 *	for these pages.
976 		 *
977 		 * This is going to require changes to the page daemon,
978 		 * as it will free swap space allocated to pages in core.
979 		 * There's also what to do for device/file/anonymous memory.
980 		 */
981 		return (EINVAL);
982 
983 	default:
984 		return (EINVAL);
985 	}
986 
987 	return (rv);
988 }
989 
990 /*
991  * sys_mlock: memory lock
992  */
993 
994 int
995 sys_mlock(p, v, retval)
996 	struct proc *p;
997 	void *v;
998 	register_t *retval;
999 {
1000 	struct sys_mlock_args /* {
1001 		syscallarg(const void *) addr;
1002 		syscallarg(size_t) len;
1003 	} */ *uap = v;
1004 	vaddr_t addr;
1005 	vsize_t size, pageoff;
1006 	int error;
1007 
1008 	/*
1009 	 * extract syscall args from uap
1010 	 */
1011 	addr = (vaddr_t)SCARG(uap, addr);
1012 	size = (vsize_t)SCARG(uap, len);
1013 
1014 	/*
1015 	 * align the address to a page boundary and adjust the size accordingly
1016 	 */
1017 	pageoff = (addr & PAGE_MASK);
1018 	addr -= pageoff;
1019 	size += pageoff;
1020 	size = (vsize_t) round_page(size);
1021 
1022 	/* disallow wrap-around. */
1023 	if (addr + (ssize_t)size < addr)
1024 		return (EINVAL);
1025 
1026 	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
1027 		return (EAGAIN);
1028 
1029 #ifdef pmap_wired_count
1030 	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
1031 			p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
1032 		return (EAGAIN);
1033 #else
1034 	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
1035 		return (error);
1036 #endif
1037 
1038 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
1039 	    0);
1040 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1041 }
1042 
1043 /*
1044  * sys_munlock: unlock wired pages
1045  */
1046 
1047 int
1048 sys_munlock(p, v, retval)
1049 	struct proc *p;
1050 	void *v;
1051 	register_t *retval;
1052 {
1053 	struct sys_munlock_args /* {
1054 		syscallarg(const void *) addr;
1055 		syscallarg(size_t) len;
1056 	} */ *uap = v;
1057 	vaddr_t addr;
1058 	vsize_t size, pageoff;
1059 	int error;
1060 
1061 	/*
1062 	 * extract syscall args from uap
1063 	 */
1064 
1065 	addr = (vaddr_t)SCARG(uap, addr);
1066 	size = (vsize_t)SCARG(uap, len);
1067 
1068 	/*
1069 	 * align the address to a page boundary, and adjust the size accordingly
1070 	 */
1071 	pageoff = (addr & PAGE_MASK);
1072 	addr -= pageoff;
1073 	size += pageoff;
1074 	size = (vsize_t) round_page(size);
1075 
1076 	/* disallow wrap-around. */
1077 	if (addr + (ssize_t)size < addr)
1078 		return (EINVAL);
1079 
1080 #ifndef pmap_wired_count
1081 	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
1082 		return (error);
1083 #endif
1084 
1085 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
1086 	    0);
1087 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1088 }
1089 
1090 /*
1091  * sys_mlockall: lock all pages mapped into an address space.
1092  */
1093 
1094 int
1095 sys_mlockall(p, v, retval)
1096 	struct proc *p;
1097 	void *v;
1098 	register_t *retval;
1099 {
1100 	struct sys_mlockall_args /* {
1101 		syscallarg(int) flags;
1102 	} */ *uap = v;
1103 	int error, flags;
1104 
1105 	flags = SCARG(uap, flags);
1106 
1107 	if (flags == 0 ||
1108 	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
1109 		return (EINVAL);
1110 
1111 #ifndef pmap_wired_count
1112 	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
1113 		return (error);
1114 #endif
1115 
1116 	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
1117 	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1118 	switch (error) {
1119 	case KERN_SUCCESS:
1120 		error = 0;
1121 		break;
1122 
1123 	case KERN_NO_SPACE:	/* XXX overloaded */
1124 		error = ENOMEM;
1125 		break;
1126 
1127 	default:
1128 		/*
1129 		 * "Some or all of the memory could not be locked when
1130 		 * the call was made."
1131 		 */
1132 		error = EAGAIN;
1133 	}
1134 
1135 	return (error);
1136 }
1137 
1138 /*
1139  * sys_munlockall: unlock all pages mapped into an address space.
1140  */
1141 
1142 int
1143 sys_munlockall(p, v, retval)
1144 	struct proc *p;
1145 	void *v;
1146 	register_t *retval;
1147 {
1148 
1149 	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
1150 	return (0);
1151 }
1152 
1153 /*
1154  * uvm_mmap: internal version of mmap
1155  *
1156  * - used by sys_mmap, exec, and sysv shm
1157  * - handle is a vnode pointer or NULL for MAP_ANON (XXX: not true,
1158  *	sysv shm uses "named anonymous memory")
1159  * - caller must page-align the file offset
1160  */
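
/*
 * A minimal in-kernel usage sketch for an anonymous, copy-on-write
 * mapping, mirroring the sys_mmap() call above (error handling omitted;
 * "len" stands for the caller's byte count):
 *
 *	vaddr_t va = 0;
 *	int error;
 *
 *	error = uvm_mmap(&p->p_vmspace->vm_map, &va, round_page(len),
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
 *	    MAP_ANON | MAP_PRIVATE, NULL, 0,
 *	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
 */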
1161 
1162 int
1163 uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
1164 	vm_map_t map;
1165 	vaddr_t *addr;
1166 	vsize_t size;
1167 	vm_prot_t prot, maxprot;
1168 	int flags;
1169 	caddr_t handle;		/* XXX: VNODE? */
1170 	voff_t foff;
1171 	vsize_t locklimit;
1172 {
1173 	struct uvm_object *uobj;
1174 	struct vnode *vp;
1175 	int retval;
1176 	int advice = UVM_ADV_NORMAL;
1177 	uvm_flag_t uvmflag = 0;
1178 
1179 	/*
1180 	 * check params
1181 	 */
1182 
1183 	if (size == 0)
1184 		return(0);
1185 	if (foff & PAGE_MASK)
1186 		return(EINVAL);
1187 	if ((prot & maxprot) != prot)
1188 		return(EINVAL);
1189 
1190 	/*
1191 	 * for non-fixed mappings, round off the suggested address.
1192 	 * for fixed mappings, check alignment and zap old mappings.
1193 	 */
1194 
1195 	if ((flags & MAP_FIXED) == 0) {
1196 		*addr = round_page(*addr);	/* round */
1197 	} else {
1198 
1199 		if (*addr & PAGE_MASK)
1200 			return(EINVAL);
1201 		uvmflag |= UVM_FLAG_FIXED;
1202 		uvm_unmap(map, *addr, *addr + size);	/* zap! */
1203 	}
1204 
1205 	/*
1206 	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
1207 	 * to underlying vm object.
1208 	 */
1209 
1210 	if (flags & MAP_ANON) {
1211 		foff = UVM_UNKNOWN_OFFSET;
1212 		uobj = NULL;
1213 		if ((flags & MAP_SHARED) == 0)
1214 			/* XXX: defer amap create */
1215 			uvmflag |= UVM_FLAG_COPYONW;
1216 		else
1217 			/* shared: create amap now */
1218 			uvmflag |= UVM_FLAG_OVERLAY;
1219 
1220 	} else {
1221 
1222 		vp = (struct vnode *) handle;	/* get vnode */
1223 		if (vp->v_type != VCHR) {
1224 			uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ?
1225 			   maxprot : (maxprot & ~VM_PROT_WRITE));
1226 
1227 #ifndef UBC
1228 			/*
1229 			 * XXXCDC: hack from old code
1230 			 * don't allow vnodes which have been mapped
1231 			 * shared-writeable to persist [forces them to be
1232 			 * flushed out when last reference goes].
1233 			 * XXXCDC: interesting side effect: avoids a bug.
1234 			 * note that in WRITE [ufs_readwrite.c] that we
1235 			 * allocate buffer, uncache, and then do the write.
1236 			 * the problem with this is that if the uncache causes
1237 			 * VM data to be flushed to the same area of the file
1238 			 * we are writing to... in that case we've got the
1239 			 * buffer locked and our process goes to sleep forever.
1240 			 *
1241 			 * XXXCDC: checking maxprot protects us from the
1242 			 * "persistbug" program but this is not a long term
1243 			 * solution.
1244 			 *
1245 			 * XXXCDC: we don't bother calling uncache with the vp
1246 			 * VOP_LOCKed since we know that we are already
1247 			 * holding a valid reference to the uvn (from the
1248 			 * uvn_attach above), and thus it is impossible for
1249 			 * the uncache to kill the uvn and trigger I/O.
1250 			 */
1251 			if (flags & MAP_SHARED) {
1252 				if ((prot & VM_PROT_WRITE) ||
1253 				    (maxprot & VM_PROT_WRITE)) {
1254 					uvm_vnp_uncache(vp);
1255 				}
1256 			}
1257 #else
1258 			/* XXX for now, attach doesn't gain a ref */
1259 			VREF(vp);
1260 #endif
1261 		} else {
1262 			uobj = udv_attach((void *) &vp->v_rdev,
1263 			    (flags & MAP_SHARED) ? maxprot :
1264 			    (maxprot & ~VM_PROT_WRITE), foff, size);
1265 			/*
1266 			 * XXX Some devices don't like to be mapped with
1267 			 * XXX PROT_EXEC, but we don't really have a
1268 			 * XXX better way of handling this, right now
1269 			 */
1270 			if (uobj == NULL && (prot & PROT_EXEC) == 0) {
1271 				maxprot &= ~VM_PROT_EXECUTE;
1272 				uobj = udv_attach((void *) &vp->v_rdev,
1273 				    (flags & MAP_SHARED) ? maxprot :
1274 				    (maxprot & ~VM_PROT_WRITE), foff, size);
1275 			}
1276 			advice = UVM_ADV_RANDOM;
1277 		}
1278 
1279 		if (uobj == NULL)
1280 			return((vp->v_type == VREG) ? ENOMEM : EINVAL);
1281 
1282 		if ((flags & MAP_SHARED) == 0)
1283 			uvmflag |= UVM_FLAG_COPYONW;
1284 	}
1285 
1286 	/*
1287 	 * set up mapping flags
1288 	 */
1289 
1290 	uvmflag = UVM_MAPFLAG(prot, maxprot,
1291 			(flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
1292 			advice, uvmflag);
1293 
1294 	/*
1295 	 * do it!
1296 	 */
1297 
1298 	retval = uvm_map(map, addr, size, uobj, foff, 0, uvmflag);
1299 
1300 	if (retval == KERN_SUCCESS) {
1301 		/*
1302 		 * POSIX 1003.1b -- if our address space was configured
1303 		 * to lock all future mappings, wire the one we just made.
1304 		 */
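		/*
		 * That is, if the process has done a successful
		 * mlockall(MCL_FUTURE) (which sets VM_MAP_WIREFUTURE on
		 * its map), a later
		 *
		 *	mmap(NULL, len, PROT_READ | PROT_WRITE,
		 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
		 *
		 * is wired here before it is returned, provided it fits
		 * under the wiring limits checked below.
		 */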
1305 		if (prot == VM_PROT_NONE) {
1306 			/*
1307 			 * No more work to do in this case.
1308 			 */
1309 			return (0);
1310 		}
1311 
1312 		vm_map_lock(map);
1313 
1314 		if (map->flags & VM_MAP_WIREFUTURE) {
1315 			if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
1316 #ifdef pmap_wired_count
1317 			    || (locklimit != 0 && (size +
1318 			         ptoa(pmap_wired_count(vm_map_pmap(map)))) >
1319 			        locklimit)
1320 #endif
1321 			) {
1322 				retval = KERN_RESOURCE_SHORTAGE;
1323 				vm_map_unlock(map);
1324 				/* unmap the region! */
1325 				uvm_unmap(map, *addr, *addr + size);
1326 				goto bad;
1327 			}
1328 			/*
1329 			 * uvm_map_pageable() always returns the map
1330 			 * unlocked.
1331 			 */
1332 			retval = uvm_map_pageable(map, *addr, *addr + size,
1333 			    FALSE, UVM_LK_ENTER);
1334 			if (retval != KERN_SUCCESS) {
1335 				/* unmap the region! */
1336 				uvm_unmap(map, *addr, *addr + size);
1337 				goto bad;
1338 			}
1339 			return (0);
1340 		}
1341 
1342 		vm_map_unlock(map);
1343 
1344 		return (0);
1345 	}
1346 
1347 	/*
1348 	 * errors: first detach from the uobj, if any.
1349 	 */
1350 
1351 	if (uobj)
1352 		uobj->pgops->pgo_detach(uobj);
1353 
1354  bad:
1355 	return (retval);
1356 }
1357