1 /*	$NetBSD: uvm_mmap.c,v 1.159 2016/06/01 12:14:08 pgoyette Exp $	*/
2 
3 /*
4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
5  * Copyright (c) 1991, 1993 The Regents of the University of California.
6  * Copyright (c) 1988 University of Utah.
7  *
8  * All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * the Systems Programming Group of the University of Utah Computer
12  * Science Department.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
39  *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
40  * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
41  */
42 
43 /*
44  * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
45  * function.
46  */
47 
48 #include <sys/cdefs.h>
49 __KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.159 2016/06/01 12:14:08 pgoyette Exp $");
50 
51 #include "opt_compat_netbsd.h"
52 #include "opt_pax.h"
53 
54 #include <sys/types.h>
55 #include <sys/file.h>
56 #include <sys/filedesc.h>
57 #include <sys/resourcevar.h>
58 #include <sys/mman.h>
59 #include <sys/pax.h>
60 
61 #include <sys/syscallargs.h>
62 
63 #include <uvm/uvm.h>
64 #include <uvm/uvm_device.h>
65 
66 static int uvm_mmap(struct vm_map *, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t,
67 		    int, int, struct uvm_object *, voff_t, vsize_t);
68 
69 static int
70 range_test(struct vm_map *map, vaddr_t addr, vsize_t size, bool ismmap)
71 {
72 	vaddr_t vm_min_address = vm_map_min(map);
73 	vaddr_t vm_max_address = vm_map_max(map);
74 	vaddr_t eaddr = addr + size;
75 	int res = 0;
76 
77 	if (addr < vm_min_address)
78 		return EINVAL;
79 	if (eaddr > vm_max_address)
80 		return ismmap ? EFBIG : EINVAL;
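
	/*
	 * Illustrative example (hypothetical values): with a 32-bit
	 * vaddr_t, addr = 0xffff0000 and size = 0x20000 give
	 * eaddr = 0x00010000, i.e. eaddr < addr, so the request wrapped
	 * around the top of the address space and is rejected below.
	 */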
81 	if (addr > eaddr) /* no wrapping! */
82 		return ismmap ? EOVERFLOW : EINVAL;
83 
84 #ifdef MD_MMAP_RANGE_TEST
85 	res = MD_MMAP_RANGE_TEST(addr, eaddr);
86 #endif
87 
88 	return res;
89 }
90 
91 /*
92  * unimplemented VM system calls:
93  */
94 
95 /*
96  * sys_sbrk: sbrk system call.
97  */
98 
99 /* ARGSUSED */
100 int
101 sys_sbrk(struct lwp *l, const struct sys_sbrk_args *uap, register_t *retval)
102 {
103 	/* {
104 		syscallarg(intptr_t) incr;
105 	} */
106 
107 	return (ENOSYS);
108 }
109 
110 /*
111  * sys_sstk: sstk system call.
112  */
113 
114 /* ARGSUSED */
115 int
116 sys_sstk(struct lwp *l, const struct sys_sstk_args *uap, register_t *retval)
117 {
118 	/* {
119 		syscallarg(int) incr;
120 	} */
121 
122 	return (ENOSYS);
123 }
124 
125 /*
126  * sys_mincore: determine if pages are in core or not.
127  */
128 
129 /* ARGSUSED */
130 int
131 sys_mincore(struct lwp *l, const struct sys_mincore_args *uap,
132     register_t *retval)
133 {
134 	/* {
135 		syscallarg(void *) addr;
136 		syscallarg(size_t) len;
137 		syscallarg(char *) vec;
138 	} */
139 	struct proc *p = l->l_proc;
140 	struct vm_page *pg;
141 	char *vec, pgi;
142 	struct uvm_object *uobj;
143 	struct vm_amap *amap;
144 	struct vm_anon *anon;
145 	struct vm_map_entry *entry;
146 	vaddr_t start, end, lim;
147 	struct vm_map *map;
148 	vsize_t len;
149 	int error = 0, npgs;
150 
151 	map = &p->p_vmspace->vm_map;
152 
153 	start = (vaddr_t)SCARG(uap, addr);
154 	len = SCARG(uap, len);
155 	vec = SCARG(uap, vec);
156 
157 	if (start & PAGE_MASK)
158 		return (EINVAL);
159 	len = round_page(len);
160 	end = start + len;
161 	if (end <= start)
162 		return (EINVAL);
163 
164 	/*
165 	 * Lock down vec, so our returned status isn't outdated by
166 	 * storing the status byte for a page.
167 	 */
168 
169 	npgs = len >> PAGE_SHIFT;
170 	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
171 	if (error) {
172 		return error;
173 	}
174 	vm_map_lock_read(map);
175 
176 	if (uvm_map_lookup_entry(map, start, &entry) == false) {
177 		error = ENOMEM;
178 		goto out;
179 	}
180 
181 	for (/* nothing */;
182 	     entry != &map->header && entry->start < end;
183 	     entry = entry->next) {
184 		KASSERT(!UVM_ET_ISSUBMAP(entry));
185 		KASSERT(start >= entry->start);
186 
187 		/* Make sure there are no holes. */
188 		if (entry->end < end &&
189 		     (entry->next == &map->header ||
190 		      entry->next->start > entry->end)) {
191 			error = ENOMEM;
192 			goto out;
193 		}
194 
195 		lim = end < entry->end ? end : entry->end;
196 
197 		/*
198 		 * Special case for objects with no "real" pages.  Those
199 		 * are always considered resident (mapped devices).
200 		 */
201 
202 		if (UVM_ET_ISOBJ(entry)) {
203 			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
204 			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
205 				for (/* nothing */; start < lim;
206 				     start += PAGE_SIZE, vec++)
207 					subyte(vec, 1);
208 				continue;
209 			}
210 		}
211 
212 		amap = entry->aref.ar_amap;	/* upper layer */
213 		uobj = entry->object.uvm_obj;	/* lower layer */
214 
215 		if (amap != NULL)
216 			amap_lock(amap);
217 		if (uobj != NULL)
218 			mutex_enter(uobj->vmobjlock);
219 
220 		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
221 			pgi = 0;
222 			if (amap != NULL) {
223 				/* Check the upper layer first. */
224 				anon = amap_lookup(&entry->aref,
225 				    start - entry->start);
226 				/* Don't need to lock anon here. */
227 				if (anon != NULL && anon->an_page != NULL) {
228 
229 					/*
230 					 * Anon has the page for this entry
231 					 * offset.
232 					 */
233 
234 					pgi = 1;
235 				}
236 			}
237 			if (uobj != NULL && pgi == 0) {
238 				/* Check the lower layer. */
239 				pg = uvm_pagelookup(uobj,
240 				    entry->offset + (start - entry->start));
241 				if (pg != NULL) {
242 
243 					/*
244 					 * Object has the page for this entry
245 					 * offset.
246 					 */
247 
248 					pgi = 1;
249 				}
250 			}
251 			(void) subyte(vec, pgi);
252 		}
253 		if (uobj != NULL)
254 			mutex_exit(uobj->vmobjlock);
255 		if (amap != NULL)
256 			amap_unlock(amap);
257 	}
258 
259  out:
260 	vm_map_unlock_read(map);
261 	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
262 	return (error);
263 }
264 
265 /*
266  * sys_mmap: mmap system call.
267  *
268  * => file offset and address may not be page aligned
269  *    - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
270  *    - if address isn't page aligned the mapping starts at trunc_page(addr)
271  *      and the return value is adjusted up by the page offset.
272  */
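
/*
 * Worked example (illustrative only, assuming 4 KiB pages): a caller
 * passing pos = 0x1234 yields pageoff = 0x234.  The kernel maps from
 * file offset 0x1000, grows the mapping by 0x234 bytes (rounded up to
 * whole pages), and returns the chosen page-aligned address plus 0x234,
 * so the returned pointer has the same offset within a page as the
 * requested file position.
 */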
273 
274 int
275 sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
276 {
277 	/* {
278 		syscallarg(void *) addr;
279 		syscallarg(size_t) len;
280 		syscallarg(int) prot;
281 		syscallarg(int) flags;
282 		syscallarg(int) fd;
283 		syscallarg(long) pad;
284 		syscallarg(off_t) pos;
285 	} */
286 	struct proc *p = l->l_proc;
287 	vaddr_t addr;
288 	off_t pos;
289 	vsize_t size, pageoff, newsize;
290 	vm_prot_t prot, maxprot;
291 	int flags, fd, advice;
292 	vaddr_t defaddr;
293 	struct file *fp = NULL;
294 	struct uvm_object *uobj;
295 	int error;
296 #ifdef PAX_ASLR
297 	vaddr_t orig_addr;
298 #endif /* PAX_ASLR */
299 
300 	/*
301 	 * first, extract syscall args from the uap.
302 	 */
303 
304 	addr = (vaddr_t)SCARG(uap, addr);
305 	size = (vsize_t)SCARG(uap, len);
306 	prot = SCARG(uap, prot) & VM_PROT_ALL;
307 	flags = SCARG(uap, flags);
308 	fd = SCARG(uap, fd);
309 	pos = SCARG(uap, pos);
310 
311 #ifdef PAX_ASLR
312 	orig_addr = addr;
313 #endif /* PAX_ASLR */
314 
315 	/*
316 	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
317 	 * validate the flags.
318 	 */
319 	if (flags & MAP_COPY) {
320 		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
321 #if defined(COMPAT_10) && defined(__i386__)
322 		/*
323 		 * Ancient i386 kernels did not enforce PROT_EXEC, and ld.so
324 		 * did not turn it on.  On amd64 we take care of this in the
325 		 * compat32 code.
326 		 */
327 		prot |= PROT_EXEC;
328 #endif
329 	}
330 	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
331 		return (EINVAL);
332 
333 	/*
334 	 * align file position and save offset.  adjust size.
335 	 */
336 
337 	pageoff = (pos & PAGE_MASK);
338 	pos    -= pageoff;
339 	newsize = size + pageoff;		/* add offset */
340 	newsize = (vsize_t)round_page(newsize);	/* round up */
341 
342 	if (newsize < size)
343 		return (ENOMEM);
344 	size = newsize;
345 
346 	/*
347 	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
348 	 */
349 	if (flags & MAP_FIXED) {
350 
351 		/* ensure address and file offset are aligned properly */
352 		addr -= pageoff;
353 		if (addr & PAGE_MASK)
354 			return (EINVAL);
355 
356 		error = range_test(&p->p_vmspace->vm_map, addr, size, true);
357 		if (error) {
358 			return error;
359 		}
360 
361 	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {
362 
363 		/*
364 		 * not fixed: make sure we skip over the largest
365 		 * possible heap for non-topdown mapping arrangements.
366 		 * we will refine our guess later (e.g. to account for
367 		 * VAC, etc)
368 		 */
369 
370 		defaddr = p->p_emul->e_vm_default_addr(p,
371 		    (vaddr_t)p->p_vmspace->vm_daddr, size,
372 		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
373 
374 		if (addr == 0 ||
375 		    !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
376 			addr = MAX(addr, defaddr);
377 		else
378 			addr = MIN(addr, defaddr);
379 	}
380 
381 	/*
382 	 * check for file mappings (i.e. not anonymous) and verify file.
383 	 */
384 
385 	advice = UVM_ADV_NORMAL;
386 	if ((flags & MAP_ANON) == 0) {
387 		if ((fp = fd_getfile(fd)) == NULL)
388 			return (EBADF);
389 
390 		if (fp->f_ops->fo_mmap == NULL) {
391 			error = ENODEV;
392 			goto out;
393 		}
394 		error = (*fp->f_ops->fo_mmap)(fp, &pos, size, prot, &flags,
395 					      &advice, &uobj, &maxprot);
396 		if (error) {
397 			goto out;
398 		}
399 		if (uobj == NULL) {
400 			flags |= MAP_ANON;
401 			fd_putfile(fd);
402 			fp = NULL;
403 			goto is_anon;
404 		}
405 	} else {		/* MAP_ANON case */
406 		/*
407 		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
408 		 */
409 		if (fd != -1)
410 			return (EINVAL);
411 
412  is_anon:		/* label for SunOS style /dev/zero */
413 		uobj = NULL;
414 		maxprot = VM_PROT_ALL;
415 		pos = 0;
416 	}
417 
418 	PAX_MPROTECT_ADJUST(l, &prot, &maxprot);
419 
420 	pax_aslr_mmap(l, &addr, orig_addr, flags);
421 
422 	/*
423 	 * now let kernel internal function uvm_mmap do the work.
424 	 */
425 
426 	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
427 	    flags, advice, uobj, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
428 
429 	/* remember to add offset */
430 	*retval = (register_t)(addr + pageoff);
431 
432  out:
433 	if (fp != NULL)
434 		fd_putfile(fd);
435 
436 	return (error);
437 }
438 
439 /*
440  * sys___msync13: the msync system call (a front-end for flush)
441  */
442 
443 int
444 sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
445     register_t *retval)
446 {
447 	/* {
448 		syscallarg(void *) addr;
449 		syscallarg(size_t) len;
450 		syscallarg(int) flags;
451 	} */
452 	struct proc *p = l->l_proc;
453 	vaddr_t addr;
454 	vsize_t size, pageoff;
455 	struct vm_map *map;
456 	int error, flags, uvmflags;
457 	bool rv;
458 
459 	/*
460 	 * extract syscall args from the uap
461 	 */
462 
463 	addr = (vaddr_t)SCARG(uap, addr);
464 	size = (vsize_t)SCARG(uap, len);
465 	flags = SCARG(uap, flags);
466 
467 	/* sanity check flags */
468 	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
469 	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
470 	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
471 		return (EINVAL);
472 	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
473 		flags |= MS_SYNC;
474 
475 	/*
476 	 * align the address to a page boundary and adjust the size accordingly.
477 	 */
478 
479 	pageoff = (addr & PAGE_MASK);
480 	addr -= pageoff;
481 	size += pageoff;
482 	size = (vsize_t)round_page(size);
483 
484 
485 	/*
486 	 * get map
487 	 */
488 	map = &p->p_vmspace->vm_map;
489 
490 	error = range_test(map, addr, size, false);
491 	if (error)
492 		return error;
493 
494 	/*
495 	 * XXXCDC: do we really need this semantic?
496 	 *
497 	 * XXX Gak!  If size is zero we are supposed to sync "all modified
498 	 * pages within the region containing addr".  Unfortunately, we
499 	 * don't really keep track of individual mmaps so we approximate
500 	 * by flushing the range of the map entry containing addr.
501 	 * This can be incorrect if the region splits or is coalesced
502 	 * with a neighbor.
503 	 */
504 
505 	if (size == 0) {
506 		struct vm_map_entry *entry;
507 
508 		vm_map_lock_read(map);
509 		rv = uvm_map_lookup_entry(map, addr, &entry);
510 		if (rv == true) {
511 			addr = entry->start;
512 			size = entry->end - entry->start;
513 		}
514 		vm_map_unlock_read(map);
515 		if (rv == false)
516 			return (EINVAL);
517 	}
518 
519 	/*
520 	 * translate MS_ flags into PGO_ flags
521 	 */
522 
523 	uvmflags = PGO_CLEANIT;
524 	if (flags & MS_INVALIDATE)
525 		uvmflags |= PGO_FREE;
526 	if (flags & MS_SYNC)
527 		uvmflags |= PGO_SYNCIO;
528 
529 	error = uvm_map_clean(map, addr, addr+size, uvmflags);
530 	return error;
531 }
532 
533 /*
534  * sys_munmap: unmap a user's memory
535  */
536 
537 int
538 sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
539 {
540 	/* {
541 		syscallarg(void *) addr;
542 		syscallarg(size_t) len;
543 	} */
544 	struct proc *p = l->l_proc;
545 	vaddr_t addr;
546 	vsize_t size, pageoff;
547 	struct vm_map *map;
548 	struct vm_map_entry *dead_entries;
549 	int error;
550 
551 	/*
552 	 * get syscall args.
553 	 */
554 
555 	addr = (vaddr_t)SCARG(uap, addr);
556 	size = (vsize_t)SCARG(uap, len);
557 
558 	/*
559 	 * align the address to a page boundary and adjust the size accordingly.
560 	 */
561 
562 	pageoff = (addr & PAGE_MASK);
563 	addr -= pageoff;
564 	size += pageoff;
565 	size = (vsize_t)round_page(size);
566 
567 	if (size == 0)
568 		return (0);
569 
570 	map = &p->p_vmspace->vm_map;
571 
572 	error = range_test(map, addr, size, false);
573 	if (error)
574 		return error;
575 
576 	/*
577 	 * interesting system call semantic: make sure entire range is
578 	 * allocated before allowing an unmap.
579 	 */
580 
581 	vm_map_lock(map);
582 #if 0
583 	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
584 		vm_map_unlock(map);
585 		return (EINVAL);
586 	}
587 #endif
588 	uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0);
589 	vm_map_unlock(map);
590 	if (dead_entries != NULL)
591 		uvm_unmap_detach(dead_entries, 0);
592 	return (0);
593 }
594 
595 /*
596  * sys_mprotect: the mprotect system call
597  */
598 
599 int
600 sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap,
601     register_t *retval)
602 {
603 	/* {
604 		syscallarg(void *) addr;
605 		syscallarg(size_t) len;
606 		syscallarg(int) prot;
607 	} */
608 	struct proc *p = l->l_proc;
609 	vaddr_t addr;
610 	vsize_t size, pageoff;
611 	vm_prot_t prot;
612 	int error;
613 
614 	/*
615 	 * extract syscall args from uap
616 	 */
617 
618 	addr = (vaddr_t)SCARG(uap, addr);
619 	size = (vsize_t)SCARG(uap, len);
620 	prot = SCARG(uap, prot) & VM_PROT_ALL;
621 
622 	/*
623 	 * align the address to a page boundary and adjust the size accordingly.
624 	 */
625 
626 	pageoff = (addr & PAGE_MASK);
627 	addr -= pageoff;
628 	size += pageoff;
629 	size = round_page(size);
630 
631 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
632 	if (error)
633 		return error;
634 
635 	error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
636 				false);
637 	return error;
638 }
639 
640 /*
641  * sys_minherit: the minherit system call
642  */
643 
644 int
645 sys_minherit(struct lwp *l, const struct sys_minherit_args *uap,
646    register_t *retval)
647 {
648 	/* {
649 		syscallarg(void *) addr;
650 		syscallarg(int) len;
651 		syscallarg(int) inherit;
652 	} */
653 	struct proc *p = l->l_proc;
654 	vaddr_t addr;
655 	vsize_t size, pageoff;
656 	vm_inherit_t inherit;
657 	int error;
658 
659 	addr = (vaddr_t)SCARG(uap, addr);
660 	size = (vsize_t)SCARG(uap, len);
661 	inherit = SCARG(uap, inherit);
662 
663 	/*
664 	 * align the address to a page boundary and adjust the size accordingly.
665 	 */
666 
667 	pageoff = (addr & PAGE_MASK);
668 	addr -= pageoff;
669 	size += pageoff;
670 	size = (vsize_t)round_page(size);
671 
672 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
673 	if (error)
674 		return error;
675 
676 	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
677 				inherit);
678 	return error;
679 }
680 
681 /*
682  * sys_madvise: give advice about memory usage.
683  */
684 
685 /* ARGSUSED */
686 int
687 sys_madvise(struct lwp *l, const struct sys_madvise_args *uap,
688    register_t *retval)
689 {
690 	/* {
691 		syscallarg(void *) addr;
692 		syscallarg(size_t) len;
693 		syscallarg(int) behav;
694 	} */
695 	struct proc *p = l->l_proc;
696 	vaddr_t addr;
697 	vsize_t size, pageoff;
698 	int advice, error;
699 
700 	addr = (vaddr_t)SCARG(uap, addr);
701 	size = (vsize_t)SCARG(uap, len);
702 	advice = SCARG(uap, behav);
703 
704 	/*
705 	 * align the address to a page boundary, and adjust the size accordingly
706 	 */
707 
708 	pageoff = (addr & PAGE_MASK);
709 	addr -= pageoff;
710 	size += pageoff;
711 	size = (vsize_t)round_page(size);
712 
713 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
714 	if (error)
715 		return error;
716 
717 	switch (advice) {
718 	case MADV_NORMAL:
719 	case MADV_RANDOM:
720 	case MADV_SEQUENTIAL:
721 		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
722 		    advice);
723 		break;
724 
725 	case MADV_WILLNEED:
726 
727 		/*
728 		 * Activate all these pages, pre-faulting them in if
729 		 * necessary.
730 		 */
731 		error = uvm_map_willneed(&p->p_vmspace->vm_map,
732 		    addr, addr + size);
733 		break;
734 
735 	case MADV_DONTNEED:
736 
737 		/*
738 		 * Deactivate all these pages.  We don't need them
739 		 * any more.  We don't, however, toss the data in
740 		 * the pages.
741 		 */
742 
743 		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
744 		    PGO_DEACTIVATE);
745 		break;
746 
747 	case MADV_FREE:
748 
749 		/*
750 		 * These pages contain no valid data, and may be
751 		 * garbage-collected.  Toss all resources, including
752 		 * any swap space in use.
753 		 */
754 
755 		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
756 		    PGO_FREE);
757 		break;
758 
759 	case MADV_SPACEAVAIL:
760 
761 		/*
762 		 * XXXMRG What is this?  I think it's:
763 		 *
764 		 *	Ensure that we have allocated backing-store
765 		 *	for these pages.
766 		 *
767 		 * This is going to require changes to the page daemon,
768 		 * as it will free swap space allocated to pages in core.
769 	 * It is also unclear what to do for device/file/anonymous memory.
770 		 */
771 
772 		return (EINVAL);
773 
774 	default:
775 		return (EINVAL);
776 	}
777 
778 	return error;
779 }
780 
781 /*
782  * sys_mlock: memory lock
783  */
784 
785 int
786 sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
787 {
788 	/* {
789 		syscallarg(const void *) addr;
790 		syscallarg(size_t) len;
791 	} */
792 	struct proc *p = l->l_proc;
793 	vaddr_t addr;
794 	vsize_t size, pageoff;
795 	int error;
796 
797 	/*
798 	 * extract syscall args from uap
799 	 */
800 
801 	addr = (vaddr_t)SCARG(uap, addr);
802 	size = (vsize_t)SCARG(uap, len);
803 
804 	/*
805 	 * align the address to a page boundary and adjust the size accordingly
806 	 */
807 
808 	pageoff = (addr & PAGE_MASK);
809 	addr -= pageoff;
810 	size += pageoff;
811 	size = (vsize_t)round_page(size);
812 
813 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
814 	if (error)
815 		return error;
816 
817 	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
818 		return (EAGAIN);
819 
820 	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
821 			p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
822 		return (EAGAIN);
823 
824 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
825 	    0);
826 	if (error == EFAULT)
827 		error = ENOMEM;
828 	return error;
829 }
830 
831 /*
832  * sys_munlock: unlock wired pages
833  */
834 
835 int
836 sys_munlock(struct lwp *l, const struct sys_munlock_args *uap,
837     register_t *retval)
838 {
839 	/* {
840 		syscallarg(const void *) addr;
841 		syscallarg(size_t) len;
842 	} */
843 	struct proc *p = l->l_proc;
844 	vaddr_t addr;
845 	vsize_t size, pageoff;
846 	int error;
847 
848 	/*
849 	 * extract syscall args from uap
850 	 */
851 
852 	addr = (vaddr_t)SCARG(uap, addr);
853 	size = (vsize_t)SCARG(uap, len);
854 
855 	/*
856 	 * align the address to a page boundary, and adjust the size accordingly
857 	 */
858 
859 	pageoff = (addr & PAGE_MASK);
860 	addr -= pageoff;
861 	size += pageoff;
862 	size = (vsize_t)round_page(size);
863 
864 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
865 	if (error)
866 		return error;
867 
868 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true,
869 	    0);
870 	if (error == EFAULT)
871 		error = ENOMEM;
872 	return error;
873 }
874 
875 /*
876  * sys_mlockall: lock all pages mapped into an address space.
877  */
878 
879 int
880 sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap,
881     register_t *retval)
882 {
883 	/* {
884 		syscallarg(int) flags;
885 	} */
886 	struct proc *p = l->l_proc;
887 	int error, flags;
888 
889 	flags = SCARG(uap, flags);
890 
891 	if (flags == 0 ||
892 	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
893 		return (EINVAL);
894 
895 	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
896 	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
897 	return (error);
898 }
899 
900 /*
901  * sys_munlockall: unlock all pages mapped into an address space.
902  */
903 
904 int
905 sys_munlockall(struct lwp *l, const void *v, register_t *retval)
906 {
907 	struct proc *p = l->l_proc;
908 
909 	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
910 	return (0);
911 }
912 
913 /*
914  * uvm_mmap: internal version of mmap
915  *
916  * - used by sys_mmap and various framebuffers
917  * - uobj is a struct uvm_object pointer or NULL for MAP_ANON
918  * - caller must page-align the file offset
919  */
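
/*
 * Call-shape sketch (illustrative only; it mirrors uvm_mmap_anon() near
 * the end of this file, and all values here are hypothetical):
 *
 *	vaddr_t va = 0;
 *	error = uvm_mmap(&p->p_vmspace->vm_map, &va, round_page(len),
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE,
 *	    MAP_PRIVATE | MAP_ANON, UVM_ADV_NORMAL, NULL, 0,
 *	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
 */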
920 
921 int
922 uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
923     vm_prot_t maxprot, int flags, int advice, struct uvm_object *uobj,
924     voff_t foff, vsize_t locklimit)
925 {
926 	vaddr_t align = 0;
927 	int error;
928 	uvm_flag_t uvmflag = 0;
929 
930 	/*
931 	 * check params
932 	 */
933 
934 	if (size == 0)
935 		return(0);
936 	if (foff & PAGE_MASK)
937 		return(EINVAL);
938 	if ((prot & maxprot) != prot)
939 		return(EINVAL);
940 
941 	/*
942 	 * for non-fixed mappings, round off the suggested address.
943 	 * for fixed mappings, check alignment and zap old mappings.
944 	 */
945 
946 	if ((flags & MAP_FIXED) == 0) {
947 		*addr = round_page(*addr);
948 	} else {
949 		if (*addr & PAGE_MASK)
950 			return(EINVAL);
951 		uvmflag |= UVM_FLAG_FIXED;
952 		(void) uvm_unmap(map, *addr, *addr + size);
953 	}
954 
955 	/*
956 	 * Try to see if any requested alignment can even be attempted.
957 	 * Make sure we can express the alignment (asking for a >= 4GB
958 	 * alignment on an ILP32 architecture makes no sense) and that
959 	 * the alignment is at least a page-sized quantity.  If the
960 	 * request was for a fixed mapping, make sure the supplied
961 	 * address adheres to the requested alignment.
962 	 */
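	/*
	 * Example (illustrative): MAP_ALIGNED(16) in the mmap flags encodes
	 * log2 of the wanted alignment, so the value extracted below is 16
	 * and becomes a 1L << 16 (64 KiB) alignment request.
	 */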
963 	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
964 	if (align) {
965 		if (align >= sizeof(vaddr_t) * NBBY)
966 			return(EINVAL);
967 		align = 1L << align;
968 		if (align < PAGE_SIZE)
969 			return(EINVAL);
970 		if (align >= vm_map_max(map))
971 			return(ENOMEM);
972 		if (flags & MAP_FIXED) {
973 			if ((*addr & (align-1)) != 0)
974 				return(EINVAL);
975 			align = 0;
976 		}
977 	}
978 
979 	/*
980 	 * check resource limits
981 	 */
982 
983 	if (!VM_MAP_IS_KERNEL(map) &&
984 	    (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
985 	    curproc->p_rlimit[RLIMIT_AS].rlim_cur))
986 		return ENOMEM;
987 
988 	/*
989 	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
990 	 * to underlying vm object.
991 	 */
992 
993 	if (flags & MAP_ANON) {
994 		KASSERT(uobj == NULL);
995 		foff = UVM_UNKNOWN_OFFSET;
996 		if ((flags & MAP_SHARED) == 0)
997 			/* XXX: defer amap create */
998 			uvmflag |= UVM_FLAG_COPYONW;
999 		else
1000 			/* shared: create amap now */
1001 			uvmflag |= UVM_FLAG_OVERLAY;
1002 
1003 	} else {
1004 		KASSERT(uobj != NULL);
1005 		if ((flags & MAP_SHARED) == 0) {
1006 			uvmflag |= UVM_FLAG_COPYONW;
1007 		}
1008 	}
1009 
1010 	uvmflag = UVM_MAPFLAG(prot, maxprot,
1011 			(flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
1012 			advice, uvmflag);
1013 	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
1014 	if (error) {
1015 		if (uobj)
1016 			uobj->pgops->pgo_detach(uobj);
1017 		return error;
1018 	}
1019 
1020 	/*
1021 	 * POSIX 1003.1b -- if our address space was configured
1022 	 * to lock all future mappings, wire the one we just made.
1023 	 *
1024 	 * Also handle the MAP_WIRED flag here.
1025 	 */
1026 
1027 	if (prot == VM_PROT_NONE) {
1028 
1029 		/*
1030 		 * No more work to do in this case.
1031 		 */
1032 
1033 		return (0);
1034 	}
1035 	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
1036 		vm_map_lock(map);
1037 		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
1038 		    (locklimit != 0 &&
1039 		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
1040 		     locklimit)) {
1041 			vm_map_unlock(map);
1042 			uvm_unmap(map, *addr, *addr + size);
1043 			return ENOMEM;
1044 		}
1045 
1046 		/*
1047 		 * uvm_map_pageable() always returns the map unlocked.
1048 		 */
1049 
1050 		error = uvm_map_pageable(map, *addr, *addr + size,
1051 					 false, UVM_LK_ENTER);
1052 		if (error) {
1053 			uvm_unmap(map, *addr, *addr + size);
1054 			return error;
1055 		}
1056 		return (0);
1057 	}
1058 	return 0;
1059 }
1060 
1061 vaddr_t
1062 uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
1063 {
1064 
1065 	if (topdown)
1066 		return VM_DEFAULT_ADDRESS_TOPDOWN(base, sz);
1067 	else
1068 		return VM_DEFAULT_ADDRESS_BOTTOMUP(base, sz);
1069 }
1070 
1071 int
1072 uvm_mmap_dev(struct proc *p, void **addrp, size_t len, dev_t dev,
1073     off_t off)
1074 {
1075 	struct uvm_object *uobj;
1076 	int error, flags, prot;
1077 
1078 	flags = MAP_SHARED;
1079 	prot = VM_PROT_READ | VM_PROT_WRITE;
1080 	if (*addrp)
1081 		flags |= MAP_FIXED;
1082 	else
1083 		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
1084 		    (vaddr_t)p->p_vmspace->vm_daddr, len,
1085 		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
1086 
1087 	uobj = udv_attach(dev, prot, off, len);
1088 	if (uobj == NULL)
1089 		return EINVAL;
1090 
1091 	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
1092 			 (vsize_t)len, prot, prot, flags, UVM_ADV_RANDOM,
1093 			 uobj, off, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1094 	return error;
1095 }
1096 
1097 int
1098 uvm_mmap_anon(struct proc *p, void **addrp, size_t len)
1099 {
1100 	int error, flags, prot;
1101 
1102 	flags = MAP_PRIVATE | MAP_ANON;
1103 	prot = VM_PROT_READ | VM_PROT_WRITE;
1104 	if (*addrp)
1105 		flags |= MAP_FIXED;
1106 	else
1107 		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
1108 		    (vaddr_t)p->p_vmspace->vm_daddr, len,
1109 		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
1110 
1111 	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
1112 			 (vsize_t)len, prot, prot, flags, UVM_ADV_NORMAL,
1113 			 NULL, 0, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1114 	return error;
1115 }
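
/*
 * Usage sketch (hypothetical driver code, not part of this file):
 * mapping 64 KiB of a device into the calling process with the
 * uvm_mmap_dev() helper above; "sc->sc_dev" is an assumed softc
 * member holding the driver's dev_t.
 *
 *	void *va = NULL;	// let UVM pick the address
 *	int error = uvm_mmap_dev(curproc, &va, 0x10000, sc->sc_dev, 0);
 *	if (error)
 *		return error;
 *	// va is now a user virtual address backed by the device pager
 */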
1116