1 /* $OpenBSD: uvm_mmap.c,v 1.193 2024/12/14 12:07:38 mvs Exp $ */
2 /* $NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $ */
3
4 /*
5 * Copyright (c) 1997 Charles D. Cranor and Washington University.
6 * Copyright (c) 1991, 1993 The Regents of the University of California.
7 * Copyright (c) 1988 University of Utah.
8 *
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
40 * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94
41 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
42 */
43
44 /*
45 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
46 * function.
47 */
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/filedesc.h>
53 #include <sys/resourcevar.h>
54 #include <sys/mman.h>
55 #include <sys/mount.h>
56 #include <sys/proc.h>
57 #include <sys/malloc.h>
58 #include <sys/vnode.h>
59 #include <sys/conf.h>
60 #include <sys/signalvar.h>
61 #include <sys/syslog.h>
62 #include <sys/stat.h>
63 #include <sys/specdev.h>
64 #include <sys/stdint.h>
65 #include <sys/pledge.h>
66 #include <sys/unistd.h> /* for KBIND* */
67 #include <sys/user.h>
68
69 #include <machine/exec.h> /* for __LDPGSZ */
70
71 #include <sys/syscall.h>
72 #include <sys/syscallargs.h>
73
74 #include <uvm/uvm.h>
75 #include <uvm/uvm_device.h>
76 #include <uvm/uvm_vnode.h>
77
78 /*
79 * Locks used to protect data:
80 * a atomic
81 */
82
83 int uvm_mmapanon(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
84 vsize_t, struct proc *);
85 int uvm_mmapfile(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
86 struct vnode *, voff_t, vsize_t, struct proc *);
87
88
89 /*
90 * Page align addr and size, returning EINVAL on wraparound.
91 */
92 #define ALIGN_ADDR(addr, size, pageoff) do { \
93 pageoff = (addr & PAGE_MASK); \
94 if (pageoff != 0) { \
95 if (size > SIZE_MAX - pageoff) \
96 return EINVAL; /* wraparound */ \
97 addr -= pageoff; \
98 size += pageoff; \
99 } \
100 if (size != 0) { \
101 size = (vsize_t)round_page(size); \
102 if (size == 0) \
103 return EINVAL; /* wraparound */ \
104 } \
105 } while (0)
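/*
* Illustrative only (assuming 4 KiB pages): for addr = 0x1003 and
* size = 0x20, ALIGN_ADDR() leaves pageoff = 0x3, addr = 0x1000 and
* size = round_page(0x23) = 0x1000, so the aligned range still covers
* every byte of the original request.
*/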
106
107 /*
108 * sys_mquery: provide mapping hints to applications that do fixed mappings
109 *
110 * flags: 0 or MAP_FIXED (MAP_FIXED - means that we insist on this addr and
111 * don't care about PMAP_PREFER or such)
112 * addr: hint where we'd like to place the mapping.
113 * size: size of the mapping
114 * fd: fd of the file we want to map
115 * off: offset within the file
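*
* Illustrative userland usage (a hedged sketch, not part of this file):
*	void *want = mquery(hint, len, PROT_READ, MAP_FIXED, fd, off);
* With MAP_FIXED the call fails if the requested range is unavailable;
* without it, the kernel suggests a suitable free address instead.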
116 */
117 int
118 sys_mquery(struct proc *p, void *v, register_t *retval)
119 {
120 struct sys_mquery_args /* {
121 syscallarg(void *) addr;
122 syscallarg(size_t) len;
123 syscallarg(int) prot;
124 syscallarg(int) flags;
125 syscallarg(int) fd;
126 syscallarg(off_t) pos;
127 } */ *uap = v;
128 struct file *fp;
129 voff_t uoff;
130 int error;
131 vaddr_t vaddr;
132 int flags = 0;
133 vsize_t size;
134 vm_prot_t prot;
135 int fd;
136
137 vaddr = (vaddr_t) SCARG(uap, addr);
138 prot = SCARG(uap, prot);
139 size = (vsize_t) SCARG(uap, len);
140 fd = SCARG(uap, fd);
141
142 if ((prot & PROT_MASK) != prot)
143 return EINVAL;
144
145 if (SCARG(uap, flags) & MAP_FIXED)
146 flags |= UVM_FLAG_FIXED;
147
148 if (fd >= 0) {
149 if ((error = getvnode(p, fd, &fp)) != 0)
150 return error;
151 uoff = SCARG(uap, pos);
152 } else {
153 fp = NULL;
154 uoff = UVM_UNKNOWN_OFFSET;
155 }
156
157 if (vaddr == 0)
158 vaddr = uvm_map_hint(p->p_vmspace, prot, VM_MIN_ADDRESS,
159 VM_MAXUSER_ADDRESS);
160
161 error = uvm_map_mquery(&p->p_vmspace->vm_map, &vaddr, size, uoff,
162 flags);
163 if (error == 0)
164 *retval = (register_t)(vaddr);
165
166 if (fp != NULL)
167 FRELE(fp, p);
168 return error;
169 }
170
171 int uvm_wxabort; /* [a] */
172
173 /*
174 * W^X violations are only allowed for binaries marked wxneeded that reside
* on filesystems mounted with the wxallowed option.
175 */
176 static inline int
177 uvm_wxcheck(struct proc *p, char *call)
178 {
179 struct process *pr = p->p_p;
180 int wxallowed = (pr->ps_textvp->v_mount &&
181 (pr->ps_textvp->v_mount->mnt_flag & MNT_WXALLOWED));
182
183 if (wxallowed && (pr->ps_flags & PS_WXNEEDED))
184 return 0;
185
186 if (atomic_load_int(&uvm_wxabort)) {
187 KERNEL_LOCK();
188 /* Report W^X failures */
189 if (pr->ps_wxcounter++ == 0)
190 log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
191 pr->ps_comm, pr->ps_pid, call);
192 /* Send uncatchable SIGABRT for coredump */
193 sigexit(p, SIGABRT);
194 KERNEL_UNLOCK();
195 }
196
197 return ENOTSUP;
198 }
199
200 /*
201 * sys_mmap: mmap system call.
202 *
203 * => file offset and address may not be page aligned
204 * - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
205 * - if address isn't page aligned the mapping starts at trunc_page(addr)
206 * and the return value is adjusted up by the page offset.
207 */
208 int
209 sys_mmap(struct proc *p, void *v, register_t *retval)
210 {
211 struct sys_mmap_args /* {
212 syscallarg(void *) addr;
213 syscallarg(size_t) len;
214 syscallarg(int) prot;
215 syscallarg(int) flags;
216 syscallarg(int) fd;
217 syscallarg(off_t) pos;
218 } */ *uap = v;
219 vaddr_t addr;
220 struct vattr va;
221 off_t pos;
222 vsize_t limit, pageoff, size;
223 vm_prot_t prot, maxprot;
224 int flags, fd;
225 vaddr_t vm_min_address = VM_MIN_ADDRESS;
226 struct filedesc *fdp = p->p_fd;
227 struct file *fp = NULL;
228 struct vnode *vp;
229 int error;
230
231 /* first, extract syscall args from the uap. */
232 addr = (vaddr_t) SCARG(uap, addr);
233 size = (vsize_t) SCARG(uap, len);
234 prot = SCARG(uap, prot);
235 flags = SCARG(uap, flags);
236 fd = SCARG(uap, fd);
237 pos = SCARG(uap, pos);
238
239 /*
240 * Validate the flags.
241 */
242 if ((prot & PROT_MASK) != prot)
243 return EINVAL;
244 if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
245 (error = uvm_wxcheck(p, "mmap")))
246 return error;
247
248 if ((flags & MAP_FLAGMASK) != flags)
249 return EINVAL;
250 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
251 return EINVAL;
252 if ((flags & (MAP_FIXED|__MAP_NOREPLACE)) == __MAP_NOREPLACE)
253 return EINVAL;
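/*
* MAP_STACK mappings must be private, anonymous, readable and
* writable, use a zero offset, and carry no flags other than
* MAP_FIXED.
*/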
254 if (flags & MAP_STACK) {
255 if ((flags & (MAP_ANON|MAP_PRIVATE)) != (MAP_ANON|MAP_PRIVATE))
256 return EINVAL;
257 if (flags & ~(MAP_STACK|MAP_FIXED|MAP_ANON|MAP_PRIVATE))
258 return EINVAL;
259 if (pos != 0)
260 return EINVAL;
261 if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
262 return EINVAL;
263 }
264 if (size == 0)
265 return EINVAL;
266
267 error = pledge_protexec(p, prot);
268 if (error)
269 return error;
270
271 /* align file position and save offset. adjust size. */
272 ALIGN_ADDR(pos, size, pageoff);
273
274 /* now check (MAP_FIXED) or get (!MAP_FIXED) the "addr" */
275 if (flags & MAP_FIXED) {
276 /* adjust address by the same amount as we did the offset */
277 addr -= pageoff;
278 if (addr & PAGE_MASK)
279 return EINVAL; /* not page aligned */
280
281 if (addr > SIZE_MAX - size)
282 return EINVAL; /* no wrapping! */
283 if (VM_MAXUSER_ADDRESS > 0 &&
284 (addr + size) > VM_MAXUSER_ADDRESS)
285 return EINVAL;
286 if (vm_min_address > 0 && addr < vm_min_address)
287 return EINVAL;
288 }
289
290 /* check for file mappings (i.e. not anonymous) and verify file. */
291 if ((flags & MAP_ANON) == 0) {
292 KERNEL_LOCK();
293 if ((fp = fd_getfile(fdp, fd)) == NULL) {
294 error = EBADF;
295 goto out;
296 }
297
298 if (fp->f_type != DTYPE_VNODE) {
299 error = ENODEV; /* only mmap vnodes! */
300 goto out;
301 }
302 vp = (struct vnode *)fp->f_data; /* convert to vnode */
303
304 if (vp->v_type != VREG && vp->v_type != VCHR &&
305 vp->v_type != VBLK) {
306 error = ENODEV; /* only REG/CHR/BLK support mmap */
307 goto out;
308 }
309
310 if (vp->v_type == VREG && (pos + size) < pos) {
311 error = EINVAL; /* no offset wrapping */
312 goto out;
313 }
314
315 /* special case: catch SunOS style /dev/zero */
316 if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
317 flags |= MAP_ANON;
318 FRELE(fp, p);
319 fp = NULL;
320 KERNEL_UNLOCK();
321 goto is_anon;
322 }
323
324 /*
325 * Old programs may not select a specific sharing type, so
326 * default to an appropriate one.
327 */
328 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
329 #if defined(DEBUG)
330 printf("WARNING: defaulted mmap() share type to"
331 " %s (pid %d comm %s)\n",
332 vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE",
333 p->p_p->ps_pid, p->p_p->ps_comm);
334 #endif
335 if (vp->v_type == VCHR)
336 flags |= MAP_SHARED; /* for a device */
337 else
338 flags |= MAP_PRIVATE; /* for a file */
339 }
340
341 /*
342 * MAP_PRIVATE device mappings don't make sense (and aren't
343 * supported anyway). However, some programs rely on this,
344 * so just change it to MAP_SHARED.
345 */
346 if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
347 flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
348 }
349
350 /* now check protection */
351 maxprot = PROT_EXEC;
352
353 /* check read access */
354 if (fp->f_flag & FREAD)
355 maxprot |= PROT_READ;
356 else if (prot & PROT_READ) {
357 error = EACCES;
358 goto out;
359 }
360
361 /* check write access, shared case first */
362 if (flags & MAP_SHARED) {
363 /*
364 * if the file is writable, only add PROT_WRITE to
365 * maxprot if the file is not immutable or append-only.
366 * otherwise, if we have asked for PROT_WRITE, return
367 * EPERM.
368 */
369 if (fp->f_flag & FWRITE) {
370 error = VOP_GETATTR(vp, &va, p->p_ucred, p);
371 if (error)
372 goto out;
373 if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
374 maxprot |= PROT_WRITE;
375 else if (prot & PROT_WRITE) {
376 error = EPERM;
377 goto out;
378 }
379 } else if (prot & PROT_WRITE) {
380 error = EACCES;
381 goto out;
382 }
383 } else {
384 /* MAP_PRIVATE mappings can always be written to */
385 maxprot |= PROT_WRITE;
386 }
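/*
* Charge the mapping against RLIMIT_DATA when it may consume
* anonymous memory: __MAP_NOFAULT overlays and writable
* MAP_PRIVATE (copy-on-write) mappings.
*/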
387 if ((flags & __MAP_NOFAULT) != 0 ||
388 ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
389 limit = lim_cur(RLIMIT_DATA);
390 if (limit < size ||
391 limit - size < ptoa(p->p_vmspace->vm_dused)) {
392 error = ENOMEM;
393 goto out;
394 }
395 }
396 error = uvm_mmapfile(&p->p_vmspace->vm_map, &addr, size, prot,
397 maxprot, flags, vp, pos, lim_cur(RLIMIT_MEMLOCK), p);
398 FRELE(fp, p);
399 KERNEL_UNLOCK();
400 } else { /* MAP_ANON case */
401 if (fd != -1)
402 return EINVAL;
403
404 is_anon: /* label for SunOS style /dev/zero */
405
406 /* __MAP_NOFAULT only makes sense with a backing object */
407 if ((flags & __MAP_NOFAULT) != 0)
408 return EINVAL;
409
410 if (prot != PROT_NONE || (flags & MAP_SHARED)) {
411 limit = lim_cur(RLIMIT_DATA);
412 if (limit < size ||
413 limit - size < ptoa(p->p_vmspace->vm_dused)) {
414 return ENOMEM;
415 }
416 }
417
418 /*
419 * We've been treating (MAP_SHARED|MAP_PRIVATE) == 0 as
420 * MAP_PRIVATE, so make that clear.
421 */
422 if ((flags & MAP_SHARED) == 0)
423 flags |= MAP_PRIVATE;
424
425 maxprot = PROT_MASK;
426 error = uvm_mmapanon(&p->p_vmspace->vm_map, &addr, size, prot,
427 maxprot, flags, lim_cur(RLIMIT_MEMLOCK), p);
428 }
429
430 if (error == 0)
431 /* remember to add offset */
432 *retval = (register_t)(addr + pageoff);
433
434 return error;
435
436 out:
437 KERNEL_UNLOCK();
438 if (fp)
439 FRELE(fp, p);
440 return error;
441 }
442
443 /*
444 * sys_msync: the msync system call (a front-end for flush)
445 */
446
447 int
448 sys_msync(struct proc *p, void *v, register_t *retval)
449 {
450 struct sys_msync_args /* {
451 syscallarg(void *) addr;
452 syscallarg(size_t) len;
453 syscallarg(int) flags;
454 } */ *uap = v;
455 vaddr_t addr;
456 vsize_t size, pageoff;
457 int flags, uvmflags;
458
459 /* extract syscall args from the uap */
460 addr = (vaddr_t)SCARG(uap, addr);
461 size = (vsize_t)SCARG(uap, len);
462 flags = SCARG(uap, flags);
463
464 /* sanity check flags */
465 if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
466 (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
467 (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
468 return EINVAL;
469 if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
470 flags |= MS_SYNC;
471
472 /* align the address to a page boundary, and adjust the size accordingly */
473 ALIGN_ADDR(addr, size, pageoff);
474 if (addr > SIZE_MAX - size)
475 return EINVAL; /* disallow wrap-around. */
476
477 /* translate MS_ flags into PGO_ flags */
478 uvmflags = PGO_CLEANIT;
479 if (flags & MS_INVALIDATE)
480 uvmflags |= PGO_FREE;
481 if (flags & MS_SYNC)
482 uvmflags |= PGO_SYNCIO;
483 else
484 uvmflags |= PGO_SYNCIO; /* XXXCDC: force sync for now! */
485
486 return uvm_map_clean(&p->p_vmspace->vm_map, addr, addr+size, uvmflags);
487 }
488
489 /*
490 * sys_munmap: unmap a user's memory
491 */
492 int
493 sys_munmap(struct proc *p, void *v, register_t *retval)
494 {
495 struct sys_munmap_args /* {
496 syscallarg(void *) addr;
497 syscallarg(size_t) len;
498 } */ *uap = v;
499 vaddr_t addr;
500 vsize_t size, pageoff;
501 vm_map_t map;
502 vaddr_t vm_min_address = VM_MIN_ADDRESS;
503 struct uvm_map_deadq dead_entries;
504
505 /* get syscall args... */
506 addr = (vaddr_t) SCARG(uap, addr);
507 size = (vsize_t) SCARG(uap, len);
508
509 /* align address to a page boundary, and adjust size accordingly */
510 ALIGN_ADDR(addr, size, pageoff);
511
512 /*
513 * Check for illegal addresses. Watch out for address wrap...
514 * Note that VM_*_ADDRESS are not constants due to casts (argh).
515 */
516 if (addr > SIZE_MAX - size)
517 return EINVAL;
518 if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
519 return EINVAL;
520 if (vm_min_address > 0 && addr < vm_min_address)
521 return EINVAL;
522 map = &p->p_vmspace->vm_map;
523
524
525 vm_map_lock(map); /* lock map so we can checkprot */
526
527 /*
528 * interesting system call semantic: make sure entire range is
529 * allocated before allowing an unmap.
530 */
531 if (!uvm_map_checkprot(map, addr, addr + size, PROT_NONE)) {
532 vm_map_unlock(map);
533 return EINVAL;
534 }
535
536 TAILQ_INIT(&dead_entries);
537 if (uvm_unmap_remove(map, addr, addr + size, &dead_entries,
538 FALSE, TRUE, TRUE) != 0) {
539 vm_map_unlock(map);
540 return EPERM; /* immutable entries found */
541 }
542 vm_map_unlock(map); /* and unlock */
543
544 uvm_unmap_detach(&dead_entries, 0);
545
546 return 0;
547 }
548
549 /*
550 * sys_mprotect: the mprotect system call
551 */
552 int
553 sys_mprotect(struct proc *p, void *v, register_t *retval)
554 {
555 struct sys_mprotect_args /* {
556 syscallarg(void *) addr;
557 syscallarg(size_t) len;
558 syscallarg(int) prot;
559 } */ *uap = v;
560 vaddr_t addr;
561 vsize_t size, pageoff;
562 vm_prot_t prot;
563 int error;
564
565 /*
566 * extract syscall args from uap
567 */
568
569 addr = (vaddr_t)SCARG(uap, addr);
570 size = (vsize_t)SCARG(uap, len);
571 prot = SCARG(uap, prot);
572
573 if ((prot & PROT_MASK) != prot)
574 return EINVAL;
575 if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
576 (error = uvm_wxcheck(p, "mprotect")))
577 return error;
578
579 error = pledge_protexec(p, prot);
580 if (error)
581 return error;
582
583 /*
584 * align the address to a page boundary, and adjust the size accordingly
585 */
586 ALIGN_ADDR(addr, size, pageoff);
587 if (addr > SIZE_MAX - size)
588 return EINVAL; /* disallow wrap-around. */
589
590 return (uvm_map_protect(&p->p_vmspace->vm_map, addr, addr+size,
591 prot, 0, FALSE, TRUE));
592 }
593
594 /*
595 * sys_pinsyscalls. The caller is required to normalize base and len
596 * to the minimum .text region, and adjust pintable offsets relative
597 * to that base.
598 */
599 int
600 sys_pinsyscalls(struct proc *p, void *v, register_t *retval)
601 {
602 struct sys_pinsyscalls_args /* {
603 syscallarg(void *) base;
604 syscallarg(size_t) len;
605 syscallarg(u_int *) pins;
606 syscallarg(int) npins;
607 } */ *uap = v;
608 struct process *pr = p->p_p;
609 struct vm_map *map = &p->p_vmspace->vm_map;
610 int npins, error = 0, i;
611 vaddr_t base;
612 size_t len;
613 u_int *pins;
614
615 if (pr->ps_libcpin.pn_start ||
616 (pr->ps_vmspace->vm_map.flags & VM_MAP_PINSYSCALL_ONCE))
617 return (EPERM);
618 base = (vaddr_t)SCARG(uap, base);
619 len = (vsize_t)SCARG(uap, len);
620 if (base > SIZE_MAX - len)
621 return (EINVAL); /* disallow wrap-around. */
622 if (base < map->min_offset || base+len > map->max_offset)
623 return (EINVAL);
624
625 /* XXX MP unlock */
626
627 npins = SCARG(uap, npins);
628 if (npins < 1 || npins > SYS_MAXSYSCALL)
629 return (E2BIG);
630 pins = malloc(npins * sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO);
631 if (pins == NULL)
632 return (ENOMEM);
633 error = copyin(SCARG(uap, pins), pins, npins * sizeof(u_int));
634 if (error)
635 goto err;
636
637 /* Range-check pintable offsets */
638 for (i = 0; i < npins; i++) {
639 if (pins[i] == (u_int)-1 || pins[i] == 0)
640 continue;
641 if (pins[i] > SCARG(uap, len)) {
642 error = ERANGE;
643 break;
644 }
645 }
646 if (error) {
647 err:
648 free(pins, M_PINSYSCALL, npins * sizeof(u_int));
649 return (error);
650 }
651 pr->ps_libcpin.pn_start = base;
652 pr->ps_libcpin.pn_end = base + len;
653 pr->ps_libcpin.pn_pins = pins;
654 pr->ps_libcpin.pn_npins = npins;
655
656 #ifdef PMAP_CHECK_COPYIN
657 /* Assume (and insist) on libc.so text being execute-only */
658 if (PMAP_CHECK_COPYIN)
659 uvm_map_check_copyin_add(map, base, base+len);
660 #endif
661 return (0);
662 }
663
664 /*
665 * sys_mimmutable: the mimmutable system call
666 */
667 int
668 sys_mimmutable(struct proc *p, void *v, register_t *retval)
669 {
670 struct sys_mimmutable_args /* {
671 syscallarg(void *) addr;
672 syscallarg(size_t) len;
673 } */ *uap = v;
674 vaddr_t addr;
675 vsize_t size, pageoff;
676
677 addr = (vaddr_t)SCARG(uap, addr);
678 size = (vsize_t)SCARG(uap, len);
679
680 /*
681 * align the address to a page boundary, and adjust the size accordingly
682 */
683 ALIGN_ADDR(addr, size, pageoff);
684 if (addr > SIZE_MAX - size)
685 return EINVAL; /* disallow wrap-around. */
686
687 return uvm_map_immutable(&p->p_vmspace->vm_map, addr, addr+size, 1);
688 }
689
690 /*
691 * sys_minherit: the minherit system call
692 */
693 int
694 sys_minherit(struct proc *p, void *v, register_t *retval)
695 {
696 struct sys_minherit_args /* {
697 syscallarg(void *) addr;
698 syscallarg(size_t) len;
699 syscallarg(int) inherit;
700 } */ *uap = v;
701 vaddr_t addr;
702 vsize_t size, pageoff;
703 vm_inherit_t inherit;
704
705 addr = (vaddr_t)SCARG(uap, addr);
706 size = (vsize_t)SCARG(uap, len);
707 inherit = SCARG(uap, inherit);
708
709 /*
710 * align the address to a page boundary, and adjust the size accordingly
711 */
712 ALIGN_ADDR(addr, size, pageoff);
713 if (addr > SIZE_MAX - size)
714 return EINVAL; /* disallow wrap-around. */
715
716 return (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
717 inherit));
718 }
719
720 /*
721 * sys_madvise: give advice about memory usage.
722 */
723 int
724 sys_madvise(struct proc *p, void *v, register_t *retval)
725 {
726 struct sys_madvise_args /* {
727 syscallarg(void *) addr;
728 syscallarg(size_t) len;
729 syscallarg(int) behav;
730 } */ *uap = v;
731 vaddr_t addr;
732 vsize_t size, pageoff;
733 int advice, error;
734
735 addr = (vaddr_t)SCARG(uap, addr);
736 size = (vsize_t)SCARG(uap, len);
737 advice = SCARG(uap, behav);
738
739 /*
740 * align the address to a page boundary, and adjust the size accordingly
741 */
742 ALIGN_ADDR(addr, size, pageoff);
743 if (addr > SIZE_MAX - size)
744 return EINVAL; /* disallow wrap-around. */
745
746 switch (advice) {
747 case MADV_NORMAL:
748 case MADV_RANDOM:
749 case MADV_SEQUENTIAL:
750 error = uvm_map_advice(&p->p_vmspace->vm_map, addr,
751 addr + size, advice);
752 break;
753
754 case MADV_WILLNEED:
755 /*
756 * Activate all these pages, pre-faulting them in if
757 * necessary.
758 */
759 /*
760 * XXX IMPLEMENT ME.
761 * Should invent a "weak" mode for uvm_fault()
762 * which would only do the PGO_LOCKED pgo_get().
763 */
764 return 0;
765
766 case MADV_DONTNEED:
767 /*
768 * Deactivate all these pages. We don't need them
769 * any more. We don't, however, toss the data in
770 * the pages.
771 */
772 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
773 PGO_DEACTIVATE);
774 break;
775
776 case MADV_FREE:
777 /*
778 * These pages contain no valid data, and may be
779 * garbage-collected. Toss all resources, including
780 * any swap space in use.
781 */
782 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
783 PGO_FREE);
784 break;
785
786 case MADV_SPACEAVAIL:
787 /*
788 * XXXMRG What is this? I think it's:
789 *
790 * Ensure that we have allocated backing-store
791 * for these pages.
792 *
793 * This is going to require changes to the page daemon,
794 * as it will free swap space allocated to pages in core.
795 * There's also what to do for device/file/anonymous memory.
796 */
797 return EINVAL;
798
799 default:
800 return EINVAL;
801 }
802
803 return error;
804 }
805
806 /*
807 * sys_mlock: memory lock
808 */
809
810 int
811 sys_mlock(struct proc *p, void *v, register_t *retval)
812 {
813 struct sys_mlock_args /* {
814 syscallarg(const void *) addr;
815 syscallarg(size_t) len;
816 } */ *uap = v;
817 vaddr_t addr;
818 vsize_t size, pageoff;
819 int error;
820
821 /* extract syscall args from uap */
822 addr = (vaddr_t)SCARG(uap, addr);
823 size = (vsize_t)SCARG(uap, len);
824
825 /* align address to a page boundary and adjust size accordingly */
826 ALIGN_ADDR(addr, size, pageoff);
827 if (addr > SIZE_MAX - size)
828 return EINVAL; /* disallow wrap-around. */
829
830 if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
831 return EAGAIN;
832
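/*
* Per-process limit: when the pmap can count wired pages, check the
* request against RLIMIT_MEMLOCK; otherwise mlock() requires root.
*/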
833 #ifdef pmap_wired_count
834 if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
835 lim_cur(RLIMIT_MEMLOCK))
836 return EAGAIN;
837 #else
838 if ((error = suser(p)) != 0)
839 return error;
840 #endif
841
842 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
843 0);
844 return error == 0 ? 0 : ENOMEM;
845 }
846
847 /*
848 * sys_munlock: unlock wired pages
849 */
850
851 int
852 sys_munlock(struct proc *p, void *v, register_t *retval)
853 {
854 struct sys_munlock_args /* {
855 syscallarg(const void *) addr;
856 syscallarg(size_t) len;
857 } */ *uap = v;
858 vaddr_t addr;
859 vsize_t size, pageoff;
860 int error;
861
862 /* extract syscall args from uap */
863 addr = (vaddr_t)SCARG(uap, addr);
864 size = (vsize_t)SCARG(uap, len);
865
866 /* align address to a page boundary, and adjust size accordingly */
867 ALIGN_ADDR(addr, size, pageoff);
868 if (addr > SIZE_MAX - size)
869 return EINVAL; /* disallow wrap-around. */
870
871 #ifndef pmap_wired_count
872 if ((error = suser(p)) != 0)
873 return error;
874 #endif
875
876 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
877 0);
878 return error == 0 ? 0 : ENOMEM;
879 }
880
881 /*
882 * sys_mlockall: lock all pages mapped into an address space.
883 */
884 int
885 sys_mlockall(struct proc *p, void *v, register_t *retval)
886 {
887 struct sys_mlockall_args /* {
888 syscallarg(int) flags;
889 } */ *uap = v;
890 int error, flags;
891
892 flags = SCARG(uap, flags);
893
894 if (flags == 0 ||
895 (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
896 return EINVAL;
897
898 #ifndef pmap_wired_count
899 if ((error = suser(p)) != 0)
900 return error;
901 #endif
902
903 error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
904 lim_cur(RLIMIT_MEMLOCK));
905 if (error != 0 && error != ENOMEM)
906 return EAGAIN;
907 return error;
908 }
909
910 /*
911 * sys_munlockall: unlock all pages mapped into an address space.
912 */
913 int
914 sys_munlockall(struct proc *p, void *v, register_t *retval)
915 {
916
917 (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
918 return 0;
919 }
920
921 /*
922 * common code for uvm_mmapanon and uvm_mmapfile to lock a mapping
923 */
924 int
925 uvm_mmaplock(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
926 vsize_t locklimit)
927 {
928 int error;
929
930 /*
931 * POSIX 1003.1b -- if our address space was configured
932 * to lock all future mappings, wire the one we just made.
933 */
934 if (prot == PROT_NONE) {
935 /*
936 * No more work to do in this case.
937 */
938 return 0;
939 }
940
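/*
* VM_MAP_WIREFUTURE is set by mlockall(MCL_FUTURE); wire the new
* mapping immediately in that case.
*/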
941 vm_map_lock(map);
942 if (map->flags & VM_MAP_WIREFUTURE) {
943 KERNEL_LOCK();
944 if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
945 #ifdef pmap_wired_count
946 || (locklimit != 0 && (size +
947 ptoa(pmap_wired_count(vm_map_pmap(map)))) >
948 locklimit)
949 #endif
950 ) {
951 error = ENOMEM;
952 vm_map_unlock(map);
953 /* unmap the region! */
954 uvm_unmap(map, *addr, *addr + size);
955 KERNEL_UNLOCK();
956 return error;
957 }
958 /*
959 * uvm_map_pageable() always returns the map
960 * unlocked.
961 */
962 error = uvm_map_pageable(map, *addr, *addr + size,
963 FALSE, UVM_LK_ENTER);
964 if (error != 0) {
965 /* unmap the region! */
966 uvm_unmap(map, *addr, *addr + size);
967 KERNEL_UNLOCK();
968 return error;
969 }
970 KERNEL_UNLOCK();
971 return 0;
972 }
973 vm_map_unlock(map);
974 return 0;
975 }
976
977 /*
978 * uvm_mmapanon: internal version of mmap for anons
979 *
980 * - used by sys_mmap
981 */
982 int
983 uvm_mmapanon(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
984 vm_prot_t maxprot, int flags, vsize_t locklimit, struct proc *p)
985 {
986 int error;
987 int advice = MADV_NORMAL;
988 unsigned int uvmflag = 0;
989 vsize_t align = 0; /* userland page size */
990
991 /*
992 * for non-fixed mappings, round off the suggested address.
993 * for fixed mappings, check alignment and zap old mappings.
994 */
995 if ((flags & MAP_FIXED) == 0) {
996 *addr = round_page(*addr); /* round */
997 } else {
998 if (*addr & PAGE_MASK)
999 return EINVAL;
1000
1001 uvmflag |= UVM_FLAG_FIXED;
1002 if ((flags & __MAP_NOREPLACE) == 0)
1003 uvmflag |= UVM_FLAG_UNMAP;
1004 }
1005
1006 if ((flags & MAP_FIXED) == 0 && size >= __LDPGSZ)
1007 align = __LDPGSZ;
1008 if ((flags & MAP_SHARED) == 0)
1009 /* XXX: defer amap create */
1010 uvmflag |= UVM_FLAG_COPYONW;
1011 else
1012 /* shared: create amap now */
1013 uvmflag |= UVM_FLAG_OVERLAY;
1014 if (flags & MAP_STACK)
1015 uvmflag |= UVM_FLAG_STACK;
1016 if (flags & MAP_CONCEAL)
1017 uvmflag |= UVM_FLAG_CONCEAL;
1018
1019 /* set up mapping flags */
1020 uvmflag = UVM_MAPFLAG(prot, maxprot,
1021 (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
1022 advice, uvmflag);
1023
1024 error = uvm_mapanon(map, addr, size, align, uvmflag);
1025
1026 if (error == 0)
1027 error = uvm_mmaplock(map, addr, size, prot, locklimit);
1028 return error;
1029 }
1030
1031 /*
1032 * uvm_mmapfile: internal version of mmap for non-anons
1033 *
1034 * - used by sys_mmap
1035 * - caller must page-align the file offset
1036 */
1037 int
1038 uvm_mmapfile(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
1039 vm_prot_t maxprot, int flags, struct vnode *vp, voff_t foff,
1040 vsize_t locklimit, struct proc *p)
1041 {
1042 struct uvm_object *uobj;
1043 int error;
1044 int advice = MADV_NORMAL;
1045 unsigned int uvmflag = 0;
1046 vsize_t align = 0; /* userland page size */
1047
1048 /*
1049 * for non-fixed mappings, round off the suggested address.
1050 * for fixed mappings, check alignment and zap old mappings.
1051 */
1052 if ((flags & MAP_FIXED) == 0) {
1053 *addr = round_page(*addr); /* round */
1054 } else {
1055 if (*addr & PAGE_MASK)
1056 return EINVAL;
1057
1058 uvmflag |= UVM_FLAG_FIXED;
1059 if ((flags & __MAP_NOREPLACE) == 0)
1060 uvmflag |= UVM_FLAG_UNMAP;
1061 }
1062
1063 /*
1064 * attach to underlying vm object.
1065 */
1066 if (vp->v_type != VCHR) {
1067 uobj = uvn_attach(vp, (flags & MAP_SHARED) ?
1068 maxprot : (maxprot & ~PROT_WRITE));
1069
1070 /*
1071 * XXXCDC: hack from old code
1072 * don't allow vnodes which have been mapped
1073 * shared-writeable to persist [forces them to be
1074 * flushed out when last reference goes].
1075 * XXXCDC: interesting side effect: avoids a bug.
1076 * note that in WRITE [ufs_readwrite.c] that we
1077 * allocate buffer, uncache, and then do the write.
1078 * the problem with this is that if the uncache causes
1079 * VM data to be flushed to the same area of the file
1080 * we are writing to... in that case we've got the
1081 * buffer locked and our process goes to sleep forever.
1082 *
1083 * XXXCDC: checking maxprot protects us from the
1084 * "persistbug" program but this is not a long term
1085 * solution.
1086 *
1087 * XXXCDC: we don't bother calling uncache with the vp
1088 * VOP_LOCKed since we know that we are already
1089 * holding a valid reference to the uvn (from the
1090 * uvn_attach above), and thus it is impossible for
1091 * the uncache to kill the uvn and trigger I/O.
1092 */
1093 if (flags & MAP_SHARED) {
1094 if ((prot & PROT_WRITE) ||
1095 (maxprot & PROT_WRITE)) {
1096 uvm_vnp_uncache(vp);
1097 }
1098 }
1099 } else {
1100 uobj = udv_attach(vp->v_rdev,
1101 (flags & MAP_SHARED) ? maxprot :
1102 (maxprot & ~PROT_WRITE), foff, size);
1103 /*
1104 * XXX Some devices don't like to be mapped with
1105 * XXX PROT_EXEC, but we don't really have a
1106 * XXX better way of handling this, right now
1107 */
1108 if (uobj == NULL && (prot & PROT_EXEC) == 0) {
1109 maxprot &= ~PROT_EXEC;
1110 uobj = udv_attach(vp->v_rdev,
1111 (flags & MAP_SHARED) ? maxprot :
1112 (maxprot & ~PROT_WRITE), foff, size);
1113 }
1114 advice = MADV_RANDOM;
1115 }
1116
1117 if (uobj == NULL)
1118 return vp->v_type == VREG ? ENOMEM : EINVAL;
1119
1120 if ((flags & MAP_SHARED) == 0)
1121 uvmflag |= UVM_FLAG_COPYONW;
1122 if (flags & __MAP_NOFAULT)
1123 uvmflag |= (UVM_FLAG_NOFAULT | UVM_FLAG_OVERLAY);
1124 if (flags & MAP_STACK)
1125 uvmflag |= UVM_FLAG_STACK;
1126 if (flags & MAP_CONCEAL)
1127 uvmflag |= UVM_FLAG_CONCEAL;
1128
1129 /* set up mapping flags */
1130 uvmflag = UVM_MAPFLAG(prot, maxprot,
1131 (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
1132 advice, uvmflag);
1133
1134 error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
1135
1136 if (error == 0)
1137 return uvm_mmaplock(map, addr, size, prot, locklimit);
1138
1139 /* errors: first detach from the uobj, if any. */
1140 if (uobj)
1141 uobj->pgops->pgo_detach(uobj);
1142
1143 return error;
1144 }
1145
1146 int
1147 sys_kbind(struct proc *p, void *v, register_t *retval)
1148 {
1149 struct sys_kbind_args /* {
1150 syscallarg(const struct __kbind *) param;
1151 syscallarg(size_t) psize;
1152 syscallarg(uint64_t) proc_cookie;
1153 } */ *uap = v;
1154 const struct __kbind *paramp;
1155 union {
1156 struct __kbind uk[KBIND_BLOCK_MAX];
1157 char upad[KBIND_BLOCK_MAX * sizeof(*paramp) + KBIND_DATA_MAX];
1158 } param;
1159 struct uvm_map_deadq dead_entries;
1160 struct process *pr = p->p_p;
1161 const char *data;
1162 vaddr_t baseva, last_baseva, endva, pageoffset, kva;
1163 size_t psize, s;
1164 u_long pc;
1165 int count, i, extra;
1166 int error, sigill = 0;
1167
1168 /*
1169 * extract syscall args from uap
1170 */
1171 paramp = SCARG(uap, param);
1172 psize = SCARG(uap, psize);
1173
1174 /*
1175 * If paramp is NULL and we're uninitialized, disable the syscall
1176 * for the process. Raise SIGILL if paramp is NULL and we're
1177 * already initialized.
1178 *
1179 * If paramp is non-NULL and we're uninitialized, do initialization.
1180 * Otherwise, do security checks and raise SIGILL on failure.
1181 */
1182 pc = PROC_PC(p);
1183 mtx_enter(&pr->ps_mtx);
1184 if (paramp == NULL) {
1185 /* ld.so disables kbind() when lazy binding is disabled */
1186 if (pr->ps_kbind_addr == 0)
1187 pr->ps_kbind_addr = BOGO_PC;
1188 /* pre-7.3 static binaries disable kbind */
1189 /* XXX delete check in 2026 */
1190 else if (pr->ps_kbind_addr != BOGO_PC)
1191 sigill = 1;
1192 } else if (pr->ps_kbind_addr == 0) {
1193 pr->ps_kbind_addr = pc;
1194 pr->ps_kbind_cookie = SCARG(uap, proc_cookie);
1195 } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC ||
1196 pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) {
1197 sigill = 1;
1198 }
1199 mtx_leave(&pr->ps_mtx);
1200
1201 /* Raise SIGILL if something is off. */
1202 if (sigill) {
1203 KERNEL_LOCK();
1204 sigexit(p, SIGILL);
1205 /* NOTREACHED */
1206 KERNEL_UNLOCK();
1207 }
1208
1209 /* We're done if we were disabling the syscall. */
1210 if (paramp == NULL)
1211 return 0;
1212
1213 if (psize < sizeof(struct __kbind) || psize > sizeof(param))
1214 return EINVAL;
1215 if ((error = copyin(paramp, &param, psize)))
1216 return error;
1217
1218 /*
1219 * The param argument points to an array of __kbind structures
1220 * followed by the corresponding new data areas for them. Verify
1221 * that the sizes in the __kbind structures add up to the total
1222 * size and find the start of the new area.
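*
* Layout (illustrative):
*	[__kbind 0]...[__kbind count-1][data 0]...[data count-1]
* where "data i" is kb_size bytes of new contents for the region
* named by kb_addr in the i-th __kbind.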
1223 */
1224 paramp = &param.uk[0];
1225 s = psize;
1226 for (count = 0; s > 0 && count < KBIND_BLOCK_MAX; count++) {
1227 if (s < sizeof(*paramp))
1228 return EINVAL;
1229 s -= sizeof(*paramp);
1230
1231 baseva = (vaddr_t)paramp[count].kb_addr;
1232 endva = baseva + paramp[count].kb_size - 1;
1233 if (paramp[count].kb_addr == NULL ||
1234 paramp[count].kb_size == 0 ||
1235 paramp[count].kb_size > KBIND_DATA_MAX ||
1236 baseva >= VM_MAXUSER_ADDRESS ||
1237 endva >= VM_MAXUSER_ADDRESS ||
1238 s < paramp[count].kb_size)
1239 return EINVAL;
1240
1241 s -= paramp[count].kb_size;
1242 }
1243 if (s > 0)
1244 return EINVAL;
1245 data = (const char *)¶mp[count];
1246
1247 /* all looks good, so do the bindings */
1248 last_baseva = VM_MAXUSER_ADDRESS;
1249 kva = 0;
1250 TAILQ_INIT(&dead_entries);
1251 for (i = 0; i < count; i++) {
1252 baseva = (vaddr_t)paramp[i].kb_addr;
1253 s = paramp[i].kb_size;
1254 pageoffset = baseva & PAGE_MASK;
1255 baseva = trunc_page(baseva);
1256
1257 /*
* hppa at least runs PLT entries over a page edge: if the update
* crosses a page boundary, copy the part in this page now and
* handle the remaining "extra" bytes on the redo pass below.
*/
1258 extra = (pageoffset + s) & PAGE_MASK;
1259 if (extra > pageoffset)
1260 extra = 0;
1261 else
1262 s -= extra;
1263 redo:
1264 /* make sure the desired page is mapped into kernel_map */
1265 if (baseva != last_baseva) {
1266 if (kva != 0) {
1267 vm_map_lock(kernel_map);
1268 uvm_unmap_remove(kernel_map, kva,
1269 kva+PAGE_SIZE, &dead_entries,
1270 FALSE, TRUE, FALSE); /* XXX */
1271 vm_map_unlock(kernel_map);
1272 kva = 0;
1273 }
1274 if ((error = uvm_map_extract(&p->p_vmspace->vm_map,
1275 baseva, PAGE_SIZE, &kva, UVM_EXTRACT_FIXPROT)))
1276 break;
1277 last_baseva = baseva;
1278 }
1279
1280 /* do the update */
1281 if ((error = kcopy(data, (char *)kva + pageoffset, s)))
1282 break;
1283 data += s;
1284
1285 if (extra > 0) {
1286 baseva += PAGE_SIZE;
1287 s = extra;
1288 pageoffset = 0;
1289 extra = 0;
1290 goto redo;
1291 }
1292 }
1293
1294 if (kva != 0) {
1295 vm_map_lock(kernel_map);
1296 uvm_unmap_remove(kernel_map, kva, kva+PAGE_SIZE,
1297 &dead_entries, FALSE, TRUE, FALSE); /* XXX */
1298 vm_map_unlock(kernel_map);
1299 }
1300 uvm_unmap_detach(&dead_entries, AMAP_REFALL);
1301
1302 return error;
1303 }
1304