/*	$OpenBSD: uvm_mmap.c,v 1.191 2024/04/05 14:16:05 deraadt Exp $	*/
/*	$NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *	@(#)vm_mmap.c	8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/stat.h>
#include <sys/specdev.h>
#include <sys/stdint.h>
#include <sys/pledge.h>
#include <sys/unistd.h>		/* for KBIND* */
#include <sys/user.h>

#include <machine/exec.h>	/* for __LDPGSZ */

#include <sys/syscall.h>
#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>
#include <uvm/uvm_vnode.h>

int uvm_mmapanon(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
    vsize_t, struct proc *);
int uvm_mmapfile(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
    struct vnode *, voff_t, vsize_t, struct proc *);


/*
 * Page align addr and size, returning EINVAL on wraparound.
 */
#define ALIGN_ADDR(addr, size, pageoff)	do {				\
	pageoff = (addr & PAGE_MASK);					\
	if (pageoff != 0) {						\
		if (size > SIZE_MAX - pageoff)				\
			return EINVAL;	/* wraparound */		\
		addr -= pageoff;					\
		size += pageoff;					\
	}								\
	if (size != 0) {						\
		size = (vsize_t)round_page(size);			\
		if (size == 0)						\
			return EINVAL;	/* wraparound */		\
	}								\
} while (0)
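
/*
 * For example (hypothetical values, assuming 4 KB pages): addr = 0x1003
 * and size = 0x2000 give pageoff = 3, so addr becomes 0x1000 and size
 * grows to 0x2003, which round_page() rounds up to 0x3000.  If growing
 * the size would wrap past SIZE_MAX, the enclosing function returns
 * EINVAL.
 */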

/*
 * sys_mquery: provide mapping hints to applications that do fixed mappings
 *
 * flags: 0 or MAP_FIXED (MAP_FIXED - means that we insist on this addr and
 *	don't care about PMAP_PREFER or such)
 * addr: hint where we'd like to place the mapping.
 * size: size of the mapping
 * fd: fd of the file we want to map
 * off: offset within the file
 */
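/*
 * Illustrative userland usage (a sketch, not part of this file; the
 * hint value is hypothetical):
 *
 *	void *hint = (void *)0x20000000;
 *	void *va = mquery(hint, len, PROT_READ, 0, -1, 0);
 *	if (va == MAP_FAILED)
 *		err(1, "mquery");
 *
 * Without MAP_FIXED the kernel may propose a different suitable
 * address; with MAP_FIXED it either accepts the exact address or
 * fails.
 */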
int
sys_mquery(struct proc *p, void *v, register_t *retval)
{
	struct sys_mquery_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(off_t) pos;
	} */ *uap = v;
	struct file *fp;
	voff_t uoff;
	int error;
	vaddr_t vaddr;
	int flags = 0;
	vsize_t size;
	vm_prot_t prot;
	int fd;

	vaddr = (vaddr_t) SCARG(uap, addr);
	prot = SCARG(uap, prot);
	size = (vsize_t) SCARG(uap, len);
	fd = SCARG(uap, fd);

	if ((prot & PROT_MASK) != prot)
		return EINVAL;

	if (SCARG(uap, flags) & MAP_FIXED)
		flags |= UVM_FLAG_FIXED;

	if (fd >= 0) {
		if ((error = getvnode(p, fd, &fp)) != 0)
			return error;
		uoff = SCARG(uap, pos);
	} else {
		fp = NULL;
		uoff = UVM_UNKNOWN_OFFSET;
	}

	if (vaddr == 0)
		vaddr = uvm_map_hint(p->p_vmspace, prot, VM_MIN_ADDRESS,
		    VM_MAXUSER_ADDRESS);

	error = uvm_map_mquery(&p->p_vmspace->vm_map, &vaddr, size, uoff,
	    flags);
	if (error == 0)
		*retval = (register_t)(vaddr);

	if (fp != NULL)
		FRELE(fp, p);
	return error;
}

int uvm_wxabort;

/*
 * W^X violations are only allowed on permitted filesystems.
 */
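/*
 * That is: the text vnode's filesystem must be mounted with the
 * "wxallowed" option and the process must be marked PS_WXNEEDED
 * (normally set at exec time for binaries linked as wxneeded).
 */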
static inline int
uvm_wxcheck(struct proc *p, char *call)
{
	struct process *pr = p->p_p;
	int wxallowed = (pr->ps_textvp->v_mount &&
	    (pr->ps_textvp->v_mount->mnt_flag & MNT_WXALLOWED));

	if (wxallowed && (pr->ps_flags & PS_WXNEEDED))
		return 0;

	if (uvm_wxabort) {
		KERNEL_LOCK();
		/* Report W^X failures */
		if (pr->ps_wxcounter++ == 0)
			log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
			    pr->ps_comm, pr->ps_pid, call);
		/* Send uncatchable SIGABRT for coredump */
		sigexit(p, SIGABRT);
		KERNEL_UNLOCK();
	}

	return ENOTSUP;
}

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have remainder mod PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */
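/*
 * Example (hypothetical values, 4 KB pages): MAP_FIXED with
 * addr = 0x30803 and pos = 0x1803 maps file offset 0x1000 at address
 * 0x30000 and returns 0x30803, so the returned pointer still refers
 * to file byte 0x1803.
 */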
int
sys_mmap(struct proc *p, void *v, register_t *retval)
{
	struct sys_mmap_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(off_t) pos;
	} */ *uap = v;
	vaddr_t addr;
	struct vattr va;
	off_t pos;
	vsize_t limit, pageoff, size;
	vm_prot_t prot, maxprot;
	int flags, fd;
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	struct filedesc *fdp = p->p_fd;
	struct file *fp = NULL;
	struct vnode *vp;
	int error;

	/* first, extract syscall args from the uap. */
	addr = (vaddr_t) SCARG(uap, addr);
	size = (vsize_t) SCARG(uap, len);
	prot = SCARG(uap, prot);
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

	/*
	 * Validate the flags.
	 */
	if ((prot & PROT_MASK) != prot)
		return EINVAL;
	if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
	    (error = uvm_wxcheck(p, "mmap")))
		return error;

	if ((flags & MAP_FLAGMASK) != flags)
		return EINVAL;
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return EINVAL;
	if ((flags & (MAP_FIXED|__MAP_NOREPLACE)) == __MAP_NOREPLACE)
		return EINVAL;
	if (flags & MAP_STACK) {
		if ((flags & (MAP_ANON|MAP_PRIVATE)) != (MAP_ANON|MAP_PRIVATE))
			return EINVAL;
		if (flags & ~(MAP_STACK|MAP_FIXED|MAP_ANON|MAP_PRIVATE))
			return EINVAL;
		if (pos != 0)
			return EINVAL;
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return EINVAL;
	}
	if (size == 0)
		return EINVAL;

	error = pledge_protexec(p, prot);
	if (error)
		return error;

	/* align file position and save offset.  adjust size. */
	ALIGN_ADDR(pos, size, pageoff);

	/* now check (MAP_FIXED) or get (!MAP_FIXED) the "addr" */
	if (flags & MAP_FIXED) {
		/* adjust address by the same amount as we did the offset */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return EINVAL;	/* not page aligned */

		if (addr > SIZE_MAX - size)
			return EINVAL;	/* no wrapping! */
		if (VM_MAXUSER_ADDRESS > 0 &&
		    (addr + size) > VM_MAXUSER_ADDRESS)
			return EINVAL;
		if (vm_min_address > 0 && addr < vm_min_address)
			return EINVAL;
	}

	/* check for file mappings (i.e. not anonymous) and verify file. */
	if ((flags & MAP_ANON) == 0) {
		KERNEL_LOCK();
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto out;
		}

		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;		/* only mmap vnodes! */
			goto out;
		}
		vp = (struct vnode *)fp->f_data;	/* convert to vnode */

		if (vp->v_type != VREG && vp->v_type != VCHR &&
		    vp->v_type != VBLK) {
			error = ENODEV; /* only REG/CHR/BLK support mmap */
			goto out;
		}

		if (vp->v_type == VREG && (pos + size) < pos) {
			error = EINVAL;		/* no offset wrapping */
			goto out;
		}

		/* special case: catch SunOS style /dev/zero */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			flags |= MAP_ANON;
			FRELE(fp, p);
			fp = NULL;
			KERNEL_UNLOCK();
			goto is_anon;
		}

		/*
		 * Old programs may not select a specific sharing type, so
		 * default to an appropriate one.
		 */
		if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
			printf("WARNING: defaulted mmap() share type to"
			    " %s (pid %d comm %s)\n",
			    vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE",
			    p->p_p->ps_pid, p->p_p->ps_comm);
#endif
			if (vp->v_type == VCHR)
				flags |= MAP_SHARED;	/* for a device */
			else
				flags |= MAP_PRIVATE;	/* for a file */
		}

		/*
		 * MAP_PRIVATE device mappings don't make sense (and aren't
		 * supported anyway).  However, some programs rely on this,
		 * so just change it to MAP_SHARED.
		 */
		if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
			flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
		}

		/* now check protection */
		maxprot = PROT_EXEC;

		/* check read access */
		if (fp->f_flag & FREAD)
			maxprot |= PROT_READ;
		else if (prot & PROT_READ) {
			error = EACCES;
			goto out;
		}

		/* check write access, shared case first */
		if (flags & MAP_SHARED) {
			/*
			 * if the file is writable, only add PROT_WRITE to
			 * maxprot if the file is not immutable or append-only.
			 * otherwise, if we have asked for PROT_WRITE, return
			 * EPERM.
			 */
			if (fp->f_flag & FWRITE) {
				error = VOP_GETATTR(vp, &va, p->p_ucred, p);
				if (error)
					goto out;
				if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
					maxprot |= PROT_WRITE;
				else if (prot & PROT_WRITE) {
					error = EPERM;
					goto out;
				}
			} else if (prot & PROT_WRITE) {
				error = EACCES;
				goto out;
			}
		} else {
			/* MAP_PRIVATE mappings can always be written to */
			maxprot |= PROT_WRITE;
		}
		if ((flags & __MAP_NOFAULT) != 0 ||
		    ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
			limit = lim_cur(RLIMIT_DATA);
			if (limit < size ||
			    limit - size < ptoa(p->p_vmspace->vm_dused)) {
				error = ENOMEM;
				goto out;
			}
		}
		error = uvm_mmapfile(&p->p_vmspace->vm_map, &addr, size, prot,
		    maxprot, flags, vp, pos, lim_cur(RLIMIT_MEMLOCK), p);
		FRELE(fp, p);
		KERNEL_UNLOCK();
	} else {		/* MAP_ANON case */
		if (fd != -1)
			return EINVAL;

is_anon:	/* label for SunOS style /dev/zero */

		/* __MAP_NOFAULT only makes sense with a backing object */
		if ((flags & __MAP_NOFAULT) != 0)
			return EINVAL;

		if (prot != PROT_NONE || (flags & MAP_SHARED)) {
			limit = lim_cur(RLIMIT_DATA);
			if (limit < size ||
			    limit - size < ptoa(p->p_vmspace->vm_dused)) {
				return ENOMEM;
			}
		}

		/*
		 * We've been treating (MAP_SHARED|MAP_PRIVATE) == 0 as
		 * MAP_PRIVATE, so make that clear.
		 */
		if ((flags & MAP_SHARED) == 0)
			flags |= MAP_PRIVATE;

		maxprot = PROT_MASK;
		error = uvm_mmapanon(&p->p_vmspace->vm_map, &addr, size, prot,
		    maxprot, flags, lim_cur(RLIMIT_MEMLOCK), p);
	}

	if (error == 0)
		/* remember to add offset */
		*retval = (register_t)(addr + pageoff);

	return error;

out:
	KERNEL_UNLOCK();
	if (fp)
		FRELE(fp, p);
	return error;
}

/*
 * sys_msync: the msync system call (a front-end for flush)
 */

int
sys_msync(struct proc *p, void *v, register_t *retval)
{
	struct sys_msync_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int flags, uvmflags;

	/* extract syscall args from the uap */
	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return EINVAL;
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/* align the address to a page boundary, and adjust the size accordingly */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	/* translate MS_ flags into PGO_ flags */
	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;
	else
		uvmflags |= PGO_SYNCIO;	/* XXXCDC: force sync for now! */

	return uvm_map_clean(&p->p_vmspace->vm_map, addr, addr+size, uvmflags);
}

/*
 * sys_munmap: unmap a user's memory
 */
int
sys_munmap(struct proc *p, void *v, register_t *retval)
{
	struct sys_munmap_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_map_t map;
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	struct uvm_map_deadq dead_entries;

	/* get syscall args... */
	addr = (vaddr_t) SCARG(uap, addr);
	size = (vsize_t) SCARG(uap, len);

	/* align address to a page boundary, and adjust size accordingly */
	ALIGN_ADDR(addr, size, pageoff);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 * Note that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (addr > SIZE_MAX - size)
		return EINVAL;
	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
		return EINVAL;
	if (vm_min_address > 0 && addr < vm_min_address)
		return EINVAL;
	map = &p->p_vmspace->vm_map;


	vm_map_lock(map);	/* lock map so we can checkprot */

	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */
	if (!uvm_map_checkprot(map, addr, addr + size, PROT_NONE)) {
		vm_map_unlock(map);
		return EINVAL;
	}

	TAILQ_INIT(&dead_entries);
	if (uvm_unmap_remove(map, addr, addr + size, &dead_entries,
	    FALSE, TRUE, TRUE) != 0) {
		vm_map_unlock(map);
		return EPERM;	/* immutable entries found */
	}
	vm_map_unlock(map);	/* and unlock */

	uvm_unmap_detach(&dead_entries, 0);

	return 0;
}

/*
 * sys_mprotect: the mprotect system call
 */
int
sys_mprotect(struct proc *p, void *v, register_t *retval)
{
	struct sys_mprotect_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot);

	if ((prot & PROT_MASK) != prot)
		return EINVAL;
	if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
	    (error = uvm_wxcheck(p, "mprotect")))
		return error;

	error = pledge_protexec(p, prot);
	if (error)
		return error;

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	return (uvm_map_protect(&p->p_vmspace->vm_map, addr, addr+size,
	    prot, 0, FALSE, TRUE));
}

/*
 * sys_pinsyscalls.  The caller is required to normalize base,len
 * to the minimum .text region, and adjust pintable offsets relative
 * to that base.
 */
int
sys_pinsyscalls(struct proc *p, void *v, register_t *retval)
{
	struct sys_pinsyscalls_args /* {
		syscallarg(void *) base;
		syscallarg(size_t) len;
		syscallarg(u_int *) pins;
		syscallarg(int) npins;
	} */ *uap = v;
	struct process *pr = p->p_p;
	struct vm_map *map = &p->p_vmspace->vm_map;
	int npins, error = 0, i;
	vaddr_t base;
	size_t len;
	u_int *pins;

	if (pr->ps_libcpin.pn_start ||
	    (pr->ps_vmspace->vm_map.flags & VM_MAP_PINSYSCALL_ONCE))
		return (EPERM);
	base = (vaddr_t)SCARG(uap, base);
	len = (vsize_t)SCARG(uap, len);
	if (base > SIZE_MAX - len)
		return (EINVAL);	/* disallow wrap-around. */
	if (base < map->min_offset || base+len > map->max_offset)
		return (EINVAL);

	/* XXX MP unlock */

	npins = SCARG(uap, npins);
	if (npins < 1 || npins > SYS_MAXSYSCALL)
		return (E2BIG);
	pins = malloc(npins * sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO);
	if (pins == NULL)
		return (ENOMEM);
	error = copyin(SCARG(uap, pins), pins, npins * sizeof(u_int));
	if (error)
		goto err;

	/* Range-check pintable offsets */
	for (i = 0; i < npins; i++) {
		if (pins[i] == (u_int)-1 || pins[i] == 0)
			continue;
		if (pins[i] > SCARG(uap, len)) {
			error = ERANGE;
			break;
		}
	}
	if (error) {
err:
		free(pins, M_PINSYSCALL, npins * sizeof(u_int));
		return (error);
	}
	pr->ps_libcpin.pn_start = base;
	pr->ps_libcpin.pn_end = base + len;
	pr->ps_libcpin.pn_pins = pins;
	pr->ps_libcpin.pn_npins = npins;
	pr->ps_flags |= PS_LIBCPIN;

#ifdef PMAP_CHECK_COPYIN
	/* Assume (and insist) on libc.so text being execute-only */
	if (PMAP_CHECK_COPYIN)
		uvm_map_check_copyin_add(map, base, base+len);
#endif
	return (0);
}

/*
 * sys_mimmutable: the mimmutable system call
 */
int
sys_mimmutable(struct proc *p, void *v, register_t *retval)
{
	struct sys_mimmutable_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	return uvm_map_immutable(&p->p_vmspace->vm_map, addr, addr+size, 1);
}

/*
 * sys_minherit: the minherit system call
 */
int
sys_minherit(struct proc *p, void *v, register_t *retval)
{
	struct sys_minherit_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) inherit;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	return (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
	    inherit));
}

/*
 * sys_madvise: give advice about memory usage.
 */
int
sys_madvise(struct proc *p, void *v, register_t *retval)
{
	struct sys_madvise_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr,
		    addr + size, advice);
		break;

	case MADV_WILLNEED:
		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		/*
		 * XXX IMPLEMENT ME.
		 * Should invent a "weak" mode for uvm_fault()
		 * which would only do the PGO_LOCKED pgo_get().
		 */
		return 0;

	case MADV_DONTNEED:
		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */
		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:
		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */
		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:
		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */
		return EINVAL;

	default:
		return EINVAL;
	}

	return error;
}

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct proc *p, void *v, register_t *retval)
{
	struct sys_mlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/* extract syscall args from uap */
	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/* align address to a page boundary and adjust size accordingly */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return EAGAIN;

#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    lim_cur(RLIMIT_MEMLOCK))
		return EAGAIN;
#else
	if ((error = suser(p)) != 0)
		return error;
#endif

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
	    0);
	return error == 0 ? 0 : ENOMEM;
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct proc *p, void *v, register_t *retval)
{
	struct sys_munlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/* extract syscall args from uap */
	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/* align address to a page boundary, and adjust size accordingly */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return EINVAL;		/* disallow wrap-around. */

#ifndef pmap_wired_count
	if ((error = suser(p)) != 0)
		return error;
#endif

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
	    0);
	return error == 0 ? 0 : ENOMEM;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */
int
sys_mlockall(struct proc *p, void *v, register_t *retval)
{
	struct sys_mlockall_args /* {
		syscallarg(int) flags;
	} */ *uap = v;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 ||
	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return EINVAL;

#ifndef pmap_wired_count
	if ((error = suser(p)) != 0)
		return error;
#endif

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    lim_cur(RLIMIT_MEMLOCK));
	if (error != 0 && error != ENOMEM)
		return EAGAIN;
	return error;
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */
int
sys_munlockall(struct proc *p, void *v, register_t *retval)
{

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return 0;
}

/*
 * common code for mmapanon and mmapfile to lock a mapping
 */
int
uvm_mmaplock(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vsize_t locklimit)
{
	int error;

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 */
	if (prot == PROT_NONE) {
		/*
		 * No more work to do in this case.
		 */
		return 0;
	}

	vm_map_lock(map);
	if (map->flags & VM_MAP_WIREFUTURE) {
		KERNEL_LOCK();
		if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
#ifdef pmap_wired_count
		    || (locklimit != 0 && (size +
		    ptoa(pmap_wired_count(vm_map_pmap(map)))) >
		    locklimit)
#endif
		) {
			error = ENOMEM;
			vm_map_unlock(map);
			/* unmap the region! */
			uvm_unmap(map, *addr, *addr + size);
			KERNEL_UNLOCK();
			return error;
		}
		/*
		 * uvm_map_pageable() always returns the map
		 * unlocked.
		 */
		error = uvm_map_pageable(map, *addr, *addr + size,
		    FALSE, UVM_LK_ENTER);
		if (error != 0) {
			/* unmap the region! */
			uvm_unmap(map, *addr, *addr + size);
			KERNEL_UNLOCK();
			return error;
		}
		KERNEL_UNLOCK();
		return 0;
	}
	vm_map_unlock(map);
	return 0;
}

/*
 * uvm_mmapanon: internal version of mmap for anons
 *
 * - used by sys_mmap
 */
int
uvm_mmapanon(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vsize_t locklimit, struct proc *p)
{
	int error;
	int advice = MADV_NORMAL;
	unsigned int uvmflag = 0;
	vsize_t align = 0;	/* userland page size */

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */
	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);	/* round */
	} else {
		if (*addr & PAGE_MASK)
			return EINVAL;

		uvmflag |= UVM_FLAG_FIXED;
		if ((flags & __MAP_NOREPLACE) == 0)
			uvmflag |= UVM_FLAG_UNMAP;
	}

	if ((flags & MAP_FIXED) == 0 && size >= __LDPGSZ)
		align = __LDPGSZ;
	if ((flags & MAP_SHARED) == 0)
		/* XXX: defer amap create */
		uvmflag |= UVM_FLAG_COPYONW;
	else
		/* shared: create amap now */
		uvmflag |= UVM_FLAG_OVERLAY;
	if (flags & MAP_STACK)
		uvmflag |= UVM_FLAG_STACK;
	if (flags & MAP_CONCEAL)
		uvmflag |= UVM_FLAG_CONCEAL;

	/* set up mapping flags */
	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
	    advice, uvmflag);

	error = uvm_mapanon(map, addr, size, align, uvmflag);

	if (error == 0)
		error = uvm_mmaplock(map, addr, size, prot, locklimit);
	return error;
}

/*
 * uvm_mmapfile: internal version of mmap for non-anons
 *
 * - used by sys_mmap
 * - caller must page-align the file offset
 */
int
uvm_mmapfile(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, struct vnode *vp, voff_t foff,
    vsize_t locklimit, struct proc *p)
{
	struct uvm_object *uobj;
	int error;
	int advice = MADV_NORMAL;
	unsigned int uvmflag = 0;
	vsize_t align = 0;	/* userland page size */

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */
	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);	/* round */
	} else {
		if (*addr & PAGE_MASK)
			return EINVAL;

		uvmflag |= UVM_FLAG_FIXED;
		if ((flags & __MAP_NOREPLACE) == 0)
			uvmflag |= UVM_FLAG_UNMAP;
	}

	/*
	 * attach to underlying vm object.
	 */
	if (vp->v_type != VCHR) {
		uobj = uvn_attach(vp, (flags & MAP_SHARED) ?
		    maxprot : (maxprot & ~PROT_WRITE));

		/*
		 * XXXCDC: hack from old code
		 * don't allow vnodes which have been mapped
		 * shared-writeable to persist [forces them to be
		 * flushed out when last reference goes].
		 * XXXCDC: interesting side effect: avoids a bug.
		 * note that in WRITE [ufs_readwrite.c] that we
		 * allocate buffer, uncache, and then do the write.
		 * the problem with this is that if the uncache causes
		 * VM data to be flushed to the same area of the file
		 * we are writing to... in that case we've got the
		 * buffer locked and our process goes to sleep forever.
		 *
		 * XXXCDC: checking maxprot protects us from the
		 * "persistbug" program but this is not a long term
		 * solution.
		 *
		 * XXXCDC: we don't bother calling uncache with the vp
		 * VOP_LOCKed since we know that we are already
		 * holding a valid reference to the uvn (from the
		 * uvn_attach above), and thus it is impossible for
		 * the uncache to kill the uvn and trigger I/O.
		 */
		if (flags & MAP_SHARED) {
			if ((prot & PROT_WRITE) ||
			    (maxprot & PROT_WRITE)) {
				uvm_vnp_uncache(vp);
			}
		}
	} else {
		uobj = udv_attach(vp->v_rdev,
		    (flags & MAP_SHARED) ? maxprot :
		    (maxprot & ~PROT_WRITE), foff, size);
		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC, but we don't really have a
		 * XXX better way of handling this, right now
		 */
		if (uobj == NULL && (prot & PROT_EXEC) == 0) {
			maxprot &= ~PROT_EXEC;
			uobj = udv_attach(vp->v_rdev,
			    (flags & MAP_SHARED) ? maxprot :
			    (maxprot & ~PROT_WRITE), foff, size);
		}
		advice = MADV_RANDOM;
	}

	if (uobj == NULL)
		return vp->v_type == VREG ? ENOMEM : EINVAL;

	if ((flags & MAP_SHARED) == 0)
		uvmflag |= UVM_FLAG_COPYONW;
	if (flags & __MAP_NOFAULT)
		uvmflag |= (UVM_FLAG_NOFAULT | UVM_FLAG_OVERLAY);
	if (flags & MAP_STACK)
		uvmflag |= UVM_FLAG_STACK;
	if (flags & MAP_CONCEAL)
		uvmflag |= UVM_FLAG_CONCEAL;

	/* set up mapping flags */
	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
	    advice, uvmflag);

	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);

	if (error == 0)
		return uvm_mmaplock(map, addr, size, prot, locklimit);

	/* errors: first detach from the uobj, if any. */
	if (uobj)
		uobj->pgops->pgo_detach(uobj);

	return error;
}

int
sys_kbind(struct proc *p, void *v, register_t *retval)
{
	struct sys_kbind_args /* {
		syscallarg(const struct __kbind *) param;
		syscallarg(size_t) psize;
		syscallarg(uint64_t) proc_cookie;
	} */ *uap = v;
	const struct __kbind *paramp;
	union {
		struct __kbind uk[KBIND_BLOCK_MAX];
		char upad[KBIND_BLOCK_MAX * sizeof(*paramp) + KBIND_DATA_MAX];
	} param;
	struct uvm_map_deadq dead_entries;
	struct process *pr = p->p_p;
	const char *data;
	vaddr_t baseva, last_baseva, endva, pageoffset, kva;
	size_t psize, s;
	u_long pc;
	int count, i, extra;
	int error, sigill = 0;

	/*
	 * extract syscall args from uap
	 */
	paramp = SCARG(uap, param);
	psize = SCARG(uap, psize);

	/*
	 * If paramp is NULL and we're uninitialized, disable the syscall
	 * for the process.  Raise SIGILL if paramp is NULL and we're
	 * already initialized.
	 *
	 * If paramp is non-NULL and we're uninitialized, do initialization.
	 * Otherwise, do security checks and raise SIGILL on failure.
	 */
	pc = PROC_PC(p);
	mtx_enter(&pr->ps_mtx);
	if (paramp == NULL) {
		/* ld.so disables kbind() when lazy binding is disabled */
		if (pr->ps_kbind_addr == 0)
			pr->ps_kbind_addr = BOGO_PC;
		/* pre-7.3 static binaries disable kbind */
		/* XXX delete check in 2026 */
		else if (pr->ps_kbind_addr != BOGO_PC)
			sigill = 1;
	} else if (pr->ps_kbind_addr == 0) {
		pr->ps_kbind_addr = pc;
		pr->ps_kbind_cookie = SCARG(uap, proc_cookie);
	} else if (pc != pr->ps_kbind_addr || pc == BOGO_PC ||
	    pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) {
		sigill = 1;
	}
	mtx_leave(&pr->ps_mtx);

	/* Raise SIGILL if something is off. */
	if (sigill) {
		KERNEL_LOCK();
		sigexit(p, SIGILL);
		/* NOTREACHED */
		KERNEL_UNLOCK();
	}

	/* We're done if we were disabling the syscall. */
	if (paramp == NULL)
		return 0;

	if (psize < sizeof(struct __kbind) || psize > sizeof(param))
		return EINVAL;
	if ((error = copyin(paramp, &param, psize)))
		return error;

	/*
	 * The param argument points to an array of __kbind structures
	 * followed by the corresponding new data areas for them.  Verify
	 * that the sizes in the __kbind structures add up to the total
	 * size and find the start of the new area.
	 */
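	/*
	 * For example, with two __kbind blocks the copied-in buffer
	 * looks like this (block count and sizes hypothetical):
	 *
	 *	+-----------+-----------+------------------+------------------+
	 *	| __kbind 0 | __kbind 1 | kb_size[0] bytes | kb_size[1] bytes |
	 *	+-----------+-----------+------------------+------------------+
	 *
	 * i.e. psize == count * sizeof(struct __kbind) + the sum of the
	 * kb_size fields.
	 */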
	paramp = &param.uk[0];
	s = psize;
	for (count = 0; s > 0 && count < KBIND_BLOCK_MAX; count++) {
		if (s < sizeof(*paramp))
			return EINVAL;
		s -= sizeof(*paramp);

		baseva = (vaddr_t)paramp[count].kb_addr;
		endva = baseva + paramp[count].kb_size - 1;
		if (paramp[count].kb_addr == NULL ||
		    paramp[count].kb_size == 0 ||
		    paramp[count].kb_size > KBIND_DATA_MAX ||
		    baseva >= VM_MAXUSER_ADDRESS ||
		    endva >= VM_MAXUSER_ADDRESS ||
		    s < paramp[count].kb_size)
			return EINVAL;

		s -= paramp[count].kb_size;
	}
	if (s > 0)
		return EINVAL;
	data = (const char *)&paramp[count];

	/* all looks good, so do the bindings */
	last_baseva = VM_MAXUSER_ADDRESS;
	kva = 0;
	TAILQ_INIT(&dead_entries);
	for (i = 0; i < count; i++) {
		baseva = (vaddr_t)paramp[i].kb_addr;
		s = paramp[i].kb_size;
		pageoffset = baseva & PAGE_MASK;
		baseva = trunc_page(baseva);

		/* hppa at least runs PLT entries over page edge */
		extra = (pageoffset + s) & PAGE_MASK;
		if (extra > pageoffset)
			extra = 0;
		else
			s -= extra;
redo:
		/* make sure the desired page is mapped into kernel_map */
		if (baseva != last_baseva) {
			if (kva != 0) {
				vm_map_lock(kernel_map);
				uvm_unmap_remove(kernel_map, kva,
				    kva+PAGE_SIZE, &dead_entries,
				    FALSE, TRUE, FALSE);	/* XXX */
				vm_map_unlock(kernel_map);
				kva = 0;
			}
			if ((error = uvm_map_extract(&p->p_vmspace->vm_map,
			    baseva, PAGE_SIZE, &kva, UVM_EXTRACT_FIXPROT)))
				break;
			last_baseva = baseva;
		}

		/* do the update */
		if ((error = kcopy(data, (char *)kva + pageoffset, s)))
			break;
		data += s;

		if (extra > 0) {
			baseva += PAGE_SIZE;
			s = extra;
			pageoffset = 0;
			extra = 0;
			goto redo;
		}
	}

	if (kva != 0) {
		vm_map_lock(kernel_map);
		uvm_unmap_remove(kernel_map, kva, kva+PAGE_SIZE,
		    &dead_entries, FALSE, TRUE, FALSE);	/* XXX */
		vm_map_unlock(kernel_map);
	}
	uvm_unmap_detach(&dead_entries, AMAP_REFALL);

	return error;
}