1 /* 2 * Copyright (c) 1988 University of Utah. 3 * Copyright (c) 1991, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the Systems Programming Group of the University of Utah Computer 8 * Science Department. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 39 * 40 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 41 * $FreeBSD: src/sys/vm/vm_mmap.c,v 1.108.2.6 2002/07/02 20:06:19 dillon Exp $ 42 * $DragonFly: src/sys/vm/vm_mmap.c,v 1.2 2003/06/17 04:29:00 dillon Exp $ 43 */ 44 45 /* 46 * Mapped file (mmap) interface to VM 47 */ 48 49 #include "opt_compat.h" 50 51 #include <sys/param.h> 52 #include <sys/kernel.h> 53 #include <sys/systm.h> 54 #include <sys/sysproto.h> 55 #include <sys/filedesc.h> 56 #include <sys/proc.h> 57 #include <sys/resource.h> 58 #include <sys/resourcevar.h> 59 #include <sys/vnode.h> 60 #include <sys/fcntl.h> 61 #include <sys/file.h> 62 #include <sys/mman.h> 63 #include <sys/conf.h> 64 #include <sys/stat.h> 65 #include <sys/vmmeter.h> 66 #include <sys/sysctl.h> 67 68 #include <vm/vm.h> 69 #include <vm/vm_param.h> 70 #include <sys/lock.h> 71 #include <vm/pmap.h> 72 #include <vm/vm_map.h> 73 #include <vm/vm_object.h> 74 #include <vm/vm_page.h> 75 #include <vm/vm_pager.h> 76 #include <vm/vm_pageout.h> 77 #include <vm/vm_extern.h> 78 #include <vm/vm_page.h> 79 #include <vm/vm_kern.h> 80 81 #ifndef _SYS_SYSPROTO_H_ 82 struct sbrk_args { 83 int incr; 84 }; 85 #endif 86 87 static int max_proc_mmap; 88 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, ""); 89 90 /* 91 * Set the maximum number of vm_map_entry structures per process. Roughly 92 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 93 * of our KVM malloc space still results in generous limits. We want a 94 * default that is good enough to prevent the kernel running out of resources 95 * if attacked from compromised user account but generous enough such that 96 * multi-threaded processes are not unduly inconvenienced. 97 */ 98 99 static void vmmapentry_rsrc_init __P((void *)); 100 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL) 101 102 static void 103 vmmapentry_rsrc_init(dummy) 104 void *dummy; 105 { 106 max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); 107 max_proc_mmap /= 100; 108 } 109 110 /* ARGSUSED */ 111 int 112 sbrk(p, uap) 113 struct proc *p; 114 struct sbrk_args *uap; 115 { 116 117 /* Not yet implemented */ 118 return (EOPNOTSUPP); 119 } 120 121 #ifndef _SYS_SYSPROTO_H_ 122 struct sstk_args { 123 int incr; 124 }; 125 #endif 126 127 /* ARGSUSED */ 128 int 129 sstk(p, uap) 130 struct proc *p; 131 struct sstk_args *uap; 132 { 133 134 /* Not yet implemented */ 135 return (EOPNOTSUPP); 136 } 137 138 #if defined(COMPAT_43) || defined(COMPAT_SUNOS) 139 #ifndef _SYS_SYSPROTO_H_ 140 struct getpagesize_args { 141 int dummy; 142 }; 143 #endif 144 145 /* ARGSUSED */ 146 int 147 ogetpagesize(p, uap) 148 struct proc *p; 149 struct getpagesize_args *uap; 150 { 151 152 p->p_retval[0] = PAGE_SIZE; 153 return (0); 154 } 155 #endif /* COMPAT_43 || COMPAT_SUNOS */ 156 157 158 /* 159 * Memory Map (mmap) system call. Note that the file offset 160 * and address are allowed to be NOT page aligned, though if 161 * the MAP_FIXED flag it set, both must have the same remainder 162 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 163 * page-aligned, the actual mapping starts at trunc_page(addr) 164 * and the return value is adjusted up by the page offset. 165 * 166 * Generally speaking, only character devices which are themselves 167 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 168 * there would be no cache coherency between a descriptor and a VM mapping 169 * both to the same character device. 170 * 171 * Block devices can be mmap'd no matter what they represent. Cache coherency 172 * is maintained as long as you do not write directly to the underlying 173 * character device. 174 */ 175 #ifndef _SYS_SYSPROTO_H_ 176 struct mmap_args { 177 void *addr; 178 size_t len; 179 int prot; 180 int flags; 181 int fd; 182 long pad; 183 off_t pos; 184 }; 185 #endif 186 187 int 188 mmap(p, uap) 189 struct proc *p; 190 register struct mmap_args *uap; 191 { 192 register struct filedesc *fdp = p->p_fd; 193 register struct file *fp = NULL; 194 struct vnode *vp; 195 vm_offset_t addr; 196 vm_size_t size, pageoff; 197 vm_prot_t prot, maxprot; 198 void *handle; 199 int flags, error; 200 int disablexworkaround; 201 off_t pos; 202 struct vmspace *vms = p->p_vmspace; 203 vm_object_t obj; 204 205 addr = (vm_offset_t) uap->addr; 206 size = uap->len; 207 prot = uap->prot & VM_PROT_ALL; 208 flags = uap->flags; 209 pos = uap->pos; 210 211 /* make sure mapping fits into numeric range etc */ 212 if ((ssize_t) uap->len < 0 || 213 ((flags & MAP_ANON) && uap->fd != -1)) 214 return (EINVAL); 215 216 if (flags & MAP_STACK) { 217 if ((uap->fd != -1) || 218 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 219 return (EINVAL); 220 flags |= MAP_ANON; 221 pos = 0; 222 } 223 224 /* 225 * Align the file position to a page boundary, 226 * and save its page offset component. 227 */ 228 pageoff = (pos & PAGE_MASK); 229 pos -= pageoff; 230 231 /* Adjust size for rounding (on both ends). */ 232 size += pageoff; /* low end... */ 233 size = (vm_size_t) round_page(size); /* hi end */ 234 235 /* 236 * Check for illegal addresses. Watch out for address wrap... Note 237 * that VM_*_ADDRESS are not constants due to casts (argh). 238 */ 239 if (flags & MAP_FIXED) { 240 /* 241 * The specified address must have the same remainder 242 * as the file offset taken modulo PAGE_SIZE, so it 243 * should be aligned after adjustment by pageoff. 244 */ 245 addr -= pageoff; 246 if (addr & PAGE_MASK) 247 return (EINVAL); 248 /* Address range must be all in user VM space. */ 249 if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) 250 return (EINVAL); 251 #ifndef i386 252 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) 253 return (EINVAL); 254 #endif 255 if (addr + size < addr) 256 return (EINVAL); 257 } 258 /* 259 * XXX for non-fixed mappings where no hint is provided or 260 * the hint would fall in the potential heap space, 261 * place it after the end of the largest possible heap. 262 * 263 * There should really be a pmap call to determine a reasonable 264 * location. 265 */ 266 else if (addr == 0 || 267 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 268 addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) 269 addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz); 270 271 if (flags & MAP_ANON) { 272 /* 273 * Mapping blank space is trivial. 274 */ 275 handle = NULL; 276 maxprot = VM_PROT_ALL; 277 pos = 0; 278 } else { 279 /* 280 * Mapping file, get fp for validation. Obtain vnode and make 281 * sure it is of appropriate type. 282 */ 283 if (((unsigned) uap->fd) >= fdp->fd_nfiles || 284 (fp = fdp->fd_ofiles[uap->fd]) == NULL) 285 return (EBADF); 286 if (fp->f_type != DTYPE_VNODE) 287 return (EINVAL); 288 /* 289 * POSIX shared-memory objects are defined to have 290 * kernel persistence, and are not defined to support 291 * read(2)/write(2) -- or even open(2). Thus, we can 292 * use MAP_ASYNC to trade on-disk coherence for speed. 293 * The shm_open(3) library routine turns on the FPOSIXSHM 294 * flag to request this behavior. 295 */ 296 if (fp->f_flag & FPOSIXSHM) 297 flags |= MAP_NOSYNC; 298 vp = (struct vnode *) fp->f_data; 299 if (vp->v_type != VREG && vp->v_type != VCHR) 300 return (EINVAL); 301 if (vp->v_type == VREG) { 302 /* 303 * Get the proper underlying object 304 */ 305 if (VOP_GETVOBJECT(vp, &obj) != 0) 306 return (EINVAL); 307 vp = (struct vnode*)obj->handle; 308 } 309 310 /* 311 * don't let the descriptor disappear on us if we block 312 */ 313 fhold(fp); 314 315 /* 316 * XXX hack to handle use of /dev/zero to map anon memory (ala 317 * SunOS). 318 */ 319 if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { 320 handle = NULL; 321 maxprot = VM_PROT_ALL; 322 flags |= MAP_ANON; 323 pos = 0; 324 } else { 325 /* 326 * cdevs does not provide private mappings of any kind. 327 */ 328 /* 329 * However, for XIG X server to continue to work, 330 * we should allow the superuser to do it anyway. 331 * We only allow it at securelevel < 1. 332 * (Because the XIG X server writes directly to video 333 * memory via /dev/mem, it should never work at any 334 * other securelevel. 335 * XXX this will have to go 336 */ 337 if (securelevel >= 1) 338 disablexworkaround = 1; 339 else 340 disablexworkaround = suser(p); 341 if (vp->v_type == VCHR && disablexworkaround && 342 (flags & (MAP_PRIVATE|MAP_COPY))) { 343 error = EINVAL; 344 goto done; 345 } 346 /* 347 * Ensure that file and memory protections are 348 * compatible. Note that we only worry about 349 * writability if mapping is shared; in this case, 350 * current and max prot are dictated by the open file. 351 * XXX use the vnode instead? Problem is: what 352 * credentials do we use for determination? What if 353 * proc does a setuid? 354 */ 355 maxprot = VM_PROT_EXECUTE; /* ??? */ 356 if (fp->f_flag & FREAD) { 357 maxprot |= VM_PROT_READ; 358 } else if (prot & PROT_READ) { 359 error = EACCES; 360 goto done; 361 } 362 /* 363 * If we are sharing potential changes (either via 364 * MAP_SHARED or via the implicit sharing of character 365 * device mappings), and we are trying to get write 366 * permission although we opened it without asking 367 * for it, bail out. Check for superuser, only if 368 * we're at securelevel < 1, to allow the XIG X server 369 * to continue to work. 370 */ 371 372 if ((flags & MAP_SHARED) != 0 || 373 (vp->v_type == VCHR && disablexworkaround)) { 374 if ((fp->f_flag & FWRITE) != 0) { 375 struct vattr va; 376 if ((error = 377 VOP_GETATTR(vp, &va, 378 p->p_ucred, p))) { 379 goto done; 380 } 381 if ((va.va_flags & 382 (IMMUTABLE|APPEND)) == 0) { 383 maxprot |= VM_PROT_WRITE; 384 } else if (prot & PROT_WRITE) { 385 error = EPERM; 386 goto done; 387 } 388 } else if ((prot & PROT_WRITE) != 0) { 389 error = EACCES; 390 goto done; 391 } 392 } else { 393 maxprot |= VM_PROT_WRITE; 394 } 395 handle = (void *)vp; 396 } 397 } 398 399 /* 400 * Do not allow more then a certain number of vm_map_entry structures 401 * per process. Scale with the number of rforks sharing the map 402 * to make the limit reasonable for threads. 403 */ 404 if (max_proc_mmap && 405 vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { 406 error = ENOMEM; 407 goto done; 408 } 409 410 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, 411 flags, handle, pos); 412 if (error == 0) 413 p->p_retval[0] = (register_t) (addr + pageoff); 414 done: 415 if (fp) 416 fdrop(fp, p); 417 return (error); 418 } 419 420 #ifdef COMPAT_43 421 #ifndef _SYS_SYSPROTO_H_ 422 struct ommap_args { 423 caddr_t addr; 424 int len; 425 int prot; 426 int flags; 427 int fd; 428 long pos; 429 }; 430 #endif 431 int 432 ommap(p, uap) 433 struct proc *p; 434 register struct ommap_args *uap; 435 { 436 struct mmap_args nargs; 437 static const char cvtbsdprot[8] = { 438 0, 439 PROT_EXEC, 440 PROT_WRITE, 441 PROT_EXEC | PROT_WRITE, 442 PROT_READ, 443 PROT_EXEC | PROT_READ, 444 PROT_WRITE | PROT_READ, 445 PROT_EXEC | PROT_WRITE | PROT_READ, 446 }; 447 448 #define OMAP_ANON 0x0002 449 #define OMAP_COPY 0x0020 450 #define OMAP_SHARED 0x0010 451 #define OMAP_FIXED 0x0100 452 #define OMAP_INHERIT 0x0800 453 454 nargs.addr = uap->addr; 455 nargs.len = uap->len; 456 nargs.prot = cvtbsdprot[uap->prot & 0x7]; 457 nargs.flags = 0; 458 if (uap->flags & OMAP_ANON) 459 nargs.flags |= MAP_ANON; 460 if (uap->flags & OMAP_COPY) 461 nargs.flags |= MAP_COPY; 462 if (uap->flags & OMAP_SHARED) 463 nargs.flags |= MAP_SHARED; 464 else 465 nargs.flags |= MAP_PRIVATE; 466 if (uap->flags & OMAP_FIXED) 467 nargs.flags |= MAP_FIXED; 468 if (uap->flags & OMAP_INHERIT) 469 nargs.flags |= MAP_INHERIT; 470 nargs.fd = uap->fd; 471 nargs.pos = uap->pos; 472 return (mmap(p, &nargs)); 473 } 474 #endif /* COMPAT_43 */ 475 476 477 #ifndef _SYS_SYSPROTO_H_ 478 struct msync_args { 479 void *addr; 480 int len; 481 int flags; 482 }; 483 #endif 484 int 485 msync(p, uap) 486 struct proc *p; 487 struct msync_args *uap; 488 { 489 vm_offset_t addr; 490 vm_size_t size, pageoff; 491 int flags; 492 vm_map_t map; 493 int rv; 494 495 addr = (vm_offset_t) uap->addr; 496 size = uap->len; 497 flags = uap->flags; 498 499 pageoff = (addr & PAGE_MASK); 500 addr -= pageoff; 501 size += pageoff; 502 size = (vm_size_t) round_page(size); 503 if (addr + size < addr) 504 return(EINVAL); 505 506 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 507 return (EINVAL); 508 509 map = &p->p_vmspace->vm_map; 510 511 /* 512 * XXX Gak! If size is zero we are supposed to sync "all modified 513 * pages with the region containing addr". Unfortunately, we don't 514 * really keep track of individual mmaps so we approximate by flushing 515 * the range of the map entry containing addr. This can be incorrect 516 * if the region splits or is coalesced with a neighbor. 517 */ 518 if (size == 0) { 519 vm_map_entry_t entry; 520 521 vm_map_lock_read(map); 522 rv = vm_map_lookup_entry(map, addr, &entry); 523 vm_map_unlock_read(map); 524 if (rv == FALSE) 525 return (EINVAL); 526 addr = entry->start; 527 size = entry->end - entry->start; 528 } 529 530 /* 531 * Clean the pages and interpret the return value. 532 */ 533 rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0, 534 (flags & MS_INVALIDATE) != 0); 535 536 switch (rv) { 537 case KERN_SUCCESS: 538 break; 539 case KERN_INVALID_ADDRESS: 540 return (EINVAL); /* Sun returns ENOMEM? */ 541 case KERN_FAILURE: 542 return (EIO); 543 default: 544 return (EINVAL); 545 } 546 547 return (0); 548 } 549 550 #ifndef _SYS_SYSPROTO_H_ 551 struct munmap_args { 552 void *addr; 553 size_t len; 554 }; 555 #endif 556 int 557 munmap(p, uap) 558 register struct proc *p; 559 register struct munmap_args *uap; 560 { 561 vm_offset_t addr; 562 vm_size_t size, pageoff; 563 vm_map_t map; 564 565 addr = (vm_offset_t) uap->addr; 566 size = uap->len; 567 568 pageoff = (addr & PAGE_MASK); 569 addr -= pageoff; 570 size += pageoff; 571 size = (vm_size_t) round_page(size); 572 if (addr + size < addr) 573 return(EINVAL); 574 575 if (size == 0) 576 return (0); 577 578 /* 579 * Check for illegal addresses. Watch out for address wrap... Note 580 * that VM_*_ADDRESS are not constants due to casts (argh). 581 */ 582 if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) 583 return (EINVAL); 584 #ifndef i386 585 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) 586 return (EINVAL); 587 #endif 588 map = &p->p_vmspace->vm_map; 589 /* 590 * Make sure entire range is allocated. 591 */ 592 if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) 593 return (EINVAL); 594 /* returns nothing but KERN_SUCCESS anyway */ 595 (void) vm_map_remove(map, addr, addr + size); 596 return (0); 597 } 598 599 #if 0 600 void 601 munmapfd(p, fd) 602 struct proc *p; 603 int fd; 604 { 605 /* 606 * XXX should unmap any regions mapped to this file 607 */ 608 p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; 609 } 610 #endif 611 612 #ifndef _SYS_SYSPROTO_H_ 613 struct mprotect_args { 614 const void *addr; 615 size_t len; 616 int prot; 617 }; 618 #endif 619 int 620 mprotect(p, uap) 621 struct proc *p; 622 struct mprotect_args *uap; 623 { 624 vm_offset_t addr; 625 vm_size_t size, pageoff; 626 register vm_prot_t prot; 627 628 addr = (vm_offset_t) uap->addr; 629 size = uap->len; 630 prot = uap->prot & VM_PROT_ALL; 631 #if defined(VM_PROT_READ_IS_EXEC) 632 if (prot & VM_PROT_READ) 633 prot |= VM_PROT_EXECUTE; 634 #endif 635 636 pageoff = (addr & PAGE_MASK); 637 addr -= pageoff; 638 size += pageoff; 639 size = (vm_size_t) round_page(size); 640 if (addr + size < addr) 641 return(EINVAL); 642 643 switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, 644 FALSE)) { 645 case KERN_SUCCESS: 646 return (0); 647 case KERN_PROTECTION_FAILURE: 648 return (EACCES); 649 } 650 return (EINVAL); 651 } 652 653 #ifndef _SYS_SYSPROTO_H_ 654 struct minherit_args { 655 void *addr; 656 size_t len; 657 int inherit; 658 }; 659 #endif 660 int 661 minherit(p, uap) 662 struct proc *p; 663 struct minherit_args *uap; 664 { 665 vm_offset_t addr; 666 vm_size_t size, pageoff; 667 register vm_inherit_t inherit; 668 669 addr = (vm_offset_t)uap->addr; 670 size = uap->len; 671 inherit = uap->inherit; 672 673 pageoff = (addr & PAGE_MASK); 674 addr -= pageoff; 675 size += pageoff; 676 size = (vm_size_t) round_page(size); 677 if (addr + size < addr) 678 return(EINVAL); 679 680 switch (vm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size, 681 inherit)) { 682 case KERN_SUCCESS: 683 return (0); 684 case KERN_PROTECTION_FAILURE: 685 return (EACCES); 686 } 687 return (EINVAL); 688 } 689 690 #ifndef _SYS_SYSPROTO_H_ 691 struct madvise_args { 692 void *addr; 693 size_t len; 694 int behav; 695 }; 696 #endif 697 698 /* ARGSUSED */ 699 int 700 madvise(p, uap) 701 struct proc *p; 702 struct madvise_args *uap; 703 { 704 vm_offset_t start, end; 705 706 /* 707 * Check for illegal behavior 708 */ 709 if (uap->behav < 0 || uap->behav > MADV_CORE) 710 return (EINVAL); 711 /* 712 * Check for illegal addresses. Watch out for address wrap... Note 713 * that VM_*_ADDRESS are not constants due to casts (argh). 714 */ 715 if (VM_MAXUSER_ADDRESS > 0 && 716 ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS) 717 return (EINVAL); 718 #ifndef i386 719 if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS) 720 return (EINVAL); 721 #endif 722 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) 723 return (EINVAL); 724 725 /* 726 * Since this routine is only advisory, we default to conservative 727 * behavior. 728 */ 729 start = trunc_page((vm_offset_t) uap->addr); 730 end = round_page((vm_offset_t) uap->addr + uap->len); 731 732 if (vm_map_madvise(&p->p_vmspace->vm_map, start, end, uap->behav)) 733 return (EINVAL); 734 return (0); 735 } 736 737 #ifndef _SYS_SYSPROTO_H_ 738 struct mincore_args { 739 const void *addr; 740 size_t len; 741 char *vec; 742 }; 743 #endif 744 745 /* ARGSUSED */ 746 int 747 mincore(p, uap) 748 struct proc *p; 749 struct mincore_args *uap; 750 { 751 vm_offset_t addr, first_addr; 752 vm_offset_t end, cend; 753 pmap_t pmap; 754 vm_map_t map; 755 char *vec; 756 int error; 757 int vecindex, lastvecindex; 758 register vm_map_entry_t current; 759 vm_map_entry_t entry; 760 int mincoreinfo; 761 unsigned int timestamp; 762 763 /* 764 * Make sure that the addresses presented are valid for user 765 * mode. 766 */ 767 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 768 end = addr + (vm_size_t)round_page(uap->len); 769 if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS) 770 return (EINVAL); 771 if (end < addr) 772 return (EINVAL); 773 774 /* 775 * Address of byte vector 776 */ 777 vec = uap->vec; 778 779 map = &p->p_vmspace->vm_map; 780 pmap = vmspace_pmap(p->p_vmspace); 781 782 vm_map_lock_read(map); 783 RestartScan: 784 timestamp = map->timestamp; 785 786 if (!vm_map_lookup_entry(map, addr, &entry)) 787 entry = entry->next; 788 789 /* 790 * Do this on a map entry basis so that if the pages are not 791 * in the current processes address space, we can easily look 792 * up the pages elsewhere. 793 */ 794 lastvecindex = -1; 795 for(current = entry; 796 (current != &map->header) && (current->start < end); 797 current = current->next) { 798 799 /* 800 * ignore submaps (for now) or null objects 801 */ 802 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 803 current->object.vm_object == NULL) 804 continue; 805 806 /* 807 * limit this scan to the current map entry and the 808 * limits for the mincore call 809 */ 810 if (addr < current->start) 811 addr = current->start; 812 cend = current->end; 813 if (cend > end) 814 cend = end; 815 816 /* 817 * scan this entry one page at a time 818 */ 819 while(addr < cend) { 820 /* 821 * Check pmap first, it is likely faster, also 822 * it can provide info as to whether we are the 823 * one referencing or modifying the page. 824 */ 825 mincoreinfo = pmap_mincore(pmap, addr); 826 if (!mincoreinfo) { 827 vm_pindex_t pindex; 828 vm_ooffset_t offset; 829 vm_page_t m; 830 /* 831 * calculate the page index into the object 832 */ 833 offset = current->offset + (addr - current->start); 834 pindex = OFF_TO_IDX(offset); 835 m = vm_page_lookup(current->object.vm_object, 836 pindex); 837 /* 838 * if the page is resident, then gather information about 839 * it. 840 */ 841 if (m) { 842 mincoreinfo = MINCORE_INCORE; 843 if (m->dirty || 844 pmap_is_modified(m)) 845 mincoreinfo |= MINCORE_MODIFIED_OTHER; 846 if ((m->flags & PG_REFERENCED) || 847 pmap_ts_referenced(m)) { 848 vm_page_flag_set(m, PG_REFERENCED); 849 mincoreinfo |= MINCORE_REFERENCED_OTHER; 850 } 851 } 852 } 853 854 /* 855 * subyte may page fault. In case it needs to modify 856 * the map, we release the lock. 857 */ 858 vm_map_unlock_read(map); 859 860 /* 861 * calculate index into user supplied byte vector 862 */ 863 vecindex = OFF_TO_IDX(addr - first_addr); 864 865 /* 866 * If we have skipped map entries, we need to make sure that 867 * the byte vector is zeroed for those skipped entries. 868 */ 869 while((lastvecindex + 1) < vecindex) { 870 error = subyte( vec + lastvecindex, 0); 871 if (error) { 872 return (EFAULT); 873 } 874 ++lastvecindex; 875 } 876 877 /* 878 * Pass the page information to the user 879 */ 880 error = subyte( vec + vecindex, mincoreinfo); 881 if (error) { 882 return (EFAULT); 883 } 884 885 /* 886 * If the map has changed, due to the subyte, the previous 887 * output may be invalid. 888 */ 889 vm_map_lock_read(map); 890 if (timestamp != map->timestamp) 891 goto RestartScan; 892 893 lastvecindex = vecindex; 894 addr += PAGE_SIZE; 895 } 896 } 897 898 /* 899 * subyte may page fault. In case it needs to modify 900 * the map, we release the lock. 901 */ 902 vm_map_unlock_read(map); 903 904 /* 905 * Zero the last entries in the byte vector. 906 */ 907 vecindex = OFF_TO_IDX(end - first_addr); 908 while((lastvecindex + 1) < vecindex) { 909 error = subyte( vec + lastvecindex, 0); 910 if (error) { 911 return (EFAULT); 912 } 913 ++lastvecindex; 914 } 915 916 /* 917 * If the map has changed, due to the subyte, the previous 918 * output may be invalid. 919 */ 920 vm_map_lock_read(map); 921 if (timestamp != map->timestamp) 922 goto RestartScan; 923 vm_map_unlock_read(map); 924 925 return (0); 926 } 927 928 #ifndef _SYS_SYSPROTO_H_ 929 struct mlock_args { 930 const void *addr; 931 size_t len; 932 }; 933 #endif 934 int 935 mlock(p, uap) 936 struct proc *p; 937 struct mlock_args *uap; 938 { 939 vm_offset_t addr; 940 vm_size_t size, pageoff; 941 int error; 942 943 addr = (vm_offset_t) uap->addr; 944 size = uap->len; 945 946 pageoff = (addr & PAGE_MASK); 947 addr -= pageoff; 948 size += pageoff; 949 size = (vm_size_t) round_page(size); 950 951 /* disable wrap around */ 952 if (addr + size < addr) 953 return (EINVAL); 954 955 if (atop(size) + cnt.v_wire_count > vm_page_max_wired) 956 return (EAGAIN); 957 958 #ifdef pmap_wired_count 959 if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > 960 p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) 961 return (ENOMEM); 962 #else 963 error = suser(p); 964 if (error) 965 return (error); 966 #endif 967 968 error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr, addr + size, FALSE); 969 return (error == KERN_SUCCESS ? 0 : ENOMEM); 970 } 971 972 #ifndef _SYS_SYSPROTO_H_ 973 struct mlockall_args { 974 int how; 975 }; 976 #endif 977 978 int 979 mlockall(p, uap) 980 struct proc *p; 981 struct mlockall_args *uap; 982 { 983 return 0; 984 } 985 986 #ifndef _SYS_SYSPROTO_H_ 987 struct mlockall_args { 988 int how; 989 }; 990 #endif 991 992 int 993 munlockall(p, uap) 994 struct proc *p; 995 struct munlockall_args *uap; 996 { 997 return 0; 998 } 999 1000 #ifndef _SYS_SYSPROTO_H_ 1001 struct munlock_args { 1002 const void *addr; 1003 size_t len; 1004 }; 1005 #endif 1006 int 1007 munlock(p, uap) 1008 struct proc *p; 1009 struct munlock_args *uap; 1010 { 1011 vm_offset_t addr; 1012 vm_size_t size, pageoff; 1013 int error; 1014 1015 addr = (vm_offset_t) uap->addr; 1016 size = uap->len; 1017 1018 pageoff = (addr & PAGE_MASK); 1019 addr -= pageoff; 1020 size += pageoff; 1021 size = (vm_size_t) round_page(size); 1022 1023 /* disable wrap around */ 1024 if (addr + size < addr) 1025 return (EINVAL); 1026 1027 #ifndef pmap_wired_count 1028 error = suser(p); 1029 if (error) 1030 return (error); 1031 #endif 1032 1033 error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr, addr + size, TRUE); 1034 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1035 } 1036 1037 /* 1038 * Internal version of mmap. 1039 * Currently used by mmap, exec, and sys5 shared memory. 1040 * Handle is either a vnode pointer or NULL for MAP_ANON. 1041 */ 1042 int 1043 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1044 vm_prot_t maxprot, int flags, 1045 void *handle, 1046 vm_ooffset_t foff) 1047 { 1048 boolean_t fitit; 1049 vm_object_t object; 1050 struct vnode *vp = NULL; 1051 objtype_t type; 1052 int rv = KERN_SUCCESS; 1053 vm_ooffset_t objsize; 1054 int docow; 1055 struct proc *p = curproc; 1056 1057 if (size == 0) 1058 return (0); 1059 1060 objsize = size = round_page(size); 1061 1062 if (p->p_vmspace->vm_map.size + size > 1063 p->p_rlimit[RLIMIT_VMEM].rlim_cur) { 1064 return(ENOMEM); 1065 } 1066 1067 /* 1068 * We currently can only deal with page aligned file offsets. 1069 * The check is here rather than in the syscall because the 1070 * kernel calls this function internally for other mmaping 1071 * operations (such as in exec) and non-aligned offsets will 1072 * cause pmap inconsistencies...so we want to be sure to 1073 * disallow this in all cases. 1074 */ 1075 if (foff & PAGE_MASK) 1076 return (EINVAL); 1077 1078 if ((flags & MAP_FIXED) == 0) { 1079 fitit = TRUE; 1080 *addr = round_page(*addr); 1081 } else { 1082 if (*addr != trunc_page(*addr)) 1083 return (EINVAL); 1084 fitit = FALSE; 1085 (void) vm_map_remove(map, *addr, *addr + size); 1086 } 1087 1088 /* 1089 * Lookup/allocate object. 1090 */ 1091 if (flags & MAP_ANON) { 1092 type = OBJT_DEFAULT; 1093 /* 1094 * Unnamed anonymous regions always start at 0. 1095 */ 1096 if (handle == 0) 1097 foff = 0; 1098 } else { 1099 vp = (struct vnode *) handle; 1100 if (vp->v_type == VCHR) { 1101 type = OBJT_DEVICE; 1102 handle = (void *)(intptr_t)vp->v_rdev; 1103 } else { 1104 struct vattr vat; 1105 int error; 1106 1107 error = VOP_GETATTR(vp, &vat, p->p_ucred, p); 1108 if (error) 1109 return (error); 1110 objsize = round_page(vat.va_size); 1111 type = OBJT_VNODE; 1112 /* 1113 * if it is a regular file without any references 1114 * we do not need to sync it. 1115 */ 1116 if (vp->v_type == VREG && vat.va_nlink == 0) { 1117 flags |= MAP_NOSYNC; 1118 } 1119 } 1120 } 1121 1122 if (handle == NULL) { 1123 object = NULL; 1124 docow = 0; 1125 } else { 1126 object = vm_pager_allocate(type, 1127 handle, objsize, prot, foff); 1128 if (object == NULL) 1129 return (type == OBJT_DEVICE ? EINVAL : ENOMEM); 1130 docow = MAP_PREFAULT_PARTIAL; 1131 } 1132 1133 /* 1134 * Force device mappings to be shared. 1135 */ 1136 if (type == OBJT_DEVICE || type == OBJT_PHYS) { 1137 flags &= ~(MAP_PRIVATE|MAP_COPY); 1138 flags |= MAP_SHARED; 1139 } 1140 1141 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1142 docow |= MAP_COPY_ON_WRITE; 1143 if (flags & MAP_NOSYNC) 1144 docow |= MAP_DISABLE_SYNCER; 1145 if (flags & MAP_NOCORE) 1146 docow |= MAP_DISABLE_COREDUMP; 1147 1148 #if defined(VM_PROT_READ_IS_EXEC) 1149 if (prot & VM_PROT_READ) 1150 prot |= VM_PROT_EXECUTE; 1151 1152 if (maxprot & VM_PROT_READ) 1153 maxprot |= VM_PROT_EXECUTE; 1154 #endif 1155 1156 if (fitit) { 1157 *addr = pmap_addr_hint(object, *addr, size); 1158 } 1159 1160 if (flags & MAP_STACK) 1161 rv = vm_map_stack (map, *addr, size, prot, 1162 maxprot, docow); 1163 else 1164 rv = vm_map_find(map, object, foff, addr, size, fitit, 1165 prot, maxprot, docow); 1166 1167 if (rv != KERN_SUCCESS) { 1168 /* 1169 * Lose the object reference. Will destroy the 1170 * object if it's an unnamed anonymous mapping 1171 * or named anonymous without other references. 1172 */ 1173 vm_object_deallocate(object); 1174 goto out; 1175 } 1176 1177 /* 1178 * Shared memory is also shared with children. 1179 */ 1180 if (flags & (MAP_SHARED|MAP_INHERIT)) { 1181 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); 1182 if (rv != KERN_SUCCESS) { 1183 (void) vm_map_remove(map, *addr, *addr + size); 1184 goto out; 1185 } 1186 } 1187 out: 1188 switch (rv) { 1189 case KERN_SUCCESS: 1190 return (0); 1191 case KERN_INVALID_ADDRESS: 1192 case KERN_NO_SPACE: 1193 return (ENOMEM); 1194 case KERN_PROTECTION_FAILURE: 1195 return (EACCES); 1196 default: 1197 return (EINVAL); 1198 } 1199 } 1200