/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 * $FreeBSD: src/sys/vm/vm_mmap.c,v 1.108.2.6 2002/07/02 20:06:19 dillon Exp $
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include "opt_compat.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel running out of resources
 * if attacked from a compromised user account but generous enough such that
 * multi-threaded processes are not unduly inconvenienced.
 */
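/*
 * Illustrative arithmetic (assumed numbers, not taken from any
 * particular configuration): with a vm_kmem_size of 64MB and a
 * vm_map_entry of roughly 64 bytes, vmmapentry_rsrc_init() below
 * computes a max_proc_mmap of about 10000 entries per process.
 */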

static void vmmapentry_rsrc_init __P((void *));
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
	void *dummy;
{
	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}

/* ARGSUSED */
int
sbrk(p, uap)
	struct proc *p;
	struct sbrk_args *uap;
{

	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/* ARGSUSED */
int
sstk(p, uap)
	struct proc *p;
	struct sstk_args *uap;
{

	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(p, uap)
	struct proc *p;
	struct getpagesize_args *uap;
{

	p->p_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 || COMPAT_SUNOS */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
mmap(p, uap)
	struct proc *p;
	register struct mmap_args *uap;
{
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp = NULL;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	int disablexworkaround;
	off_t pos;
	struct vmspace *vms = p->p_vmspace;
	vm_object_t obj;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	/* make sure mapping fits into numeric range etc */
	if ((ssize_t) uap->len < 0 ||
	    ((flags & MAP_ANON) && uap->fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */
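
	/*
	 * Worked example (illustrative values): with 4K pages, pos =
	 * 0x12345 and len = 0x1000 give pageoff = 0x345, pos = 0x12000,
	 * and size = round_page(0x1345) = 0x2000, so the mapping covers
	 * both partially-touched pages.
	 */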

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
			return (EINVAL);
#ifndef i386
		if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
			return (EINVAL);
#endif
		if (addr + size < addr)
			return (EINVAL);
	}
	/*
	 * XXX for non-fixed mappings where no hint is provided or
	 * the hint would fall in the potential heap space,
	 * place it after the end of the largest possible heap.
	 *
	 * There should really be a pmap call to determine a reasonable
	 * location.
	 */
	else if (addr == 0 ||
	    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
	     addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
		addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);

	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	} else {
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and make
		 * sure it is of appropriate type.
		 */
		if (((unsigned) uap->fd) >= fdp->fd_nfiles ||
		    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE)
			return (EINVAL);
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_NOSYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = (struct vnode *) fp->f_data;
		if (vp->v_type != VREG && vp->v_type != VCHR)
			return (EINVAL);
		if (vp->v_type == VREG) {
			/*
			 * Get the proper underlying object
			 */
			if (VOP_GETVOBJECT(vp, &obj) != 0)
				return (EINVAL);
			vp = (struct vnode *)obj->handle;
		}

		/*
		 * don't let the descriptor disappear on us if we block
		 */
		fhold(fp);

		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (a la
		 * SunOS).
		 */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			handle = NULL;
			maxprot = VM_PROT_ALL;
			flags |= MAP_ANON;
			pos = 0;
		} else {
			/*
			 * cdevs do not provide private mappings of any kind.
			 *
			 * However, for the XIG X server to continue to work,
			 * we should allow the superuser to do it anyway.
			 * We only allow it at securelevel < 1.
			 * (Because the XIG X server writes directly to video
			 * memory via /dev/mem, it should never work at any
			 * other securelevel.)
			 * XXX this will have to go
			 */
			if (securelevel >= 1)
				disablexworkaround = 1;
			else
				disablexworkaround = suser(p);
			if (vp->v_type == VCHR && disablexworkaround &&
			    (flags & (MAP_PRIVATE|MAP_COPY))) {
				error = EINVAL;
				goto done;
			}
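			/*
			 * Summary of the checks below: maxprot is derived
			 * from the descriptor's open mode.  FREAD grants
			 * VM_PROT_READ; FWRITE grants VM_PROT_WRITE for
			 * shared (or implicitly shared device) mappings
			 * unless the file is immutable or append-only.
			 * Requesting more than the open mode allows fails
			 * with EACCES (or EPERM for immutable files).
			 */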
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination?  What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;	/* ??? */
			if (fp->f_flag & FREAD) {
				maxprot |= VM_PROT_READ;
			} else if (prot & PROT_READ) {
				error = EACCES;
				goto done;
			}
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out.  Check for superuser, only if
			 * we're at securelevel < 1, to allow the XIG X server
			 * to continue to work.
			 */
			if ((flags & MAP_SHARED) != 0 ||
			    (vp->v_type == VCHR && disablexworkaround)) {
				if ((fp->f_flag & FWRITE) != 0) {
					struct vattr va;

					if ((error =
					    VOP_GETATTR(vp, &va,
						p->p_ucred, p))) {
						goto done;
					}
					if ((va.va_flags &
					    (IMMUTABLE|APPEND)) == 0) {
						maxprot |= VM_PROT_WRITE;
					} else if (prot & PROT_WRITE) {
						error = EPERM;
						goto done;
					}
				} else if ((prot & PROT_WRITE) != 0) {
					error = EACCES;
					goto done;
				}
			} else {
				maxprot |= VM_PROT_WRITE;
			}
			handle = (void *)vp;
		}
	}

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
		error = ENOMEM;
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle, pos);
	if (error == 0)
		p->p_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, p);
	return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(p, uap)
	struct proc *p;
	register struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100
#define	OMAP_INHERIT	0x0800

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	if (uap->flags & OMAP_INHERIT)
		nargs.flags |= MAP_INHERIT;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (mmap(p, &nargs));
}
#endif				/* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	int len;
	int flags;
};
#endif
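/*
 * msync system call: flush modified pages in the given range to their
 * backing store.  MS_ASYNC maps to the "start I/O but do not wait"
 * mode of vm_map_clean() below; MS_INVALIDATE additionally invalidates
 * the cached pages.  Specifying both together is rejected with EINVAL.
 */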
int
msync(p, uap)
	struct proc *p;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/*
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we don't
	 * really keep track of individual mmaps so we approximate by flushing
	 * the range of the map entry containing addr.  This can be incorrect
	 * if the region splits or is coalesced with a neighbor.
	 */
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = vm_map_lookup_entry(map, addr, &entry);
		vm_map_unlock_read(map);
		if (rv == FALSE)
			return (EINVAL);
		addr = entry->start;
		size = entry->end - entry->start;
	}

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);

	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}

	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
munmap(p, uap)
	register struct proc *p;
	register struct munmap_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
		return (EINVAL);
#ifndef i386
	if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
		return (EINVAL);
#endif
	map = &p->p_vmspace->vm_map;
	/*
	 * Make sure entire range is allocated.
	 */
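	/*
	 * (Asking for VM_PROT_NONE makes vm_map_check_protection() test
	 * only that every page in the range is mapped, not that any
	 * particular access is permitted.)
	 */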
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
		return (EINVAL);
	/* returns nothing but KERN_SUCCESS anyway */
	(void) vm_map_remove(map, addr, addr + size);
	return (0);
}

#if 0
void
munmapfd(p, fd)
	struct proc *p;
	int fd;
{
	/*
	 * XXX should unmap any regions mapped to this file
	 */
	p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
}
#endif

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
mprotect(p, uap)
	struct proc *p;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	register vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;
#endif

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
	    FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
minherit(p, uap)
	struct proc *p;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	register vm_inherit_t inherit;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
	    inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/* ARGSUSED */
int
madvise(p, uap)
	struct proc *p;
	struct madvise_args *uap;
{
	vm_offset_t start, end;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAXUSER_ADDRESS > 0 &&
	    ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS)
		return (EINVAL);
#ifndef i386
	if (VM_MIN_ADDRESS > 0 && (vm_offset_t) uap->addr < VM_MIN_ADDRESS)
		return (EINVAL);
#endif
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
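	/*
	 * Round the range outward so that the advice is applied to every
	 * page the request touches.
	 */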
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(&p->p_vmspace->vm_map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/* ARGSUSED */
int
mincore(p, uap)
	struct proc *p;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error;
	int vecindex, lastvecindex;
	register vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t) round_page(uap->len);
	if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS)
		return (EINVAL);
	if (end < addr)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	map = &p->p_vmspace->vm_map;
	pmap = vmspace_pmap(p->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (!mincoreinfo) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;

				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);
				m = vm_page_lookup(current->object.vm_object,
				    pindex);
				/*
				 * if the page is resident, then gather
				 * information about it.
				 */
				if (m) {
					mincoreinfo = MINCORE_INCORE;
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
				}
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);
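			/*
			 * Each byte written to the user's vector describes
			 * one page: MINCORE_INCORE marks the page resident;
			 * the MODIFIED/REFERENCED "OTHER" bits carry the
			 * state gathered from the pmap or the vm_page above.
			 */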
			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					return (EFAULT);
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				return (EFAULT);
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			return (EFAULT);
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);

	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
mlock(p, uap)
	struct proc *p;
	struct mlock_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

	if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (ENOMEM);
#else
	error = suser(p);
	if (error)
		return (error);
#endif

	error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr, addr + size, FALSE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
mlockall(p, uap)
	struct proc *p;
	struct mlockall_args *uap;
{
	/* Not yet implemented */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	int how;
};
#endif

int
munlockall(p, uap)
	struct proc *p;
	struct munlockall_args *uap;
{
	/* Not yet implemented */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
munlock(p, uap)
	struct proc *p;
	struct munlock_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

#ifndef pmap_wired_count
	error = suser(p);
	if (error)
		return (error);
#endif

	error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr, addr + size, TRUE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and System V shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
 */
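/*
 * A sketch of a typical internal call (illustrative only, not lifted
 * from an actual caller): mapping the first objsize bytes of a vnode
 * read-only at a kernel-chosen address might look like
 *
 *	vm_offset_t addr = 0;
 *	error = vm_mmap(&vms->vm_map, &addr, objsize, VM_PROT_READ,
 *	    VM_PROT_ALL, MAP_SHARED, (void *)vp, 0);
 */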
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	struct vnode *vp = NULL;
	objtype_t type;
	int rv = KERN_SUCCESS;
	vm_ooffset_t objsize;
	int docow;
	struct proc *p = curproc;

	if (size == 0)
		return (0);

	objsize = size = round_page(size);

	if (p->p_vmspace->vm_map.size + size >
	    p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
		return (ENOMEM);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
		(void) vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		type = OBJT_DEFAULT;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		vp = (struct vnode *) handle;
		if (vp->v_type == VCHR) {
			type = OBJT_DEVICE;
			handle = (void *)(intptr_t) vp->v_rdev;
		} else {
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat, p->p_ucred, p);
			if (error)
				return (error);
			objsize = round_page(vat.va_size);
			type = OBJT_VNODE;
			/*
			 * if it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
	}

	if (handle == NULL) {
		object = NULL;
		docow = 0;
	} else {
		object = vm_pager_allocate(type,
		    handle, objsize, prot, foff);
		if (object == NULL)
			return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
		docow = MAP_PREFAULT_PARTIAL;
	}

	/*
	 * Force device mappings to be shared.
	 */
	if (type == OBJT_DEVICE || type == OBJT_PHYS) {
		flags &= ~(MAP_PRIVATE|MAP_COPY);
		flags |= MAP_SHARED;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	if (fitit) {
		*addr = pmap_addr_hint(object, *addr, size);
	}

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot,
		    maxprot, docow);
	else
		rv = vm_map_find(map, object, foff, addr, size, fitit,
		    prot, maxprot, docow);

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
		goto out;
	}

	/*
	 * Shared memory is also shared with children.
	 */
	if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS) {
			(void) vm_map_remove(map, *addr, *addr + size);
			goto out;
		}
	}
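
	/*
	 * Translate the Mach-style KERN_* status from the VM layer into
	 * an errno for the caller.
	 */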
out:
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}