/*
 * (MPSAFE)
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 * $FreeBSD: src/sys/vm/vm_mmap.c,v 1.108.2.6 2002/07/02 20:06:19 dillon Exp $
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kern_syscall.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <sys/file2.h>
#include <sys/thread.h>
#include <sys/thread2.h>
#include <vm/vm_page2.h>

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
int vkernel_enable;
SYSCTL_INT(_vm, OID_AUTO, vkernel_enable, CTLFLAG_RW, &vkernel_enable, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel from running out of
 * resources if attacked from a compromised user account, but generous
 * enough that multi-threaded processes are not unduly inconvenienced.
 */

static void vmmapentry_rsrc_init (void *);
SYSINIT(vmmersrc, SI_BOOT1_POST, SI_ORDER_ANY, vmmapentry_rsrc_init, NULL);

static void
vmmapentry_rsrc_init(void *dummy)
{
	max_proc_mmap = KvaSize / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}
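
/*
 * Illustrative sizing (hypothetical figures, not measured): with
 * KvaSize = 512 MB and sizeof(struct vm_map_entry) = 128 bytes, the
 * computation above yields (512M / 128) / 100 = 41943 entries per
 * vmspace reference.  The effective value can be inspected or tuned
 * at runtime through the vm.max_proc_mmap sysctl declared above.
 */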

/*
 * MPSAFE
 */
int
sys_sbrk(struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * sstk_args(int incr)
 *
 * MPSAFE
 */
int
sys_sstk(struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * mmap_args(void *addr, size_t len, int prot, int flags, int fd,
 *	     long pad, off_t pos)
 *
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 *
 * No requirements
 */
int
kern_mmap(struct vmspace *vms, caddr_t uaddr, size_t ulen,
	  int uprot, int uflags, int fd, off_t upos, void **res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp = NULL;
	struct vnode *vp;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	off_t pos;
	vm_object_t obj;

	KKASSERT(p);

	addr = (vm_offset_t) uaddr;
	size = ulen;
	prot = uprot & VM_PROT_ALL;
	flags = uflags;
	pos = upos;

	/*
	 * Make sure mapping fits into numeric range etc.
	 *
	 * NOTE: We support the full unsigned range for size now.
	 */
	if (((flags & MAP_ANON) && (fd != -1 || pos != 0)))
		return (EINVAL);

	if (size == 0)
		return (EINVAL);

	if (flags & MAP_STACK) {
		if (fd != -1)
			return (EINVAL);
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Virtual page tables cannot be used with MAP_STACK.  Apart from
	 * it not making any sense, the aux union is used by both
	 * types.
	 *
	 * Because the virtual page table is stored in the backing object
	 * and might be updated by the kernel, the mapping must be R+W.
	 */
	if (flags & MAP_VPAGETABLE) {
		if (vkernel_enable == 0)
			return (EOPNOTSUPP);
		if (flags & MAP_STACK)
			return (EINVAL);
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return (EINVAL);
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;
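
	/*
	 * For example (illustrative values, 4 KiB pages, PAGE_MASK ==
	 * 0xfff): a file offset of 0x12345 yields pageoff = 0x345 and
	 * pos = 0x12000; the same 0x345 is folded back into the size
	 * below and into the address returned to the caller.
	 */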

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */
	if (size < ulen)			/* wrap */
		return(EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & (MAP_FIXED | MAP_TRYFIXED)) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/*
		 * Address range must be all in user VM space and not wrap.
		 */
		tmpaddr = addr + size;
		if (tmpaddr < addr)
			return (EINVAL);
		if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
			return (EINVAL);
		if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
			return (EINVAL);
	} else {
		/*
		 * Get a hint of where to map.  It also provides mmap offset
		 * randomization if enabled.
		 */
		addr = vm_map_hint(p, addr, prot);
	}

	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping a file; get fp for validation.  Obtain the vnode
		 * and make sure it is of appropriate type.
		 */
		fp = holdfp(p->p_fd, fd, -1);
		if (fp == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE) {
			error = EINVAL;
			goto done;
		}
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = (struct vnode *) fp->f_data;

		/*
		 * Validate the vnode for the operation.
		 */
		switch(vp->v_type) {
		case VREG:
			/*
			 * Get the proper underlying object
			 */
			if ((obj = vp->v_object) == NULL) {
				error = EINVAL;
				goto done;
			}
			KKASSERT((struct vnode *)obj->handle == vp);
			break;
		case VCHR:
			/*
			 * Make sure a device has not been revoked.
			 * Mappability is handled by the device layer.
			 */
			if (vp->v_rdev == NULL) {
				error = EBADF;
				goto done;
			}
			break;
		default:
			/*
			 * Nothing else is mappable.
			 */
			error = EINVAL;
			goto done;
		}

		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (ala
		 * SunOS).
		 */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			handle = NULL;
			maxprot = VM_PROT_ALL;
			flags |= MAP_ANON;
			pos = 0;
		} else {
			/*
			 * cdevs do not provide private mappings of any kind.
			 */
			if (vp->v_type == VCHR &&
			    (flags & (MAP_PRIVATE|MAP_COPY))) {
				error = EINVAL;
				goto done;
			}
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination?  What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;
			if (fp->f_flag & FREAD) {
				maxprot |= VM_PROT_READ;
			} else if (prot & PROT_READ) {
				error = EACCES;
				goto done;
			}
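			/*
			 * For example, a descriptor opened O_RDONLY has
			 * FREAD set but FWRITE clear, so maxprot becomes
			 * READ|EXECUTE here and a PROT_WRITE request on a
			 * shared mapping is refused with EACCES below.
			 */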
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out.  Check for superuser, only if
			 * we're at securelevel < 1, to allow the XIG X server
			 * to continue to work.
			 */
			if ((flags & MAP_SHARED) != 0 || vp->v_type == VCHR) {
				if ((fp->f_flag & FWRITE) != 0) {
					struct vattr va;
					if ((error = VOP_GETATTR(vp, &va))) {
						goto done;
					}
					if ((va.va_flags &
					    (IMMUTABLE|APPEND)) == 0) {
						maxprot |= VM_PROT_WRITE;
					} else if (prot & PROT_WRITE) {
						error = EPERM;
						goto done;
					}
				} else if ((prot & PROT_WRITE) != 0) {
					error = EACCES;
					goto done;
				}
			} else {
				maxprot |= VM_PROT_WRITE;
			}
			handle = (void *)vp;
		}
	}

	lwkt_gettoken(&vms->vm_map.token);

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vmspace_getrefs(vms)) {
		error = ENOMEM;
		lwkt_reltoken(&vms->vm_map.token);
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
			flags, handle, pos);
	if (error == 0)
		*res = (void *)(addr + pageoff);

	lwkt_reltoken(&vms->vm_map.token);
done:
	if (fp)
		fdrop(fp);

	return (error);
}

/*
 * mmap system call handler
 *
 * No requirements.
 */
int
sys_mmap(struct mmap_args *uap)
{
	int error;

	error = kern_mmap(curproc->p_vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->pos, &uap->sysmsg_resultp);

	return (error);
}
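
/*
 * Example (userland sketch, illustrative values): map 64 KiB of
 * anonymous memory.  MAP_ANON requires fd == -1 and pos == 0, as
 * enforced in kern_mmap() above.
 *
 *	void *p = mmap(NULL, 65536, PROT_READ | PROT_WRITE,
 *		       MAP_ANON | MAP_PRIVATE, -1, 0);
 *	if (p == MAP_FAILED)
 *		err(1, "mmap");
 */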

/*
 * msync system call handler
 *
 * msync_args(void *addr, size_t len, int flags)
 *
 * No requirements
 */
int
sys_msync(struct msync_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/*
	 * map->token serializes extracting the address range for size == 0
	 * msyncs with the vm_map_clean call; if the token were not held
	 * across the two calls, an intervening munmap/mmap pair, for example,
	 * could cause msync to occur on a wrong region.
	 */
	lwkt_gettoken(&map->token);

	/*
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we don't
	 * really keep track of individual mmaps so we approximate by flushing
	 * the range of the map entry containing addr.  This can be incorrect
	 * if the region splits or is coalesced with a neighbor.
	 */
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = vm_map_lookup_entry(map, addr, &entry);
		if (rv == FALSE) {
			vm_map_unlock_read(map);
			rv = KERN_INVALID_ADDRESS;
			goto done;
		}
		addr = entry->start;
		size = entry->end - entry->start;
		vm_map_unlock_read(map);
	}

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
			  (flags & MS_INVALIDATE) != 0);
done:
	lwkt_reltoken(&map->token);

	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * munmap system call handler
 *
 * munmap_args(void *addr, size_t len)
 *
 * No requirements
 */
int
sys_munmap(struct munmap_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/* map->token serializes between the map check and the actual unmap */
	lwkt_gettoken(&map->token);

	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size,
				     VM_PROT_NONE, FALSE)) {
		lwkt_reltoken(&map->token);
		return (EINVAL);
	}
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_remove(map, addr, addr + size);
	lwkt_reltoken(&map->token);
	return (0);
}

/*
 * mprotect_args(const void *addr, size_t len, int prot)
 *
 * No requirements.
 */
int
sys_mprotect(struct mprotect_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_prot_t prot;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size,
			       prot, FALSE)) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EACCES;
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}
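
/*
 * Example (userland sketch): make a page read-only.  The request is
 * rounded to page boundaries as above, and a protection exceeding the
 * entry's maximum protection fails with EACCES.
 *
 *	if (mprotect(p, 4096, PROT_READ) < 0)
 *		err(1, "mprotect");
 */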

/*
 * minherit system call handler
 *
 * minherit_args(void *addr, size_t len, int inherit)
 *
 * No requirements.
 */
int
sys_minherit(struct minherit_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	switch (vm_map_inherit(&p->p_vmspace->vm_map, addr,
			       addr + size, inherit)) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EACCES;
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

/*
 * madvise system call handler
 *
 * madvise_args(void *addr, size_t len, int behav)
 *
 * No requirements.
 */
int
sys_madvise(struct madvise_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav >= MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (tmpaddr < (vm_offset_t)uap->addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, 0);
	return (error);
}
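
/*
 * Example (userland sketch): advise that a region will be touched
 * sequentially, so the system may read ahead more aggressively.
 * The call is purely advisory, so a failure is rarely fatal.
 *
 *	madvise(p, len, MADV_SEQUENTIAL);
 */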

/*
 * mcontrol system call handler
 *
 * mcontrol_args(void *addr, size_t len, int behav, off_t value)
 *
 * No requirements
 */
int
sys_mcontrol(struct mcontrol_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (tmpaddr < (vm_offset_t) uap->addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, uap->value);
	return (error);
}


/*
 * mincore system call handler
 *
 * mincore_args(const void *addr, size_t len, char *vec)
 *
 * No requirements
 */
int
sys_mincore(struct mincore_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	if (end < addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && end > VM_MAX_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	map = &p->p_vmspace->vm_map;
	pmap = vmspace_pmap(p->p_vmspace);

	lwkt_gettoken(&map->token);
	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for(current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if (current->maptype != VM_MAPTYPE_NORMAL &&
		    current->maptype != VM_MAPTYPE_VPAGETABLE) {
			continue;
		}
		if (current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 *
			 * If we have to check the VM object, only mess
			 * around with normal maps.  Do not mess around
			 * with virtual page tables (XXX).
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (mincoreinfo == 0 &&
			    current->maptype == VM_MAPTYPE_NORMAL) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;

				/*
				 * calculate the page index into the object
				 */
				offset = current->offset +
					 (addr - current->start);
				pindex = OFF_TO_IDX(offset);

				/*
				 * if the page is resident, then gather
				 * information about it.  spl protection is
				 * required to maintain the object
				 * association.  And XXX what if the page is
				 * busy?  What's the deal with that?
				 *
				 * XXX vm_token - legacy for pmap_ts_referenced
				 * in i386 and vkernel pmap code.
				 */
				lwkt_gettoken(&vm_token);
				vm_object_hold(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
						   pindex);
				if (m && m->valid) {
					mincoreinfo = MINCORE_INCORE;
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
				}
				vm_object_drop(current->object.vm_object);
				lwkt_reltoken(&vm_token);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make
			 * sure that the byte vector is zeroed for those
			 * skipped entries.  Advance the index before the
			 * write so the zero lands on the skipped slot.
			 */
			while((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done;
			}

			/*
			 * If the map has changed, due to the subyte,
			 * the previous output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);

	error = 0;
done:
	lwkt_reltoken(&map->token);
	return (error);
}
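
/*
 * Example (userland sketch): query residency one byte per page.
 * vec must hold at least len / PAGE_SIZE bytes; each byte carries
 * the MINCORE_* flags filled in above.
 *
 *	char vec[16];
 *	if (mincore(p, 16 * 4096, vec) == 0 &&
 *	    (vec[0] & MINCORE_INCORE))
 *		printf("first page is resident\n");
 */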

/*
 * mlock system call handler
 *
 * mlock_args(const void *addr, size_t len)
 *
 * No requirements
 */
int
sys_mlock(struct mlock_args *uap)
{
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return (EINVAL);

	if (atop(size) + vmstats.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

	/*
	 * We do not need to synchronize against other threads updating ucred;
	 * they update p->ucred, which we ourselves synchronize into td_ucred.
	 */
#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) {
		return (ENOMEM);
	}
#else
	error = priv_check_cred(td->td_ucred, PRIV_ROOT, 0);
	if (error) {
		return (error);
	}
#endif
	/* new_pageable == FALSE: wire the range down */
	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, FALSE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
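
/*
 * Example (userland sketch): wire a single page so it cannot be
 * paged out; EAGAIN indicates the global wired-page limit checked
 * above would be exceeded.
 *
 *	if (mlock(p, 4096) < 0)
 *		err(1, "mlock");
 */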

/*
 * mlockall(int how)
 *
 * No requirements
 */
int
sys_mlockall(struct mlockall_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;
	int how = uap->how;
	int rc = KERN_SUCCESS;

	if (((how & MCL_CURRENT) == 0) && ((how & MCL_FUTURE) == 0))
		return (EINVAL);

	rc = priv_check_cred(td->td_ucred, PRIV_ROOT, 0);
	if (rc)
		return (rc);

	vm_map_lock(map);
	do {
		if (how & MCL_CURRENT) {
			/*
			 * MCL_CURRENT (wire all existing mappings) is not
			 * implemented yet; the entry walk below is a
			 * placeholder no-op.
			 */
			for(entry = map->header.next;
			    entry != &map->header;
			    entry = entry->next)
				;

			rc = ENOSYS;
			break;
		}

		if (how & MCL_FUTURE)
			map->flags |= MAP_WIREFUTURE;
	} while(0);
	vm_map_unlock(map);

	return (rc);
}

/*
 * munlockall(void)
 *
 *	Unwire all user-wired map entries, cancel MCL_FUTURE.
 *
 * No requirements
 */
int
sys_munlockall(struct munlockall_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;
	int rc = KERN_SUCCESS;

	vm_map_lock(map);

	/* Clear MAP_WIREFUTURE to cancel mlockall(MCL_FUTURE) */
	map->flags &= ~MAP_WIREFUTURE;

retry:
	for (entry = map->header.next;
	     entry != &map->header;
	     entry = entry->next) {
		if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
			continue;

		/*
		 * If we encounter an in-transition entry, we release the
		 * map lock and retry the scan; we do not decrement any
		 * wired_count more than once because we do not touch
		 * any entries with MAP_ENTRY_USER_WIRED not set.
		 *
		 * There is a potential interleaving with concurrent
		 * mlockall()s here -- if we abort a scan, an mlockall()
		 * could start, wire a number of entries before our
		 * current position, and then stall itself on this
		 * or any other in-transition entry.  If that occurs, when
		 * we resume, we will unwire those entries.
		 */
		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
			++mycpu->gd_cnt.v_intrans_coll;
			++mycpu->gd_cnt.v_intrans_wait;
			vm_map_transition_wait(map);
			goto retry;
		}

		KASSERT(entry->wired_count > 0,
			("wired_count was 0 with USER_WIRED set! %p", entry));

		/* Drop wired count, if it hits zero, unwire the entry */
		entry->eflags &= ~MAP_ENTRY_USER_WIRED;
		entry->wired_count--;
		if (entry->wired_count == 0)
			vm_fault_unwire(map, entry);
	}

	map->timestamp++;
	vm_map_unlock(map);

	return (rc);
}
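
/*
 * Example (userland sketch): request wiring of all future mappings,
 * then cancel it.  MCL_CURRENT would return ENOSYS here, as noted in
 * sys_mlockall() above.
 *
 *	if (mlockall(MCL_FUTURE) == 0) {
 *		...
 *		munlockall();
 *	}
 */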
%p", entry)); 1115 1116 /* Drop wired count, if it hits zero, unwire the entry */ 1117 entry->eflags &= ~MAP_ENTRY_USER_WIRED; 1118 entry->wired_count--; 1119 if (entry->wired_count == 0) 1120 vm_fault_unwire(map, entry); 1121 } 1122 1123 map->timestamp++; 1124 vm_map_unlock(map); 1125 1126 return (rc); 1127 } 1128 1129 /* 1130 * munlock system call handler 1131 * 1132 * munlock_args(const void *addr, size_t len) 1133 * 1134 * No requirements 1135 */ 1136 int 1137 sys_munlock(struct munlock_args *uap) 1138 { 1139 struct thread *td = curthread; 1140 struct proc *p = td->td_proc; 1141 vm_offset_t addr; 1142 vm_offset_t tmpaddr; 1143 vm_size_t size, pageoff; 1144 int error; 1145 1146 addr = (vm_offset_t) uap->addr; 1147 size = uap->len; 1148 1149 pageoff = (addr & PAGE_MASK); 1150 addr -= pageoff; 1151 size += pageoff; 1152 size = (vm_size_t) round_page(size); 1153 1154 tmpaddr = addr + size; 1155 if (tmpaddr < addr) /* wrap */ 1156 return (EINVAL); 1157 1158 #ifndef pmap_wired_count 1159 error = priv_check(td, PRIV_ROOT); 1160 if (error) 1161 return (error); 1162 #endif 1163 1164 error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, TRUE); 1165 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1166 } 1167 1168 /* 1169 * Internal version of mmap. 1170 * Currently used by mmap, exec, and sys5 shared memory. 1171 * Handle is either a vnode pointer or NULL for MAP_ANON. 1172 * 1173 * No requirements 1174 */ 1175 int 1176 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1177 vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff) 1178 { 1179 boolean_t fitit; 1180 vm_object_t object; 1181 vm_offset_t eaddr; 1182 vm_size_t esize; 1183 vm_size_t align; 1184 int (*uksmap)(cdev_t dev, vm_page_t fake); 1185 struct vnode *vp; 1186 struct thread *td = curthread; 1187 struct proc *p; 1188 int rv = KERN_SUCCESS; 1189 off_t objsize; 1190 int docow; 1191 int error; 1192 1193 if (size == 0) 1194 return (0); 1195 1196 objsize = round_page(size); 1197 if (objsize < size) 1198 return (EINVAL); 1199 size = objsize; 1200 1201 lwkt_gettoken(&map->token); 1202 1203 /* 1204 * XXX messy code, fixme 1205 * 1206 * NOTE: Overflow checks require discrete statements or GCC4 1207 * will optimize it out. 1208 */ 1209 if ((p = curproc) != NULL && map == &p->p_vmspace->vm_map) { 1210 esize = map->size + size; /* workaround gcc4 opt */ 1211 if (esize < map->size || 1212 esize > p->p_rlimit[RLIMIT_VMEM].rlim_cur) { 1213 lwkt_reltoken(&map->token); 1214 return(ENOMEM); 1215 } 1216 } 1217 1218 /* 1219 * We currently can only deal with page aligned file offsets. 1220 * The check is here rather than in the syscall because the 1221 * kernel calls this function internally for other mmaping 1222 * operations (such as in exec) and non-aligned offsets will 1223 * cause pmap inconsistencies...so we want to be sure to 1224 * disallow this in all cases. 1225 * 1226 * NOTE: Overflow checks require discrete statements or GCC4 1227 * will optimize it out. 1228 */ 1229 if (foff & PAGE_MASK) { 1230 lwkt_reltoken(&map->token); 1231 return (EINVAL); 1232 } 1233 1234 /* 1235 * Handle alignment. For large memory maps it is possible 1236 * that the MMU can optimize the page table so align anything 1237 * that is a multiple of SEG_SIZE to SEG_SIZE. 1238 * 1239 * Also align any large mapping (bigger than 16x SG_SIZE) to a 1240 * SEG_SIZE address boundary. 

/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and sys5 shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
 *
 * No requirements
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	vm_offset_t eaddr;
	vm_size_t esize;
	vm_size_t align;
	int (*uksmap)(cdev_t dev, vm_page_t fake);
	struct vnode *vp;
	struct thread *td = curthread;
	struct proc *p;
	int rv = KERN_SUCCESS;
	off_t objsize;
	int docow;
	int error;

	if (size == 0)
		return (0);

	objsize = round_page(size);
	if (objsize < size)
		return (EINVAL);
	size = objsize;

	lwkt_gettoken(&map->token);

	/*
	 * XXX messy code, fixme
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 *	 will optimize it out.
	 */
	if ((p = curproc) != NULL && map == &p->p_vmspace->vm_map) {
		esize = map->size + size;	/* workaround gcc4 opt */
		if (esize < map->size ||
		    esize > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
			lwkt_reltoken(&map->token);
			return(ENOMEM);
		}
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 *	 will optimize it out.
	 */
	if (foff & PAGE_MASK) {
		lwkt_reltoken(&map->token);
		return (EINVAL);
	}

	/*
	 * Handle alignment.  For large memory maps it is possible
	 * that the MMU can optimize the page table so align anything
	 * that is a multiple of SEG_SIZE to SEG_SIZE.
	 *
	 * Also align any large mapping (bigger than 16x SEG_SIZE) to a
	 * SEG_SIZE address boundary.
	 */
	if (flags & MAP_SIZEALIGN) {
		align = size;
		/* the alignment must be a power of 2 */
		if ((align ^ (align - 1)) != (align << 1) - 1) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
	} else if ((flags & MAP_FIXED) == 0 &&
		   ((size & SEG_MASK) == 0 || size > SEG_SIZE * 16)) {
		align = SEG_SIZE;
	} else {
		align = PAGE_SIZE;
	}
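
	/*
	 * Illustrative values for the MAP_SIZEALIGN test above: for a
	 * power-of-2 size such as 0x10000, align ^ (align - 1) ==
	 * 0x1ffff == (align << 1) - 1, so it is accepted; a size of
	 * 0x18000 yields 0xffff != 0x2ffff and is rejected.
	 */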

	if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr)) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
		eaddr = *addr + size;
		if (eaddr < *addr) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
		fitit = FALSE;
		if ((flags & MAP_TRYFIXED) == 0)
			vm_map_remove(map, *addr, *addr + size);
	}

	uksmap = NULL;

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle) {
			/*
			 * Default memory object
			 */
			object = default_pager_alloc(handle, objsize,
						     prot, foff);
			if (object == NULL) {
				lwkt_reltoken(&map->token);
				return(ENOMEM);
			}
			docow = MAP_PREFAULT_PARTIAL;
		} else {
			/*
			 * Implicit single instance of a default memory
			 * object, so we don't need a VM object yet.
			 */
			foff = 0;
			object = NULL;
			docow = 0;
		}
		vp = NULL;
	} else {
		vp = (struct vnode *)handle;

		/*
		 * Non-anonymous mappings of VCHR (aka not /dev/zero)
		 * cannot specify MAP_STACK or MAP_VPAGETABLE.
		 */
		if (vp->v_type == VCHR) {
			if (flags & (MAP_STACK | MAP_VPAGETABLE)) {
				lwkt_reltoken(&map->token);
				return(EINVAL);
			}
		}

		if (vp->v_type == VCHR && vp->v_rdev->si_ops->d_uksmap) {
			/*
			 * Device mappings without a VM object, typically
			 * sharing permanently allocated kernel memory or
			 * process-context-specific (per-process) data.
			 *
			 * Force them to be shared.
			 */
			uksmap = vp->v_rdev->si_ops->d_uksmap;
			object = NULL;
			docow = MAP_PREFAULT_PARTIAL;
			flags &= ~(MAP_PRIVATE|MAP_COPY);
			flags |= MAP_SHARED;
		} else if (vp->v_type == VCHR) {
			/*
			 * Device mappings (device size unknown?).
			 * Force them to be shared.
			 */
			error = dev_dmmap_single(vp->v_rdev, &foff, objsize,
						 &object, prot, NULL);

			if (error == ENODEV) {
				handle = (void *)(intptr_t)vp->v_rdev;
				object = dev_pager_alloc(handle, objsize,
							 prot, foff);
				if (object == NULL) {
					lwkt_reltoken(&map->token);
					return(EINVAL);
				}
			} else if (error) {
				lwkt_reltoken(&map->token);
				return(error);
			}

			docow = MAP_PREFAULT_PARTIAL;
			flags &= ~(MAP_PRIVATE|MAP_COPY);
			flags |= MAP_SHARED;
		} else {
			/*
			 * Regular file mapping (typically).  The attribute
			 * check is for the link count test only.  mmapable
			 * vnodes must already have a VM object assigned.
			 */
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat);
			if (error) {
				lwkt_reltoken(&map->token);
				return (error);
			}
			docow = MAP_PREFAULT_PARTIAL;
			object = vnode_pager_reference(vp);
			if (object == NULL && vp->v_type == VREG) {
				lwkt_reltoken(&map->token);
				kprintf("Warning: cannot mmap vnode %p, no "
					"object\n", vp);
				return(EINVAL);
			}

			/*
			 * If it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
	}

	/*
	 * Deal with the adjusted flags
	 */
	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

	/*
	 * This may place the area in its own page directory if (size) is
	 * large enough, otherwise it typically returns its argument.
	 *
	 * (object can be NULL)
	 */
	if (fitit) {
		*addr = pmap_addr_hint(object, *addr, size);
	}

	/*
	 * Stack mappings need special attention.
	 *
	 * Mappings that use virtual page tables will default to storing
	 * the page table at offset 0.
	 */
	if (uksmap) {
		rv = vm_map_find(map, uksmap, vp->v_rdev,
				 foff, addr, size,
				 align, fitit,
				 VM_MAPTYPE_UKSMAP, VM_SUBSYS_MMAP,
				 prot, maxprot, docow);
	} else if (flags & MAP_STACK) {
		rv = vm_map_stack(map, *addr, size, flags,
				  prot, maxprot, docow);
	} else if (flags & MAP_VPAGETABLE) {
		rv = vm_map_find(map, object, NULL,
				 foff, addr, size,
				 align, fitit,
				 VM_MAPTYPE_VPAGETABLE, VM_SUBSYS_MMAP,
				 prot, maxprot, docow);
	} else {
		rv = vm_map_find(map, object, NULL,
				 foff, addr, size,
				 align, fitit,
				 VM_MAPTYPE_NORMAL, VM_SUBSYS_MMAP,
				 prot, maxprot, docow);
	}

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 *
		 * (NOTE: object can be NULL)
		 */
		vm_object_deallocate(object);
		goto out;
	}

	/*
	 * Shared memory is also shared with children.
	 */
	if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size,
				    VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS) {
			vm_map_remove(map, *addr, *addr + size);
			goto out;
		}
	}

	/* If a process has marked all future mappings for wiring, do so */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_unwire(map, *addr, *addr + size, FALSE);

	/*
	 * Set the access time on the vnode
	 */
	if (vp != NULL)
		vn_mark_atime(vp, td);
out:
	lwkt_reltoken(&map->token);

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
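
/*
 * Example (kernel-internal sketch): a caller holding a Mach VM status
 * from the map routines can funnel it through vm_mmap_to_errno()
 * instead of open-coding the translation above, e.g.
 *
 *	rv = vm_map_find(map, object, NULL, foff, &addr, size,
 *			 align, fitit, VM_MAPTYPE_NORMAL, VM_SUBSYS_MMAP,
 *			 prot, maxprot, docow);
 *	return (vm_mmap_to_errno(rv));
 */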