/*
 * (MPSAFE)
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 * $FreeBSD: src/sys/vm/vm_mmap.c,v 1.108.2.6 2002/07/02 20:06:19 dillon Exp $
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kern_syscall.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <sys/file2.h>
#include <sys/thread.h>
#include <sys/thread2.h>
#include <vm/vm_page2.h>

static int max_proc_mmap = 1000000;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
int vkernel_enable;
SYSCTL_INT(_vm, OID_AUTO, vkernel_enable, CTLFLAG_RW, &vkernel_enable, 0, "");

/*
 * MPSAFE
 */
int
sys_sbrk(struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * sstk_args(int incr)
 *
 * MPSAFE
 */
int
sys_sstk(struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * mmap_args(void *addr, size_t len, int prot, int flags, int fd,
 *	     long pad, off_t pos)
 *
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 *
 * No requirements
 */
int
kern_mmap(struct vmspace *vms, caddr_t uaddr, size_t ulen,
	  int uprot, int uflags, int fd, off_t upos, void **res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp = NULL;
	struct vnode *vp;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	off_t pos;
	vm_object_t obj;

	KKASSERT(p);

	addr = (vm_offset_t) uaddr;
	size = ulen;
	prot = uprot & VM_PROT_ALL;
	flags = uflags;
	pos = upos;

	/*
	 * Make sure mapping fits into numeric range etc.
	 *
	 * NOTE: We support the full unsigned range for size now.
	 */
	if ((flags & MAP_ANON) && (fd != -1 || pos != 0))
		return (EINVAL);

	if (size == 0)
		return (EINVAL);

	if (flags & MAP_STACK) {
		if (fd != -1)
			return (EINVAL);
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Virtual page tables cannot be used with MAP_STACK.  Apart from
	 * it not making any sense, the aux union is used by both
	 * types.
	 *
	 * Because the virtual page table is stored in the backing object
	 * and might be updated by the kernel, the mapping must be R+W.
	 */
	if (flags & MAP_VPAGETABLE) {
		if (vkernel_enable == 0)
			return (EOPNOTSUPP);
		if (flags & MAP_STACK)
			return (EINVAL);
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return (EINVAL);
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */
	if (size < ulen)			/* wrap */
		return(EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & (MAP_FIXED | MAP_TRYFIXED)) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/*
		 * Address range must be all in user VM space and not wrap.
		 */
		tmpaddr = addr + size;
		if (tmpaddr < addr)
			return (EINVAL);
		if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
			return (EINVAL);
		if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
			return (EINVAL);
	} else {
		/*
		 * Get a hint of where to map.  It also provides mmap offset
		 * randomization if enabled.
		 */
		addr = vm_map_hint(p, addr, prot);
	}

	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and make
		 * sure it is of appropriate type.
		 */
		fp = holdfp(td, fd, -1);
		if (fp == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE) {
			error = EINVAL;
			goto done;
		}
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = (struct vnode *) fp->f_data;

		/*
		 * Validate the vnode for the operation.
		 */
		switch (vp->v_type) {
		case VREG:
			/*
			 * Get the proper underlying object
			 */
			if ((obj = vp->v_object) == NULL) {
				error = EINVAL;
				goto done;
			}
			KKASSERT((struct vnode *)obj->handle == vp);
			break;
		case VCHR:
			/*
			 * Make sure a device has not been revoked.
			 * Mappability is handled by the device layer.
			 */
			if (vp->v_rdev == NULL) {
				error = EBADF;
				goto done;
			}
			break;
		default:
			/*
			 * Nothing else is mappable.
			 */
			error = EINVAL;
			goto done;
		}

		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (ala
		 * SunOS).
		 */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			handle = NULL;
			maxprot = VM_PROT_ALL;
			flags |= MAP_ANON;
			pos = 0;
		} else {
			/*
			 * cdevs do not provide private mappings of any kind.
			 */
			if (vp->v_type == VCHR &&
			    (flags & (MAP_PRIVATE|MAP_COPY))) {
				error = EINVAL;
				goto done;
			}
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination?  What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;
			if (fp->f_flag & FREAD) {
				maxprot |= VM_PROT_READ;
			} else if (prot & PROT_READ) {
				error = EACCES;
				goto done;
			}
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out.  Check for superuser, only if
			 * we're at securelevel < 1, to allow the XIG X server
			 * to continue to work.
			 *
			 * PROT_WRITE + MAP_SHARED
			 */
			if ((flags & MAP_SHARED) != 0 || vp->v_type == VCHR) {
				if ((fp->f_flag & FWRITE) != 0) {
					struct vattr va;
					if ((error = VOP_GETATTR(vp, &va))) {
						goto done;
					}
					if ((va.va_flags &
					     (IMMUTABLE|APPEND)) == 0) {
						maxprot |= VM_PROT_WRITE;

						/*
						 * SHARED+RW file mmap()
						 * updates v_lastwrite_ts.
						 */
						if ((prot & PROT_WRITE) &&
						    vn_lock(vp, LK_EXCLUSIVE | LK_RETRY) == 0) {
							vfs_timestamp(&vp->v_lastwrite_ts);
							vsetflags(vp, VLASTWRITETS);
							vn_unlock(vp);
						}
					} else if (prot & PROT_WRITE) {
						error = EPERM;
						goto done;
					}
				} else if ((prot & PROT_WRITE) != 0) {
					error = EACCES;
					goto done;
				}
			} else {
				maxprot |= VM_PROT_WRITE;
			}
			handle = (void *)vp;
		}
	}

	lwkt_gettoken(&vms->vm_map.token);

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  0 to disable.
	 */
	if (max_proc_mmap && vms->vm_map.nentries >= max_proc_mmap) {
		error = ENOMEM;
		lwkt_reltoken(&vms->vm_map.token);
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
			flags, handle, pos);
	if (error == 0)
		*res = (void *)(addr + pageoff);

	lwkt_reltoken(&vms->vm_map.token);
done:
	if (fp)
		dropfp(td, fd, fp);

	return (error);
}
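
/*
 * Illustrative userland sketch (comment only, not compiled): since only
 * MAP_FIXED/MAP_TRYFIXED requests must be pre-aligned, a plain MAP_SHARED
 * mapping may pass a file offset that is not page aligned; kern_mmap()
 * maps from the page base and returns a pointer bumped by the page
 * offset, as described in the function comment above.  The fd and sizes
 * here are assumptions of the example.
 *
 *	#include <sys/mman.h>
 *
 *	off_t pos = 100;			// deliberately not page aligned
 *	char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, pos);
 *	if (p != MAP_FAILED) {
 *		// p[0] is the byte at file offset 100; the mapping
 *		// itself begins at the PAGE_SIZE boundary below p.
 *	}
 */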

/*
 * mmap system call handler
 *
 * No requirements.
 */
int
sys_mmap(struct mmap_args *uap)
{
	int error;

	error = kern_mmap(curproc->p_vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->pos, &uap->sysmsg_resultp);

	return (error);
}

/*
 * msync system call handler
 *
 * msync_args(void *addr, size_t len, int flags)
 *
 * No requirements
 */
int
sys_msync(struct msync_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/*
	 * map->token serializes extracting the address range for size == 0
	 * msyncs with the vm_map_clean call; if the token were not held
	 * across the two calls, an intervening munmap/mmap pair, for example,
	 * could cause msync to occur on a wrong region.
	 */
	lwkt_gettoken(&map->token);

	/*
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we don't
	 * really keep track of individual mmaps so we approximate by flushing
	 * the range of the map entry containing addr.  This can be incorrect
	 * if the region splits or is coalesced with a neighbor.
	 */
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = vm_map_lookup_entry(map, addr, &entry);
		if (rv == FALSE) {
			vm_map_unlock_read(map);
			rv = KERN_INVALID_ADDRESS;
			goto done;
		}
		addr = entry->start;
		size = entry->end - entry->start;
		vm_map_unlock_read(map);
	}

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
			  (flags & MS_INVALIDATE) != 0);
done:
	lwkt_reltoken(&map->token);

	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}

	return (0);
}
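
/*
 * Illustrative userland sketch (comment only, not compiled): the
 * MS_ASYNC|MS_INVALIDATE combination is rejected above, so callers pick
 * one mode per call.  "p" and "len" are assumptions of the example;
 * err() is from <err.h>.
 *
 *	#include <sys/mman.h>
 *
 *	if (msync(p, len, MS_SYNC) == -1)	// wait for the flush
 *		err(1, "msync");
 *	(void)msync(p, len, MS_ASYNC);		// just start the flush
 *	// msync(p, len, MS_ASYNC | MS_INVALIDATE) fails with EINVAL
 */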

/*
 * munmap system call handler
 *
 * munmap_args(void *addr, size_t len)
 *
 * No requirements
 */
int
sys_munmap(struct munmap_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/* map->token serializes between the map check and the actual unmap */
	lwkt_gettoken(&map->token);

	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size,
				     VM_PROT_NONE, FALSE)) {
		lwkt_reltoken(&map->token);
		return (EINVAL);
	}
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_remove(map, addr, addr + size);
	lwkt_reltoken(&map->token);
	return (0);
}

/*
 * mprotect_args(const void *addr, size_t len, int prot)
 *
 * No requirements.
 */
int
sys_mprotect(struct mprotect_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_prot_t prot;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size,
			       prot, FALSE)) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EACCES;
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

/*
 * minherit system call handler
 *
 * minherit_args(void *addr, size_t len, int inherit)
 *
 * No requirements.
 */
int
sys_minherit(struct minherit_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	switch (vm_map_inherit(&p->p_vmspace->vm_map, addr,
			       addr + size, inherit)) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EACCES;
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}
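
/*
 * Illustrative userland sketch (comment only, not compiled): a common
 * W^X sequence.  sys_mprotect() masks the request with VM_PROT_ALL,
 * page-rounds the range, and reports EACCES when the new protection
 * exceeds the mapping's maxprot.
 *
 *	#include <sys/mman.h>
 *
 *	char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			 MAP_ANON | MAP_PRIVATE, -1, 0);
 *	// ... write code bytes into buf ...
 *	if (mprotect(buf, 4096, PROT_READ | PROT_EXEC) == -1)
 *		err(1, "mprotect");
 */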

/*
 * madvise system call handler
 *
 * madvise_args(void *addr, size_t len, int behav)
 *
 * No requirements.
 */
int
sys_madvise(struct madvise_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav >= MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (tmpaddr < (vm_offset_t)uap->addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, 0);
	return (error);
}

/*
 * mcontrol system call handler
 *
 * mcontrol_args(void *addr, size_t len, int behav, off_t value)
 *
 * No requirements
 */
int
sys_mcontrol(struct mcontrol_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (tmpaddr < (vm_offset_t) uap->addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, uap->value);
	return (error);
}
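
/*
 * Illustrative userland sketch (comment only, not compiled): madvise()
 * is advisory, so the kernel widens the range conservatively
 * (trunc_page/round_page above) and callers typically ignore the return
 * value.  "p" and "len" are assumptions of the example.
 *
 *	#include <sys/mman.h>
 *
 *	(void)madvise(p, len, MADV_WILLNEED);	// expect access soon
 *	(void)madvise(p, len, MADV_DONTNEED);	// not needed for a while
 */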

/*
 * mincore system call handler
 *
 * mincore_args(const void *addr, size_t len, char *vec)
 *
 * No requirements
 */
int
sys_mincore(struct mincore_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	if (end < addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && end > VM_MAX_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	map = &p->p_vmspace->vm_map;
	pmap = vmspace_pmap(p->p_vmspace);

	lwkt_gettoken(&map->token);
	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	     (current != &map->header) && (current->start < end);
	     current = current->next) {
		/*
		 * ignore submaps (for now) or null objects
		 */
		if (current->maptype != VM_MAPTYPE_NORMAL &&
		    current->maptype != VM_MAPTYPE_VPAGETABLE) {
			continue;
		}
		if (current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 *
			 * If we have to check the VM object, only mess
			 * around with normal maps.  Do not mess around
			 * with virtual page tables (XXX).
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (mincoreinfo == 0 &&
			    current->maptype == VM_MAPTYPE_NORMAL) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;

				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);

				/*
				 * if the page is resident, then gather
				 * information about it.  spl protection is
				 * required to maintain the object
				 * association.  And XXX what if the page is
				 * busy?  What's the deal with that?
				 *
				 * XXX vm_token - legacy for pmap_ts_referenced
				 *     in x86 and vkernel pmap code.
				 */
				lwkt_gettoken(&vm_token);
				vm_object_hold(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
						   pindex);
				if (m && m->valid) {
					mincoreinfo = MINCORE_INCORE;
					if (m->dirty || pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
				}
				vm_object_drop(current->object.vm_object);
				lwkt_reltoken(&vm_token);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make
			 * sure that the byte vector is zeroed for those
			 * skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done;
			}

			/*
			 * If the map has changed, due to the subyte,
			 * the previous output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);

	error = 0;
done:
	lwkt_reltoken(&map->token);
	return (error);
}
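
/*
 * Illustrative userland sketch (comment only, not compiled): mincore()
 * fills one status byte per page of the queried range; bytes for
 * unmapped gaps are zeroed and resident pages carry MINCORE_* bits.
 * "p", "len" and "pagesize" are assumptions of the example.
 *
 *	#include <sys/mman.h>
 *	#include <stdlib.h>
 *
 *	size_t npages = (len + pagesize - 1) / pagesize;
 *	char *vec = malloc(npages);
 *	if (vec != NULL && mincore(p, len, vec) == 0 &&
 *	    (vec[0] & MINCORE_INCORE)) {
 *		// first page is resident
 *	}
 */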

/*
 * mlock system call handler
 *
 * mlock_args(const void *addr, size_t len)
 *
 * No requirements
 */
int
sys_mlock(struct mlock_args *uap)
{
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return (EINVAL);

	if (atop(size) + vmstats.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

	/*
	 * We do not need to synchronize against other threads updating ucred;
	 * they update p->ucred, which we synchronize into td_ucred ourselves.
	 */
#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) {
		return (ENOMEM);
	}
#else
	error = priv_check_cred(td->td_ucred, PRIV_ROOT, 0);
	if (error) {
		return (error);
	}
#endif
	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, FALSE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * mlockall(int how)
 *
 * No requirements
 */
int
sys_mlockall(struct mlockall_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;
	int how = uap->how;
	int rc = KERN_SUCCESS;

	if (((how & MCL_CURRENT) == 0) && ((how & MCL_FUTURE) == 0))
		return (EINVAL);

	rc = priv_check_cred(td->td_ucred, PRIV_ROOT, 0);
	if (rc)
		return (rc);

	vm_map_lock(map);
	do {
		if (how & MCL_CURRENT) {
			for (entry = map->header.next;
			     entry != &map->header;
			     entry = entry->next)
				;

			rc = ENOSYS;
			break;
		}

		if (how & MCL_FUTURE)
			map->flags |= MAP_WIREFUTURE;
	} while(0);
	vm_map_unlock(map);

	return (rc);
}
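
/*
 * Illustrative userland sketch (comment only, not compiled): wired
 * pages are charged against RLIMIT_MEMLOCK (or require PRIV_ROOT on
 * configurations without pmap_wired_count), and sys_mlock() returns
 * EAGAIN once atop(size) would push v_wire_count past
 * vm_page_max_wired.  "keybuf" and "keylen" are assumptions.
 *
 *	#include <sys/mman.h>
 *
 *	if (mlock(keybuf, keylen) == -1)	// e.g. pin key material
 *		err(1, "mlock");
 *	// ... use the wired pages ...
 *	(void)munlock(keybuf, keylen);
 */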

/*
 * munlockall(void)
 *
 * Unwire all user-wired map entries, cancel MCL_FUTURE.
 *
 * No requirements
 */
int
sys_munlockall(struct munlockall_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;
	int rc = KERN_SUCCESS;

	vm_map_lock(map);

	/* Clear MAP_WIREFUTURE to cancel mlockall(MCL_FUTURE) */
	map->flags &= ~MAP_WIREFUTURE;

retry:
	for (entry = map->header.next;
	     entry != &map->header;
	     entry = entry->next) {
		if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
			continue;

		/*
		 * If we encounter an in-transition entry, we release the
		 * map lock and retry the scan; we do not decrement any
		 * wired_count more than once because we do not touch
		 * any entries with MAP_ENTRY_USER_WIRED not set.
		 *
		 * There is a potential interleaving with concurrent
		 * mlockall()s here -- if we abort a scan, an mlockall()
		 * could start, wire a number of entries before our
		 * current position in the scan, and then stall itself on
		 * this or any other in-transition entry.  If that occurs,
		 * when we resume, we will unwire those entries.
		 */
		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
			++mycpu->gd_cnt.v_intrans_coll;
			++mycpu->gd_cnt.v_intrans_wait;
			vm_map_transition_wait(map, 1);
			goto retry;
		}

		KASSERT(entry->wired_count > 0,
			("wired_count was 0 with USER_WIRED set! %p", entry));

		/* Drop wired count, if it hits zero, unwire the entry */
		entry->eflags &= ~MAP_ENTRY_USER_WIRED;
		entry->wired_count--;
		if (entry->wired_count == 0)
			vm_fault_unwire(map, entry);
	}

	vm_map_unlock(map);

	return (rc);
}

/*
 * munlock system call handler
 *
 * munlock_args(const void *addr, size_t len)
 *
 * No requirements
 */
int
sys_munlock(struct munlock_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	tmpaddr = addr + size;
	if (tmpaddr < addr)		/* wrap */
		return (EINVAL);

#ifndef pmap_wired_count
	error = priv_check(td, PRIV_ROOT);
	if (error)
		return (error);
#endif

	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, TRUE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
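
/*
 * Illustrative userland sketch (comment only, not compiled):
 * mlockall(MCL_CURRENT) is unimplemented above (ENOSYS), but MCL_FUTURE
 * sets MAP_WIREFUTURE so vm_mmap() wires each new mapping; munlockall()
 * clears the flag and unwires user-wired entries.  "len" is an
 * assumption of the example.
 *
 *	#include <sys/mman.h>
 *
 *	if (mlockall(MCL_FUTURE) == -1)
 *		err(1, "mlockall");
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_ANON | MAP_PRIVATE, -1, 0);	// wired at creation
 *	(void)munlockall();
 */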

/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and sys5 shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
 *
 * No requirements
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	vm_offset_t eaddr;
	vm_size_t esize;
	vm_size_t align;
	int (*uksmap)(cdev_t dev, vm_page_t fake);
	struct vnode *vp;
	struct thread *td = curthread;
	struct proc *p;
	int rv = KERN_SUCCESS;
	off_t objsize;
	int docow;
	int error;

	if (size == 0)
		return (0);

	objsize = round_page(size);
	if (objsize < size)
		return (EINVAL);
	size = objsize;

	lwkt_gettoken(&map->token);

	/*
	 * XXX messy code, fixme
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 *	 will optimize it out.
	 */
	if ((p = curproc) != NULL && map == &p->p_vmspace->vm_map) {
		esize = map->size + size;	/* workaround gcc4 opt */
		if (esize < map->size ||
		    esize > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
			lwkt_reltoken(&map->token);
			return(ENOMEM);
		}
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 *	 will optimize it out.
	 */
	if (foff & PAGE_MASK) {
		lwkt_reltoken(&map->token);
		return (EINVAL);
	}

	/*
	 * Handle alignment.  For large memory maps it is possible
	 * that the MMU can optimize the page table so align anything
	 * that is a multiple of SEG_SIZE to SEG_SIZE.
	 *
	 * Also align any large mapping (bigger than 16x SEG_SIZE) to a
	 * SEG_SIZE address boundary.
	 */
	if (flags & MAP_SIZEALIGN) {
		align = size;
		if ((align ^ (align - 1)) != (align << 1) - 1) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
	} else if ((flags & MAP_FIXED) == 0 &&
		   ((size & SEG_MASK) == 0 || size > SEG_SIZE * 16)) {
		align = SEG_SIZE;
	} else {
		align = PAGE_SIZE;
	}

	if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr)) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
		eaddr = *addr + size;
		if (eaddr < *addr) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
		fitit = FALSE;
		if ((flags & MAP_TRYFIXED) == 0)
			vm_map_remove(map, *addr, *addr + size);
	}

	uksmap = NULL;

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle) {
			/*
			 * Default memory object
			 */
			object = default_pager_alloc(handle, objsize,
						     prot, foff);
			if (object == NULL) {
				lwkt_reltoken(&map->token);
				return(ENOMEM);
			}
			docow = MAP_PREFAULT_PARTIAL;
		} else {
			/*
			 * Implicit single instance of a default memory
			 * object, so we don't need a VM object yet.
			 */
			foff = 0;
			object = NULL;
			docow = 0;
		}
		vp = NULL;
	} else {
		vp = (struct vnode *)handle;

		/*
		 * Non-anonymous mappings of VCHR (aka not /dev/zero)
		 * cannot specify MAP_STACK or MAP_VPAGETABLE.
		 */
		if (vp->v_type == VCHR) {
			if (flags & (MAP_STACK | MAP_VPAGETABLE)) {
				lwkt_reltoken(&map->token);
				return(EINVAL);
			}
		}

		if (vp->v_type == VCHR && vp->v_rdev->si_ops->d_uksmap) {
			/*
			 * Device mappings without a VM object, typically
			 * sharing permanently allocated kernel memory or
			 * process-context-specific (per-process) data.
			 *
			 * Force them to be shared.
			 */
			uksmap = vp->v_rdev->si_ops->d_uksmap;
			object = NULL;
			docow = MAP_PREFAULT_PARTIAL;
			flags &= ~(MAP_PRIVATE|MAP_COPY);
			flags |= MAP_SHARED;
		} else if (vp->v_type == VCHR) {
			/*
			 * Device mappings (device size unknown?).
			 * Force them to be shared.
			 */
			error = dev_dmmap_single(vp->v_rdev, &foff, objsize,
						 &object, prot, NULL);

			if (error == ENODEV) {
				handle = (void *)(intptr_t)vp->v_rdev;
				object = dev_pager_alloc(handle, objsize,
							 prot, foff);
				if (object == NULL) {
					lwkt_reltoken(&map->token);
					return(EINVAL);
				}
			} else if (error) {
				lwkt_reltoken(&map->token);
				return(error);
			}

			docow = MAP_PREFAULT_PARTIAL;
			flags &= ~(MAP_PRIVATE|MAP_COPY);
			flags |= MAP_SHARED;
		} else {
			/*
			 * Regular file mapping (typically).  The attribute
			 * check is for the link count test only.  mmapable
			 * vnodes must already have a VM object assigned.
			 */
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat);
			if (error) {
				lwkt_reltoken(&map->token);
				return (error);
			}
			docow = MAP_PREFAULT_PARTIAL;
			object = vnode_pager_reference(vp);
			if (object == NULL && vp->v_type == VREG) {
				lwkt_reltoken(&map->token);
				kprintf("Warning: cannot mmap vnode %p, no "
					"object\n", vp);
				return(EINVAL);
			}

			/*
			 * If it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
	}

	/*
	 * Deal with the adjusted flags
	 */
	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

	/*
	 * This may place the area in its own page directory if (size) is
	 * large enough, otherwise it typically returns its argument.
	 *
	 * (object can be NULL)
	 */
	if (fitit) {
		*addr = pmap_addr_hint(object, *addr, size);
	}

	/*
	 * Stack mappings need special attention.
	 *
	 * Mappings that use virtual page tables will default to storing
	 * the page table at offset 0.
	 */
	if (uksmap) {
		rv = vm_map_find(map, uksmap, vp->v_rdev,
				 foff, addr, size,
				 align, fitit,
				 VM_MAPTYPE_UKSMAP, VM_SUBSYS_MMAP,
				 prot, maxprot, docow);
	} else if (flags & MAP_STACK) {
		rv = vm_map_stack(map, *addr, size, flags,
				  prot, maxprot, docow);
	} else if (flags & MAP_VPAGETABLE) {
		rv = vm_map_find(map, object, NULL,
				 foff, addr, size,
				 align, fitit,
				 VM_MAPTYPE_VPAGETABLE, VM_SUBSYS_MMAP,
				 prot, maxprot, docow);
	} else {
		rv = vm_map_find(map, object, NULL,
				 foff, addr, size,
				 align, fitit,
				 VM_MAPTYPE_NORMAL, VM_SUBSYS_MMAP,
				 prot, maxprot, docow);
	}

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 *
		 * (NOTE: object can be NULL)
		 */
		vm_object_deallocate(object);
		goto out;
	}

	/*
	 * Shared memory is also shared with children.
	 */
	if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS) {
			vm_map_remove(map, *addr, *addr + size);
			goto out;
		}
	}

	/* If a process has marked all future mappings for wiring, do so */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_unwire(map, *addr, *addr + size, FALSE);

	/*
	 * Set the access time on the vnode
	 */
	if (vp != NULL)
		vn_mark_atime(vp, td);
out:
	lwkt_reltoken(&map->token);

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
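
/*
 * Illustrative kernel-side sketch (comment only, not compiled): a caller
 * that drives vm_map_find() directly can translate the Mach-style status
 * the same way vm_mmap() does above.  The local variables are
 * assumptions of the example.
 *
 *	int rv, error;
 *
 *	rv = vm_map_find(map, object, NULL, foff, &addr, size,
 *			 PAGE_SIZE, TRUE,
 *			 VM_MAPTYPE_NORMAL, VM_SUBSYS_MMAP,
 *			 prot, maxprot, 0);
 *	error = vm_mmap_to_errno(rv);
 */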