/*
 * (MPSAFE)
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 * $FreeBSD: src/sys/vm/vm_mmap.c,v 1.108.2.6 2002/07/02 20:06:19 dillon Exp $
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kern_syscall.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <sys/file2.h>
#include <sys/thread.h>
#include <sys/thread2.h>
#include <vm/vm_page2.h>

static int max_proc_mmap = 1000000;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
int vkernel_enable;
SYSCTL_INT(_vm, OID_AUTO, vkernel_enable, CTLFLAG_RW, &vkernel_enable, 0, "");

/*
 * MPSAFE
 */
int
sys_sbrk(struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * sstk_args(int incr)
 *
 * MPSAFE
 */
int
sys_sstk(struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * mmap_args(void *addr, size_t len, int prot, int flags, int fd,
 *	      long pad, off_t pos)
 *
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 *
 * No requirements
 */
int
kern_mmap(struct vmspace *vms, caddr_t uaddr, size_t ulen,
	  int uprot, int uflags, int fd, off_t upos, void **res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp = NULL;
	struct vnode *vp;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	off_t pos;
	vm_object_t obj;

	KKASSERT(p);

	addr = (vm_offset_t) uaddr;
	size = ulen;
	prot = uprot & VM_PROT_ALL;
	flags = uflags;
	pos = upos;

	/*
	 * Make sure mapping fits into numeric range etc.
	 *
	 * NOTE: We support the full unsigned range for size now.
	 */
	if (((flags & MAP_ANON) && (fd != -1 || pos != 0)))
		return (EINVAL);

	if (size == 0)
		return (EINVAL);

	if (flags & MAP_STACK) {
		if (fd != -1)
			return (EINVAL);
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Virtual page tables cannot be used with MAP_STACK.  Apart from
	 * it not making any sense, the aux union is used by both
	 * types.
	 *
	 * Because the virtual page table is stored in the backing object
	 * and might be updated by the kernel, the mapping must be R+W.
	 */
	if (flags & MAP_VPAGETABLE) {
		if (vkernel_enable == 0)
			return (EOPNOTSUPP);
		if (flags & MAP_STACK)
			return (EINVAL);
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return (EINVAL);
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */
	if (size < ulen)			/* wrap */
		return(EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & (MAP_FIXED | MAP_TRYFIXED)) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/*
		 * Address range must be all in user VM space and not wrap.
		 */
		tmpaddr = addr + size;
		if (tmpaddr < addr)
			return (EINVAL);
		if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
			return (EINVAL);
		if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
			return (EINVAL);
	} else {
		/*
		 * Get a hint of where to map.  It also provides mmap offset
		 * randomization if enabled.
		 */
		addr = vm_map_hint(p, addr, prot);
	}

	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and make
		 * sure it is of appropriate type.
		 */
		fp = holdfp(p->p_fd, fd, -1);
		if (fp == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE) {
			error = EINVAL;
			goto done;
		}
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_NOSYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = (struct vnode *) fp->f_data;

		/*
		 * Validate the vnode for the operation.
		 */
		switch(vp->v_type) {
		case VREG:
			/*
			 * Get the proper underlying object
			 */
			if ((obj = vp->v_object) == NULL) {
				error = EINVAL;
				goto done;
			}
			KKASSERT((struct vnode *)obj->handle == vp);
			break;
		case VCHR:
			/*
			 * Make sure a device has not been revoked.
			 * Mappability is handled by the device layer.
			 */
			if (vp->v_rdev == NULL) {
				error = EBADF;
				goto done;
			}
			break;
		default:
			/*
			 * Nothing else is mappable.
			 */
			error = EINVAL;
			goto done;
		}

		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (ala
		 * SunOS).
		 */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			handle = NULL;
			maxprot = VM_PROT_ALL;
			flags |= MAP_ANON;
			pos = 0;
		} else {
			/*
			 * cdevs do not provide private mappings of any kind.
			 */
			if (vp->v_type == VCHR &&
			    (flags & (MAP_PRIVATE|MAP_COPY))) {
				error = EINVAL;
				goto done;
			}
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination?  What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;
			if (fp->f_flag & FREAD) {
				maxprot |= VM_PROT_READ;
			} else if (prot & PROT_READ) {
				error = EACCES;
				goto done;
			}
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out.  Check for superuser, only if
			 * we're at securelevel < 1, to allow the XIG X server
			 * to continue to work.
			 */
			if ((flags & MAP_SHARED) != 0 || vp->v_type == VCHR) {
				if ((fp->f_flag & FWRITE) != 0) {
					struct vattr va;
					if ((error = VOP_GETATTR(vp, &va))) {
						goto done;
					}
					if ((va.va_flags &
					    (IMMUTABLE|APPEND)) == 0) {
						maxprot |= VM_PROT_WRITE;
					} else if (prot & PROT_WRITE) {
						error = EPERM;
						goto done;
					}
				} else if ((prot & PROT_WRITE) != 0) {
					error = EACCES;
					goto done;
				}
			} else {
				maxprot |= VM_PROT_WRITE;
			}
			handle = (void *)vp;
		}
	}

	lwkt_gettoken(&vms->vm_map.token);

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  0 to disable.
	 */
	if (max_proc_mmap && vms->vm_map.nentries >= max_proc_mmap) {
		error = ENOMEM;
		lwkt_reltoken(&vms->vm_map.token);
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
			flags, handle, pos);
	if (error == 0)
		*res = (void *)(addr + pageoff);

	lwkt_reltoken(&vms->vm_map.token);
done:
	if (fp)
		fdrop(fp);

	return (error);
}

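/*
 * Illustrative userland sketch (not part of this file, kept under #if 0):
 * per the comment above kern_mmap(), a file offset that is not page
 * aligned is truncated to a page boundary for the actual mapping and the
 * returned pointer is adjusted up by the remainder, so the pointer refers
 * to the requested byte of the file.  The path and helper name are
 * hypothetical.
 */
#if 0
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static void *
map_at_offset(const char *path, off_t off, size_t len)
{
	int fd = open(path, O_RDONLY);
	void *p;

	if (fd < 0)
		return (MAP_FAILED);
	/*
	 * The kernel maps from the page-truncated offset and returns a
	 * pointer bumped by (off & PAGE_MASK), so p points at byte `off'.
	 */
	p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, off);
	close(fd);
	return (p);
}
#endif
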
/*
 * mmap system call handler
 *
 * No requirements.
 */
int
sys_mmap(struct mmap_args *uap)
{
	int error;

	error = kern_mmap(curproc->p_vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->pos, &uap->sysmsg_resultp);

	return (error);
}

/*
 * msync system call handler
 *
 * msync_args(void *addr, size_t len, int flags)
 *
 * No requirements
 */
int
sys_msync(struct msync_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/*
	 * map->token serializes extracting the address range for size == 0
	 * msyncs with the vm_map_clean call; if the token were not held
	 * across the two calls, an intervening munmap/mmap pair, for example,
	 * could cause msync to occur on a wrong region.
	 */
	lwkt_gettoken(&map->token);

	/*
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we don't
	 * really keep track of individual mmaps so we approximate by flushing
	 * the range of the map entry containing addr.  This can be incorrect
	 * if the region splits or is coalesced with a neighbor.
	 */
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = vm_map_lookup_entry(map, addr, &entry);
		if (rv == FALSE) {
			vm_map_unlock_read(map);
			rv = KERN_INVALID_ADDRESS;
			goto done;
		}
		addr = entry->start;
		size = entry->end - entry->start;
		vm_map_unlock_read(map);
	}

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
			  (flags & MS_INVALIDATE) != 0);
done:
	lwkt_reltoken(&map->token);

	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}

	return (0);
}

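/*
 * Illustrative userland sketch (not part of this file, kept under #if 0):
 * flushing a shared file mapping.  As enforced above, MS_ASYNC and
 * MS_INVALIDATE cannot be combined; a flush with MS_SYNC takes the
 * synchronous path in sys_msync() because the MS_ASYNC bit is clear.
 * The helper name is hypothetical.
 */
#if 0
#include <sys/mman.h>

static int
flush_mapping(void *base, size_t len)
{
	/* Synchronously write back the dirty pages backing [base, base+len) */
	if (msync(base, len, MS_SYNC) == -1)
		return (-1);
	return (0);
}
#endif
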
/*
 * munmap system call handler
 *
 * munmap_args(void *addr, size_t len)
 *
 * No requirements
 */
int
sys_munmap(struct munmap_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/* map->token serializes between the map check and the actual unmap */
	lwkt_gettoken(&map->token);

	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size,
				     VM_PROT_NONE, FALSE)) {
		lwkt_reltoken(&map->token);
		return (EINVAL);
	}
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_remove(map, addr, addr + size);
	lwkt_reltoken(&map->token);
	return (0);
}

/*
 * mprotect_args(const void *addr, size_t len, int prot)
 *
 * No requirements.
 */
int
sys_mprotect(struct mprotect_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_prot_t prot;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size,
			       prot, FALSE)) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EACCES;
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

/*
 * minherit system call handler
 *
 * minherit_args(void *addr, size_t len, int inherit)
 *
 * No requirements.
 */
int
sys_minherit(struct minherit_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	switch (vm_map_inherit(&p->p_vmspace->vm_map, addr,
			       addr + size, inherit)) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EACCES;
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

/*
 * madvise system call handler
 *
 * madvise_args(void *addr, size_t len, int behav)
 *
 * No requirements.
 */
int
sys_madvise(struct madvise_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav >= MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (tmpaddr < (vm_offset_t)uap->addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, 0);
	return (error);
}

/*
 * mcontrol system call handler
 *
 * mcontrol_args(void *addr, size_t len, int behav, off_t value)
 *
 * No requirements
 */
int
sys_mcontrol(struct mcontrol_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (tmpaddr < (vm_offset_t) uap->addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, uap->value);
	return (error);
}


/*
 * mincore system call handler
 *
 * mincore_args(const void *addr, size_t len, char *vec)
 *
 * No requirements
 */
int
sys_mincore(struct mincore_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	if (end < addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && end > VM_MAX_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	map = &p->p_vmspace->vm_map;
	pmap = vmspace_pmap(p->p_vmspace);

	lwkt_gettoken(&map->token);
	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for(current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if (current->maptype != VM_MAPTYPE_NORMAL &&
		    current->maptype != VM_MAPTYPE_VPAGETABLE) {
			continue;
		}
		if (current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 *
			 * If we have to check the VM object, only mess
			 * around with normal maps.  Do not mess around
			 * with virtual page tables (XXX).
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (mincoreinfo == 0 &&
			    current->maptype == VM_MAPTYPE_NORMAL) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;

				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);

				/*
				 * if the page is resident, then gather
				 * information about it.  spl protection is
				 * required to maintain the object
				 * association.  And XXX what if the page is
				 * busy?  What's the deal with that?
				 *
				 * XXX vm_token - legacy for pmap_ts_referenced
				 *     in i386 and vkernel pmap code.
				 */
				lwkt_gettoken(&vm_token);
				vm_object_hold(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
						   pindex);
				if (m && m->valid) {
					mincoreinfo = MINCORE_INCORE;
					if (m->dirty || pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
				}
				vm_object_drop(current->object.vm_object);
				lwkt_reltoken(&vm_token);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while((lastvecindex + 1) < vecindex) {
				error = subyte( vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done;
				}
				++lastvecindex;
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done;
			}

			/*
			 * If the map has changed, due to the subyte,
			 * the previous output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while((lastvecindex + 1) < vecindex) {
		error = subyte( vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done;
		}
		++lastvecindex;
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);

	error = 0;
done:
	lwkt_reltoken(&map->token);
	return (error);
}

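/*
 * Illustrative userland sketch (not part of this file, kept under #if 0):
 * querying residency with mincore(2).  The byte vector receives one entry
 * per page; bits such as MINCORE_INCORE and MINCORE_MODIFIED_OTHER
 * correspond to the values gathered above.  The helper name is
 * hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>

static int
count_resident_pages(void *base, size_t len)
{
	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
	size_t npages = (len + pgsz - 1) / pgsz;
	char *vec = malloc(npages);
	size_t i;
	int resident = 0;

	if (vec == NULL)
		return (-1);
	if (mincore(base, len, vec) == 0) {
		for (i = 0; i < npages; ++i) {
			if (vec[i] & MINCORE_INCORE)
				++resident;
		}
	} else {
		resident = -1;
	}
	free(vec);
	return (resident);
}
#endif
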
/*
 * mlock system call handler
 *
 * mlock_args(const void *addr, size_t len)
 *
 * No requirements
 */
int
sys_mlock(struct mlock_args *uap)
{
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return (EINVAL);

	if (atop(size) + vmstats.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

	/*
	 * We do not need to synchronize against other threads updating ucred;
	 * they update p->ucred, which we synchronize into td_ucred ourselves.
	 */
#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) {
		return (ENOMEM);
	}
#else
	error = priv_check_cred(td->td_ucred, PRIV_ROOT, 0);
	if (error) {
		return (error);
	}
#endif
	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, FALSE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * mlockall(int how)
 *
 * No requirements
 */
int
sys_mlockall(struct mlockall_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;
	int how = uap->how;
	int rc = KERN_SUCCESS;

	if (((how & MCL_CURRENT) == 0) && ((how & MCL_FUTURE) == 0))
		return (EINVAL);

	rc = priv_check_cred(td->td_ucred, PRIV_ROOT, 0);
	if (rc)
		return (rc);

	vm_map_lock(map);
	do {
		if (how & MCL_CURRENT) {
			for(entry = map->header.next;
			    entry != &map->header;
			    entry = entry->next);

			rc = ENOSYS;
			break;
		}

		if (how & MCL_FUTURE)
			map->flags |= MAP_WIREFUTURE;
	} while(0);
	vm_map_unlock(map);

	return (rc);
}

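/*
 * Illustrative userland sketch (not part of this file, kept under #if 0):
 * wiring a buffer with mlock(2).  As implemented in sys_mlock() above, the
 * request can fail with EAGAIN when the global wired-page limit would be
 * exceeded, with ENOMEM when RLIMIT_MEMLOCK would be exceeded, or with a
 * privilege error on configurations without pmap_wired_count.  The helper
 * name is hypothetical.
 */
#if 0
#include <sys/mman.h>
#include <errno.h>

static int
pin_buffer(void *buf, size_t len)
{
	if (mlock(buf, len) == -1) {
		/* EAGAIN/ENOMEM/EPERM map to the checks in sys_mlock() */
		return (errno);
	}
	/* ... use the wired buffer ... */
	munlock(buf, len);
	return (0);
}
#endif
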
/*
 * munlockall(void)
 *
 * Unwire all user-wired map entries, cancel MCL_FUTURE.
 *
 * No requirements
 */
int
sys_munlockall(struct munlockall_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;
	int rc = KERN_SUCCESS;

	vm_map_lock(map);

	/* Clear MAP_WIREFUTURE to cancel mlockall(MCL_FUTURE) */
	map->flags &= ~MAP_WIREFUTURE;

retry:
	for (entry = map->header.next;
	     entry != &map->header;
	     entry = entry->next) {
		if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
			continue;

		/*
		 * If we encounter an in-transition entry, we release the
		 * map lock and retry the scan; we do not decrement any
		 * wired_count more than once because we do not touch
		 * any entries with MAP_ENTRY_USER_WIRED not set.
		 *
		 * There is a potential interleaving with concurrent
		 * mlockall()s here -- if we abort a scan, an mlockall()
		 * could start, wire a number of entries before our
		 * current position, and then stall itself on this
		 * or any other in-transition entry.  If that occurs, when
		 * we resume, we will unwire those entries.
		 */
		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
			++mycpu->gd_cnt.v_intrans_coll;
			++mycpu->gd_cnt.v_intrans_wait;
			vm_map_transition_wait(map, 1);
			goto retry;
		}

		KASSERT(entry->wired_count > 0,
			("wired_count was 0 with USER_WIRED set! %p", entry));

		/* Drop wired count, if it hits zero, unwire the entry */
		entry->eflags &= ~MAP_ENTRY_USER_WIRED;
		entry->wired_count--;
		if (entry->wired_count == 0)
			vm_fault_unwire(map, entry);
	}

	vm_map_unlock(map);

	return (rc);
}

/*
 * munlock system call handler
 *
 * munlock_args(const void *addr, size_t len)
 *
 * No requirements
 */
int
sys_munlock(struct munlock_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	tmpaddr = addr + size;
	if (tmpaddr < addr)		/* wrap */
		return (EINVAL);

#ifndef pmap_wired_count
	error = priv_check(td, PRIV_ROOT);
	if (error)
		return (error);
#endif

	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, TRUE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

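/*
 * Illustrative sketch (not part of this file, kept under #if 0): the
 * handlers above compute the range end in a discrete statement
 * (tmpaddr = addr + size) before testing for wrap.  Per the
 * "workaround gcc4 opt" notes here and in vm_mmap() below, folding the
 * addition into the comparison gave the optimizer room to drop the wrap
 * test.  The helper name is hypothetical.
 */
#if 0
static int
range_wraps(vm_offset_t addr, vm_size_t size)
{
	vm_offset_t tmpaddr;

	tmpaddr = addr + size;		/* discrete statement, as above */
	if (tmpaddr < addr)		/* wrap */
		return (1);
	return (0);
}
#endif
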
/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and sys5 shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
 *
 * No requirements
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	vm_offset_t eaddr;
	vm_size_t esize;
	vm_size_t align;
	int (*uksmap)(cdev_t dev, vm_page_t fake);
	struct vnode *vp;
	struct thread *td = curthread;
	struct proc *p;
	int rv = KERN_SUCCESS;
	off_t objsize;
	int docow;
	int error;

	if (size == 0)
		return (0);

	objsize = round_page(size);
	if (objsize < size)
		return (EINVAL);
	size = objsize;

	lwkt_gettoken(&map->token);

	/*
	 * XXX messy code, fixme
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 *	 will optimize it out.
	 */
	if ((p = curproc) != NULL && map == &p->p_vmspace->vm_map) {
		esize = map->size + size;	/* workaround gcc4 opt */
		if (esize < map->size ||
		    esize > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
			lwkt_reltoken(&map->token);
			return(ENOMEM);
		}
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 *	 will optimize it out.
	 */
	if (foff & PAGE_MASK) {
		lwkt_reltoken(&map->token);
		return (EINVAL);
	}

	/*
	 * Handle alignment.  For large memory maps it is possible
	 * that the MMU can optimize the page table so align anything
	 * that is a multiple of SEG_SIZE to SEG_SIZE.
	 *
	 * Also align any large mapping (bigger than 16x SEG_SIZE) to a
	 * SEG_SIZE address boundary.
	 */
	if (flags & MAP_SIZEALIGN) {
		align = size;
		if ((align ^ (align - 1)) != (align << 1) - 1) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
	} else if ((flags & MAP_FIXED) == 0 &&
		   ((size & SEG_MASK) == 0 || size > SEG_SIZE * 16)) {
		align = SEG_SIZE;
	} else {
		align = PAGE_SIZE;
	}

	if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr)) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
		eaddr = *addr + size;
		if (eaddr < *addr) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
		fitit = FALSE;
		if ((flags & MAP_TRYFIXED) == 0)
			vm_map_remove(map, *addr, *addr + size);
	}

	uksmap = NULL;

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle) {
			/*
			 * Default memory object
			 */
			object = default_pager_alloc(handle, objsize,
						     prot, foff);
			if (object == NULL) {
				lwkt_reltoken(&map->token);
				return(ENOMEM);
			}
			docow = MAP_PREFAULT_PARTIAL;
		} else {
			/*
			 * Implicit single instance of a default memory
			 * object, so we don't need a VM object yet.
			 */
			foff = 0;
			object = NULL;
			docow = 0;
		}
		vp = NULL;
	} else {
		vp = (struct vnode *)handle;

		/*
		 * Non-anonymous mappings of VCHR (aka not /dev/zero)
		 * cannot specify MAP_STACK or MAP_VPAGETABLE.
		 */
		if (vp->v_type == VCHR) {
			if (flags & (MAP_STACK | MAP_VPAGETABLE)) {
				lwkt_reltoken(&map->token);
				return(EINVAL);
			}
		}

		if (vp->v_type == VCHR && vp->v_rdev->si_ops->d_uksmap) {
			/*
			 * Device mappings without a VM object, typically
			 * sharing permanently allocated kernel memory or
			 * process-context-specific (per-process) data.
			 *
			 * Force them to be shared.
			 */
			uksmap = vp->v_rdev->si_ops->d_uksmap;
			object = NULL;
			docow = MAP_PREFAULT_PARTIAL;
			flags &= ~(MAP_PRIVATE|MAP_COPY);
			flags |= MAP_SHARED;
		} else if (vp->v_type == VCHR) {
			/*
			 * Device mappings (device size unknown?).
			 * Force them to be shared.
			 */
			error = dev_dmmap_single(vp->v_rdev, &foff, objsize,
						 &object, prot, NULL);

			if (error == ENODEV) {
				handle = (void *)(intptr_t)vp->v_rdev;
				object = dev_pager_alloc(handle, objsize, prot, foff);
				if (object == NULL) {
					lwkt_reltoken(&map->token);
					return(EINVAL);
				}
			} else if (error) {
				lwkt_reltoken(&map->token);
				return(error);
			}

			docow = MAP_PREFAULT_PARTIAL;
			flags &= ~(MAP_PRIVATE|MAP_COPY);
			flags |= MAP_SHARED;
		} else {
			/*
			 * Regular file mapping (typically).  The attribute
			 * check is for the link count test only.  mmapable
			 * vnodes must already have a VM object assigned.
			 */
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat);
			if (error) {
				lwkt_reltoken(&map->token);
				return (error);
			}
			docow = MAP_PREFAULT_PARTIAL;
			object = vnode_pager_reference(vp);
			if (object == NULL && vp->v_type == VREG) {
				lwkt_reltoken(&map->token);
				kprintf("Warning: cannot mmap vnode %p, no "
					"object\n", vp);
				return(EINVAL);
			}

			/*
			 * If it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
	}

	/*
	 * Deal with the adjusted flags
	 */
	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

	/*
	 * This may place the area in its own page directory if (size) is
	 * large enough, otherwise it typically returns its argument.
	 *
	 * (object can be NULL)
	 */
	if (fitit) {
		*addr = pmap_addr_hint(object, *addr, size);
	}

	/*
	 * Stack mappings need special attention.
	 *
	 * Mappings that use virtual page tables will default to storing
	 * the page table at offset 0.
	 */
	if (uksmap) {
		rv = vm_map_find(map, uksmap, vp->v_rdev,
				 foff, addr, size,
				 align, fitit,
				 VM_MAPTYPE_UKSMAP, VM_SUBSYS_MMAP,
				 prot, maxprot, docow);
	} else if (flags & MAP_STACK) {
		rv = vm_map_stack(map, *addr, size, flags,
				  prot, maxprot, docow);
	} else if (flags & MAP_VPAGETABLE) {
		rv = vm_map_find(map, object, NULL,
				 foff, addr, size,
				 align, fitit,
				 VM_MAPTYPE_VPAGETABLE, VM_SUBSYS_MMAP,
				 prot, maxprot, docow);
	} else {
		rv = vm_map_find(map, object, NULL,
				 foff, addr, size,
				 align, fitit,
				 VM_MAPTYPE_NORMAL, VM_SUBSYS_MMAP,
				 prot, maxprot, docow);
	}

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 *
		 * (NOTE: object can be NULL)
		 */
		vm_object_deallocate(object);
		goto out;
	}

	/*
	 * Shared memory is also shared with children.
	 */
	if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS) {
			vm_map_remove(map, *addr, *addr + size);
			goto out;
		}
	}

	/* If a process has marked all future mappings for wiring, do so */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_unwire(map, *addr, *addr + size, FALSE);

	/*
	 * Set the access time on the vnode
	 */
	if (vp != NULL)
		vn_mark_atime(vp, td);
out:
	lwkt_reltoken(&map->token);

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}