/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 * $FreeBSD: src/sys/vm/vm_mmap.c,v 1.108.2.6 2002/07/02 20:06:19 dillon Exp $
 * $DragonFly: src/sys/vm/vm_mmap.c,v 1.39 2007/04/30 07:18:57 dillon Exp $
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kern_syscall.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <sys/file2.h>
#include <sys/thread2.h>

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
int vkernel_enable;
SYSCTL_INT(_vm, OID_AUTO, vkernel_enable, CTLFLAG_RW, &vkernel_enable, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel from running out of
 * resources if attacked from a compromised user account, but generous
 * enough that multi-threaded processes are not unduly inconvenienced.
 */

static void vmmapentry_rsrc_init (void *);
SYSINIT(vmmersrc, SI_BOOT1_POST, SI_ORDER_ANY, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(void *dummy)
{
	max_proc_mmap = KvaSize / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}
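
/*
 * A rough worked example of the sizing above (illustrative numbers only;
 * the real values depend on the platform): with 1 GB of KVM malloc space
 * and a vm_map_entry of roughly 100 bytes, KvaSize / sizeof(entry) is on
 * the order of 10 million, so the per-process cap works out to roughly
 * 100,000 map entries, before the refcnt scaling applied in kern_mmap().
 */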

/* ARGSUSED */
int
sys_sbrk(struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * sstk_args(int incr)
 */
/* ARGSUSED */
int
sys_sstk(struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * mmap_args(void *addr, size_t len, int prot, int flags, int fd,
 *	     long pad, off_t pos)
 *
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */

int
kern_mmap(struct vmspace *vms, caddr_t uaddr, size_t ulen,
	  int uprot, int uflags, int fd, off_t upos, void **res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp = NULL;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	int disablexworkaround;
	off_t pos;
	vm_object_t obj;

	KKASSERT(p);

	addr = (vm_offset_t) uaddr;
	size = ulen;
	prot = uprot & VM_PROT_ALL;
	flags = uflags;
	pos = upos;

	/* make sure mapping fits into numeric range etc */
	if ((ssize_t) ulen < 0 || ((flags & MAP_ANON) && fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Virtual page tables cannot be used with MAP_STACK.  Apart from
	 * it not making any sense, the aux union is used by both
	 * types.
	 *
	 * Because the virtual page table is stored in the backing object
	 * and might be updated by the kernel, the mapping must be R+W.
	 */
	if (flags & MAP_VPAGETABLE) {
		if (vkernel_enable == 0)
			return (EOPNOTSUPP);
		if (flags & MAP_STACK)
			return (EINVAL);
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return (EINVAL);
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */
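
	/*
	 * Worked example of the adjustment above (illustrative numbers,
	 * assuming PAGE_SIZE is 4096): for pos = 0x1234 and size = 0x100,
	 * pageoff becomes 0x234, pos is backed up to 0x1000, and size
	 * becomes 0x334, which rounds up to 0x1000, so the mapping covers
	 * the entire page containing the requested range.
	 */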

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (VM_MAX_USER_ADDRESS > 0 && addr + size > VM_MAX_USER_ADDRESS)
			return (EINVAL);
		if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	}
	/*
	 * XXX for non-fixed mappings where no hint is provided or
	 * the hint would fall in the potential heap space,
	 * place it after the end of the largest possible heap.
	 *
	 * There should really be a pmap call to determine a reasonable
	 * location.
	 */
	else if (addr == 0 ||
	    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
	     addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
		addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);

	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	} else {
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and make
		 * sure it is of appropriate type.
		 */
		fp = holdfp(p->p_fd, fd, -1);
		if (fp == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE) {
			error = EINVAL;
			goto done;
		}
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = (struct vnode *) fp->f_data;

		/*
		 * Validate the vnode for the operation.
		 */
		switch (vp->v_type) {
		case VREG:
			/*
			 * Get the proper underlying object
			 */
			if ((obj = vp->v_object) == NULL) {
				error = EINVAL;
				goto done;
			}
			KKASSERT((struct vnode *)obj->handle == vp);
			break;
		case VCHR:
			/*
			 * Make sure a device has not been revoked.
			 * Mappability is handled by the device layer.
			 */
			if (vp->v_rdev == NULL) {
				error = EBADF;
				goto done;
			}
			break;
		default:
			/*
			 * Nothing else is mappable.
			 */
			error = EINVAL;
			goto done;
		}

		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (ala
		 * SunOS).
		 */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			handle = NULL;
			maxprot = VM_PROT_ALL;
			flags |= MAP_ANON;
			pos = 0;
		} else {
			/*
			 * cdevs do not provide private mappings of any kind.
			 *
			 * However, for the XIG X server to continue to work,
			 * we should allow the superuser to do it anyway.
			 * We only allow it at securelevel < 1.
			 * (Because the XIG X server writes directly to video
			 * memory via /dev/mem, it should never work at any
			 * other securelevel.)
			 * XXX this will have to go
			 */
			if (securelevel >= 1)
				disablexworkaround = 1;
			else
				disablexworkaround = suser(td);
			if (vp->v_type == VCHR && disablexworkaround &&
			    (flags & (MAP_PRIVATE|MAP_COPY))) {
				error = EINVAL;
				goto done;
			}
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination?  What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;	/* ??? */
			if (fp->f_flag & FREAD) {
				maxprot |= VM_PROT_READ;
			} else if (prot & PROT_READ) {
				error = EACCES;
				goto done;
			}
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out.  Check for superuser, only if
			 * we're at securelevel < 1, to allow the XIG X server
			 * to continue to work.
			 */
			if ((flags & MAP_SHARED) != 0 ||
			    (vp->v_type == VCHR && disablexworkaround)) {
				if ((fp->f_flag & FWRITE) != 0) {
					struct vattr va;
					if ((error = VOP_GETATTR(vp, &va))) {
						goto done;
					}
					if ((va.va_flags &
					    (IMMUTABLE|APPEND)) == 0) {
						maxprot |= VM_PROT_WRITE;
					} else if (prot & PROT_WRITE) {
						error = EPERM;
						goto done;
					}
				} else if ((prot & PROT_WRITE) != 0) {
					error = EACCES;
					goto done;
				}
			} else {
				maxprot |= VM_PROT_WRITE;
			}
			handle = (void *)vp;
		}
	}

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_sysref.refcnt) {
		error = ENOMEM;
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
			flags, handle, pos);
	if (error == 0)
		*res = (void *)(addr + pageoff);
done:
	if (fp)
		fdrop(fp);
	return (error);
}

int
sys_mmap(struct mmap_args *uap)
{
	int error;

	error = kern_mmap(curproc->p_vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->pos, &uap->sysmsg_resultp);

	return (error);
}
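
/*
 * Illustrative userland sketch (not kernel code; the path and sizes are
 * made up) of the unaligned-offset behavior documented above kern_mmap():
 * a non-page-aligned file offset is accepted without MAP_FIXED, the kernel
 * maps from trunc_page() of the offset, and the returned pointer is bumped
 * by the offset's remainder modulo the page size.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/tmp/datafile", O_RDONLY);
 *	char *p = mmap(NULL, 100, PROT_READ, MAP_SHARED, fd, 10);
 *	// The mapping is backed from file offset 0 (trunc_page(10)) and
 *	// p points 10 bytes past the page base, so *p is byte 10 of the file.
 */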

/*
 * msync_args(void *addr, int len, int flags)
 */
int
sys_msync(struct msync_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return(EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/*
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we don't
	 * really keep track of individual mmaps so we approximate by flushing
	 * the range of the map entry containing addr.  This can be incorrect
	 * if the region splits or is coalesced with a neighbor.
	 */
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = vm_map_lookup_entry(map, addr, &entry);
		vm_map_unlock_read(map);
		if (rv == FALSE)
			return (EINVAL);
		addr = entry->start;
		size = entry->end - entry->start;
	}

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
			  (flags & MS_INVALIDATE) != 0);

	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}

	return (0);
}
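
/*
 * Illustrative userland sketch (assumed mapping p): synchronously flushing
 * a dirtied shared mapping.  Note that combining MS_ASYNC with
 * MS_INVALIDATE is rejected with EINVAL by the check above.
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	p[0] = 'x';			// dirty the mapping
 *	if (msync(p, 4096, MS_SYNC) == -1)
 *		err(1, "msync");
 */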

/*
 * munmap_args(void *addr, size_t len)
 */
int
sys_munmap(struct munmap_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return(EINVAL);

	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAX_USER_ADDRESS > 0 && addr + size > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);
	map = &p->p_vmspace->vm_map;
	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
		return (EINVAL);
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_remove(map, addr, addr + size);
	return (0);
}

/*
 * mprotect_args(const void *addr, size_t len, int prot)
 */
int
sys_mprotect(struct mprotect_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;
#endif

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return(EINVAL);

	switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
			       FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

/*
 * minherit_args(void *addr, size_t len, int inherit)
 */
int
sys_minherit(struct minherit_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return(EINVAL);

	switch (vm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
			       inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

/*
 * madvise_args(void *addr, size_t len, int behav)
 */
/* ARGSUSED */
int
sys_madvise(struct madvise_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav >= MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAX_USER_ADDRESS > 0 &&
	    ((vm_offset_t) uap->addr + uap->len) > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	return (vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, 0));
}

/*
 * mcontrol_args(void *addr, size_t len, int behav, off_t value)
 */
/* ARGSUSED */
int
sys_mcontrol(struct mcontrol_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAX_USER_ADDRESS > 0 &&
	    ((vm_offset_t) uap->addr + uap->len) > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	return (vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, uap->value));
}
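
/*
 * Illustrative userland sketch (assumed pointer p and length): hinting
 * that a region will be needed soon.  The range is rounded to page
 * boundaries as shown above, and the call is advisory only.
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	if (madvise(p, 1 << 20, MADV_WILLNEED) == -1)
 *		err(1, "madvise");
 */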

/*
 * mincore_args(const void *addr, size_t len, char *vec)
 */
/* ARGSUSED */
int
sys_mincore(struct mincore_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	if (VM_MAX_USER_ADDRESS > 0 && end > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (end < addr)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	map = &p->p_vmspace->vm_map;
	pmap = vmspace_pmap(p->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {
		/*
		 * ignore submaps (for now) or null objects
		 */
		if (current->maptype != VM_MAPTYPE_NORMAL &&
		    current->maptype != VM_MAPTYPE_VPAGETABLE) {
			continue;
		}
		if (current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 *
			 * If we have to check the VM object, only mess
			 * around with normal maps.  Do not mess around
			 * with virtual page tables (XXX).
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (mincoreinfo == 0 &&
			    current->maptype == VM_MAPTYPE_NORMAL) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;

				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);

				/*
				 * if the page is resident, then gather
				 * information about it.  spl protection is
				 * required to maintain the object
				 * association.  And XXX what if the page is
				 * busy?  What's the deal with that?
				 */
				crit_enter();
				m = vm_page_lookup(current->object.vm_object,
						   pindex);
				if (m && m->valid) {
					mincoreinfo = MINCORE_INCORE;
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
				}
				crit_exit();
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make
			 * sure that the byte vector is zeroed for those
			 * skipped entries.  Advance the index before writing
			 * so the zeroes land on the skipped slots rather than
			 * clobbering the last slot already filled in.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					return (EFAULT);
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				return (EFAULT);
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.  Again, advance the
	 * index before writing.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			return (EFAULT);
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);

	return (0);
}
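
/*
 * Illustrative userland sketch (assumes a four-page mapping p and a
 * 4096-byte page): one output byte describes each page of the range,
 * with MINCORE_INCORE set for resident pages.
 *
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	char vec[4];
 *	if (mincore(p, 4 * 4096, vec) == 0 && (vec[0] & MINCORE_INCORE))
 *		printf("first page is resident\n");
 */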

/*
 * mlock_args(const void *addr, size_t len)
 */
int
sys_mlock(struct mlock_args *uap)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;
	struct proc *p = curproc;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

	if (atop(size) + vmstats.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (ENOMEM);
#else
	error = suser_cred(p->p_ucred, 0);
	if (error)
		return (error);
#endif

	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, FALSE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * mlockall_args(int how)
 */
int
sys_mlockall(struct mlockall_args *uap)
{
	/* Stub: does nothing and reports success. */
	return 0;
}

/*
 * munlockall_args(void)
 */
int
sys_munlockall(struct munlockall_args *uap)
{
	/* Stub: does nothing and reports success. */
	return 0;
}

/*
 * munlock_args(const void *addr, size_t len)
 */
int
sys_munlock(struct munlock_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

#ifndef pmap_wired_count
	error = suser(td);
	if (error)
		return (error);
#endif

	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, TRUE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
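
/*
 * Illustrative userland sketch (assumed buffer buf): wiring one page so it
 * cannot be paged out.  As the checks above show, the request may fail
 * with EAGAIN against the global wired-page limit or, where
 * pmap_wired_count is available, with ENOMEM against RLIMIT_MEMLOCK.
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	if (mlock(buf, 4096) == -1)
 *		err(1, "mlock");
 *	...
 *	munlock(buf, 4096);
 */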

/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and sys5 shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	struct vnode *vp = NULL;
	struct proc *p;
	objtype_t type;
	int rv = KERN_SUCCESS;
	off_t objsize;
	int docow;

	if (size == 0)
		return (0);

	objsize = size = round_page(size);

	/*
	 * XXX messy code, fixme
	 */
	if ((p = curproc) != NULL && map == &p->p_vmspace->vm_map) {
		if (map->size + size > p->p_rlimit[RLIMIT_VMEM].rlim_cur)
			return(ENOMEM);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
		vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		type = OBJT_DEFAULT;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		vp = (struct vnode *) handle;
		if (vp->v_type == VCHR) {
			type = OBJT_DEVICE;
			handle = (void *)(intptr_t)vp->v_rdev;
		} else {
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat);
			if (error)
				return (error);
			objsize = vat.va_size;
			type = OBJT_VNODE;
			/*
			 * if it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
	}

	if (handle == NULL) {
		object = NULL;
		docow = 0;
	} else {
		object = vm_pager_allocate(type, handle, objsize, prot, foff);
		if (object == NULL)
			return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
		docow = MAP_PREFAULT_PARTIAL;
	}

	/*
	 * Force device mappings to be shared.
	 */
	if (type == OBJT_DEVICE || type == OBJT_PHYS) {
		flags &= ~(MAP_PRIVATE|MAP_COPY);
		flags |= MAP_SHARED;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	if (fitit) {
		*addr = pmap_addr_hint(object, *addr, size);
	}

	/*
	 * Stack mappings need special attention.  Mappings that use virtual
	 * page tables will default to storing the page table at offset 0.
	 */
	if (flags & MAP_STACK) {
		rv = vm_map_stack(map, *addr, size, prot, maxprot, docow);
	} else if (flags & MAP_VPAGETABLE) {
		rv = vm_map_find(map, object, foff, addr, size, fitit,
				 VM_MAPTYPE_VPAGETABLE, prot, maxprot, docow);
	} else {
		rv = vm_map_find(map, object, foff, addr, size, fitit,
				 VM_MAPTYPE_NORMAL, prot, maxprot, docow);
	}

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
		goto out;
	}

	/*
	 * Shared memory is also shared with children.
	 */
	if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS) {
			vm_map_remove(map, *addr, *addr + size);
			goto out;
		}
	}
out:
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
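
/*
 * Illustrative kernel-internal sketch (hypothetical caller; p is assumed
 * to be a struct proc pointer): establishing an anonymous shared mapping
 * directly through vm_mmap(), the entry point that exec and SysV shared
 * memory use per the header comment above.  A NULL handle selects
 * anonymous-memory semantics with a default object.
 *
 *	vm_offset_t addr = 0;
 *	int error;
 *
 *	error = vm_mmap(&p->p_vmspace->vm_map, &addr, PAGE_SIZE,
 *			VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
 *			MAP_ANON | MAP_SHARED, NULL, 0);
 *	if (error == 0)
 *		kprintf("anonymous page mapped at %p\n", (void *)addr);
 */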