/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 * $FreeBSD: src/sys/vm/vm_mmap.c,v 1.108.2.6 2002/07/02 20:06:19 dillon Exp $
 * $DragonFly: src/sys/vm/vm_mmap.c,v 1.14 2003/10/02 21:00:20 hmp Exp $
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include "opt_compat.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <sys/file2.h>

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel running out of resources
 * if attacked from a compromised user account but generous enough such that
 * multi-threaded processes are not unduly inconvenienced.
 */

static void vmmapentry_rsrc_init (void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(void *dummy)
{
#if defined(USE_KMEM_MAP)
	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
#else
	max_proc_mmap = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
			sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
#endif
}

/* ARGSUSED */
int
sbrk(struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * sstk_args(int incr)
 */
/* ARGSUSED */
int
sstk(struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)

/*
 * getpagesize_args(int dummy)
 */
/* ARGSUSED */
int
ogetpagesize(struct getpagesize_args *uap)
{
	uap->sysmsg_result = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */

/*
 * mmap_args(void *addr, size_t len, int prot, int flags, int fd,
 *	     long pad, off_t pos)
 *
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */

int
mmap(struct mmap_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct file *fp = NULL;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	int disablexworkaround;
	off_t pos;
	struct vmspace *vms = p->p_vmspace;
	vm_object_t obj;

	KKASSERT(p);

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	/* make sure mapping fits into numeric range etc */
	if ((ssize_t) uap->len < 0 ||
	    ((flags & MAP_ANON) && uap->fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */
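
	/*
	 * Worked example of the rounding above (illustrative sketch, not
	 * part of the original source): with 4K pages, a request of
	 * len = 0x1800 at file offset pos = 0x1200 yields pageoff = 0x200,
	 * pos = 0x1000, and size = round_page(0x1800 + 0x200) = 0x2000.
	 * On success the caller gets back addr + pageoff, so the returned
	 * pointer still refers to file offset 0x1200.
	 */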

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
			return (EINVAL);
#ifndef i386
		if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
			return (EINVAL);
#endif
		if (addr + size < addr)
			return (EINVAL);
	}
	/*
	 * XXX for non-fixed mappings where no hint is provided or
	 * the hint would fall in the potential heap space,
	 * place it after the end of the largest possible heap.
	 *
	 * There should really be a pmap call to determine a reasonable
	 * location.
	 */
	else if (addr == 0 ||
	    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
	     addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
		addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);

	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	} else {
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and make
		 * sure it is of appropriate type.
		 */
		if (((unsigned) uap->fd) >= fdp->fd_nfiles ||
		    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE)
			return (EINVAL);
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_NOSYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = (struct vnode *) fp->f_data;
		if (vp->v_type != VREG && vp->v_type != VCHR)
			return (EINVAL);
		if (vp->v_type == VREG) {
			/*
			 * Get the proper underlying object
			 */
			if (VOP_GETVOBJECT(vp, &obj) != 0)
				return (EINVAL);
			vp = (struct vnode *)obj->handle;
		}

		/*
		 * don't let the descriptor disappear on us if we block
		 */
		fhold(fp);

		/*
		 * XXX hack to handle use of /dev/zero to map anon memory
		 * (a la SunOS).
		 */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			handle = NULL;
			maxprot = VM_PROT_ALL;
			flags |= MAP_ANON;
			pos = 0;
		} else {
			/*
			 * cdevs do not provide private mappings of any kind.
			 *
			 * However, for the XIG X server to continue to work,
			 * we should allow the superuser to do it anyway.
			 * We only allow it at securelevel < 1.
			 * (Because the XIG X server writes directly to video
			 * memory via /dev/mem, it should never work at any
			 * other securelevel.)
			 * XXX this will have to go
			 */
			if (securelevel >= 1)
				disablexworkaround = 1;
			else
				disablexworkaround = suser(td);
			if (vp->v_type == VCHR && disablexworkaround &&
			    (flags & (MAP_PRIVATE|MAP_COPY))) {
				error = EINVAL;
				goto done;
			}
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if the mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination?  What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;	/* ??? */
			if (fp->f_flag & FREAD) {
				maxprot |= VM_PROT_READ;
			} else if (prot & PROT_READ) {
				error = EACCES;
				goto done;
			}
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out.  Check for superuser, only if
			 * we're at securelevel < 1, to allow the XIG X server
			 * to continue to work.
			 */
			if ((flags & MAP_SHARED) != 0 ||
			    (vp->v_type == VCHR && disablexworkaround)) {
				if ((fp->f_flag & FWRITE) != 0) {
					struct vattr va;
					if ((error = VOP_GETATTR(vp, &va, td))) {
						goto done;
					}
					if ((va.va_flags &
					    (IMMUTABLE|APPEND)) == 0) {
						maxprot |= VM_PROT_WRITE;
					} else if (prot & PROT_WRITE) {
						error = EPERM;
						goto done;
					}
				} else if ((prot & PROT_WRITE) != 0) {
					error = EACCES;
					goto done;
				}
			} else {
				maxprot |= VM_PROT_WRITE;
			}
			handle = (void *)vp;
		}
	}
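
	/*
	 * Summary of the maxprot derivation above (illustrative sketch,
	 * not part of the original source; the code above is
	 * authoritative):
	 *
	 *	descriptor	mapping		resulting maxprot
	 *	----------	-------		-----------------
	 *	FREAD only	MAP_PRIVATE	EXEC|READ|WRITE (copy-on-write)
	 *	FREAD only	MAP_SHARED	EXEC|READ; PROT_WRITE => EACCES
	 *	FREAD|FWRITE	MAP_SHARED	EXEC|READ|WRITE, unless the
	 *					file is IMMUTABLE/APPEND, in
	 *					which case PROT_WRITE => EPERM
	 */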

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
		error = ENOMEM;
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle, pos);
	if (error == 0)
		uap->sysmsg_resultp = (void *)(addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);
	return (error);
}
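
/*
 * Userland usage sketch for the syscall above (illustrative only, not
 * part of the original source):
 *
 *	void *p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
 *	if (p == MAP_FAILED)
 *		err(1, "mmap");
 */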

#ifdef COMPAT_43
/*
 * ommap_args(caddr_t addr, int len, int prot, int flags, int fd, long pos)
 */
int
ommap(struct ommap_args *uap)
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100
#define	OMAP_INHERIT	0x0800

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	if (uap->flags & OMAP_INHERIT)
		nargs.flags |= MAP_INHERIT;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (mmap(&nargs));
}
#endif /* COMPAT_43 */

/*
 * msync_args(void *addr, int len, int flags)
 */
int
msync(struct msync_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/*
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages within the region containing addr".  Unfortunately, we don't
	 * really keep track of individual mmaps so we approximate by flushing
	 * the range of the map entry containing addr.  This can be incorrect
	 * if the region splits or is coalesced with a neighbor.
	 */
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = vm_map_lookup_entry(map, addr, &entry);
		vm_map_unlock_read(map);
		if (rv == FALSE)
			return (EINVAL);
		addr = entry->start;
		size = entry->end - entry->start;
	}

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);

	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}

	return (0);
}
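
/*
 * Example of the flag handling above (illustrative sketch, not part of
 * the original source): msync(addr, 0, MS_SYNC) synchronously flushes
 * the entire map entry containing addr, MS_ASYNC queues the writes
 * without waiting, and MS_ASYNC|MS_INVALIDATE is rejected with EINVAL.
 */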

/*
 * munmap_args(void *addr, size_t len)
 */
int
munmap(struct munmap_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
		return (EINVAL);
#ifndef i386
	if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
		return (EINVAL);
#endif
	map = &p->p_vmspace->vm_map;
	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
		return (EINVAL);
	/* returns nothing but KERN_SUCCESS anyway */
	(void) vm_map_remove(map, addr, addr + size);
	return (0);
}

#if 0
void
munmapfd(struct proc *p, int fd)
{
	/*
	 * XXX should unmap any regions mapped to this file
	 */
	p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
}
#endif

/*
 * mprotect_args(const void *addr, size_t len, int prot)
 */
int
mprotect(struct mprotect_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;
#endif

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
	    FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}
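
/*
 * Userland usage sketch for mprotect() above (illustrative only, not
 * part of the original source): revoking write access while keeping a
 * region readable:
 *
 *	if (mprotect(p, len, PROT_READ) == -1)
 *		err(1, "mprotect");
 */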
674 */ 675 start = trunc_page((vm_offset_t) uap->addr); 676 end = round_page((vm_offset_t) uap->addr + uap->len); 677 678 if (vm_map_madvise(&p->p_vmspace->vm_map, start, end, uap->behav)) 679 return (EINVAL); 680 return (0); 681 } 682 683 /* 684 * mincore_args(const void *addr, size_t len, char *vec) 685 */ 686 /* ARGSUSED */ 687 int 688 mincore(struct mincore_args *uap) 689 { 690 struct proc *p = curproc; 691 vm_offset_t addr, first_addr; 692 vm_offset_t end, cend; 693 pmap_t pmap; 694 vm_map_t map; 695 char *vec; 696 int error; 697 int vecindex, lastvecindex; 698 vm_map_entry_t current; 699 vm_map_entry_t entry; 700 int mincoreinfo; 701 unsigned int timestamp; 702 703 /* 704 * Make sure that the addresses presented are valid for user 705 * mode. 706 */ 707 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 708 end = addr + (vm_size_t)round_page(uap->len); 709 if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS) 710 return (EINVAL); 711 if (end < addr) 712 return (EINVAL); 713 714 /* 715 * Address of byte vector 716 */ 717 vec = uap->vec; 718 719 map = &p->p_vmspace->vm_map; 720 pmap = vmspace_pmap(p->p_vmspace); 721 722 vm_map_lock_read(map); 723 RestartScan: 724 timestamp = map->timestamp; 725 726 if (!vm_map_lookup_entry(map, addr, &entry)) 727 entry = entry->next; 728 729 /* 730 * Do this on a map entry basis so that if the pages are not 731 * in the current processes address space, we can easily look 732 * up the pages elsewhere. 733 */ 734 lastvecindex = -1; 735 for(current = entry; 736 (current != &map->header) && (current->start < end); 737 current = current->next) { 738 739 /* 740 * ignore submaps (for now) or null objects 741 */ 742 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 743 current->object.vm_object == NULL) 744 continue; 745 746 /* 747 * limit this scan to the current map entry and the 748 * limits for the mincore call 749 */ 750 if (addr < current->start) 751 addr = current->start; 752 cend = current->end; 753 if (cend > end) 754 cend = end; 755 756 /* 757 * scan this entry one page at a time 758 */ 759 while(addr < cend) { 760 /* 761 * Check pmap first, it is likely faster, also 762 * it can provide info as to whether we are the 763 * one referencing or modifying the page. 764 */ 765 mincoreinfo = pmap_mincore(pmap, addr); 766 if (!mincoreinfo) { 767 vm_pindex_t pindex; 768 vm_ooffset_t offset; 769 vm_page_t m; 770 /* 771 * calculate the page index into the object 772 */ 773 offset = current->offset + (addr - current->start); 774 pindex = OFF_TO_IDX(offset); 775 m = vm_page_lookup(current->object.vm_object, 776 pindex); 777 /* 778 * if the page is resident, then gather information about 779 * it. 780 */ 781 if (m) { 782 mincoreinfo = MINCORE_INCORE; 783 if (m->dirty || 784 pmap_is_modified(m)) 785 mincoreinfo |= MINCORE_MODIFIED_OTHER; 786 if ((m->flags & PG_REFERENCED) || 787 pmap_ts_referenced(m)) { 788 vm_page_flag_set(m, PG_REFERENCED); 789 mincoreinfo |= MINCORE_REFERENCED_OTHER; 790 } 791 } 792 } 793 794 /* 795 * subyte may page fault. In case it needs to modify 796 * the map, we release the lock. 797 */ 798 vm_map_unlock_read(map); 799 800 /* 801 * calculate index into user supplied byte vector 802 */ 803 vecindex = OFF_TO_IDX(addr - first_addr); 804 805 /* 806 * If we have skipped map entries, we need to make sure that 807 * the byte vector is zeroed for those skipped entries. 

/*
 * mincore_args(const void *addr, size_t len, char *vec)
 */
/* ARGSUSED */
int
mincore(struct mincore_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS)
		return (EINVAL);
	if (end < addr)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	map = &p->p_vmspace->vm_map;
	pmap = vmspace_pmap(p->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (!mincoreinfo) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;

				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);
				m = vm_page_lookup(current->object.vm_object,
				    pindex);
				/*
				 * if the page is resident, then gather
				 * information about it.
				 */
				if (m) {
					mincoreinfo = MINCORE_INCORE;
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
				}
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make
			 * sure that the byte vector is zeroed for those
			 * skipped entries.  (Advance before writing so the
			 * already-filled vec[lastvecindex] is preserved.)
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					return (EFAULT);
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				return (EFAULT);
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			return (EFAULT);
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);

	return (0);
}

/*
 * mlock_args(const void *addr, size_t len)
 */
int
mlock(struct mlock_args *uap)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;
	struct proc *p = curproc;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

	if (atop(size) + vmstats.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (ENOMEM);
#else
	error = suser_cred(p->p_ucred, 0);
	if (error)
		return (error);
#endif

	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, FALSE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * mlockall_args(int how)
 */
int
mlockall(struct mlockall_args *uap)
{
	/* Not yet implemented */
	return 0;
}

/*
 * munlockall_args(void)
 */
int
munlockall(struct munlockall_args *uap)
{
	/* Not yet implemented */
	return 0;
}

/*
 * munlock_args(const void *addr, size_t len)
 */
int
munlock(struct munlock_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

#ifndef pmap_wired_count
	error = suser(td);
	if (error)
		return (error);
#endif

	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, TRUE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
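
/*
 * Kernel-internal usage sketch for vm_mmap() below (illustrative only,
 * not part of the original source; "vms" stands in for a hypothetical
 * struct vmspace pointer).  Mapping anonymous zero-fill memory:
 *
 *	vm_offset_t addr = 0;
 *	error = vm_mmap(&vms->vm_map, &addr, round_page(len),
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_ANON, NULL, 0);
 */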

/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and sys5 shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	struct vnode *vp = NULL;
	objtype_t type;
	int rv = KERN_SUCCESS;
	vm_ooffset_t objsize;
	int docow;
	struct thread *td = curthread;	/* XXX */
	struct proc *p = td->td_proc;

	KKASSERT(p);

	if (size == 0)
		return (0);

	objsize = size = round_page(size);

	if (p->p_vmspace->vm_map.size + size >
	    p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
		return (ENOMEM);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
		(void) vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		type = OBJT_DEFAULT;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		vp = (struct vnode *) handle;
		if (vp->v_type == VCHR) {
			type = OBJT_DEVICE;
			handle = (void *)(intptr_t)vp->v_rdev;
		} else {
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat, td);
			if (error)
				return (error);
			objsize = round_page(vat.va_size);
			type = OBJT_VNODE;
			/*
			 * if it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
	}

	if (handle == NULL) {
		object = NULL;
		docow = 0;
	} else {
		object = vm_pager_allocate(type,
		    handle, objsize, prot, foff);
		if (object == NULL)
			return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
		docow = MAP_PREFAULT_PARTIAL;
	}

	/*
	 * Force device mappings to be shared.
	 */
	if (type == OBJT_DEVICE || type == OBJT_PHYS) {
		flags &= ~(MAP_PRIVATE|MAP_COPY);
		flags |= MAP_SHARED;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif
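
	/*
	 * Note on the block above (editorial, not part of the original
	 * source): on platforms defining VM_PROT_READ_IS_EXEC, such as
	 * i386, the MMU cannot grant read access without also granting
	 * execute, so both the requested and maximum protections are
	 * widened to keep the pmap and vm_map bookkeeping consistent.
	 */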

	if (fitit) {
		*addr = pmap_addr_hint(object, *addr, size);
	}

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot,
		    maxprot, docow);
	else
		rv = vm_map_find(map, object, foff, addr, size, fitit,
		    prot, maxprot, docow);

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
		goto out;
	}

	/*
	 * Shared memory is also shared with children.
	 */
	if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS) {
			(void) vm_map_remove(map, *addr, *addr + size);
			goto out;
		}
	}
out:
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}