1 /* 2 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $DragonFly: src/sys/kern/kern_fp.c,v 1.5 2004/03/01 06:33:17 dillon Exp $ 27 */ 28 29 /* 30 * Direct file pointer API functions for in-kernel operations on files. These 31 * functions provide a open/read/write/close like interface within the kernel 32 * for operating on files that are not necessarily associated with processes 33 * and which do not (typically) have descriptors. 34 * 35 * FUTURE: file handle conversion routines to support checkpointing, 36 * and additional file operations (ioctl, fcntl). 
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/conf.h>
#include <sys/filedesc.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/event.h>
#include <sys/mman.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>		/* NOTE(review): duplicate of <vm/vm_page.h> above */
#include <vm/vm_kern.h>

#include <sys/file2.h>
#include <machine/limits.h>

/* Convenience alias: these APIs traffic in bare struct file pointers. */
typedef struct file *file_t;

/*
 * fp_open:
 *
 *	Open a file as specified.  Use O_* flags for flags.
 *
 *	On success *fpp holds a referenced file pointer whose underlying
 *	vnode is returned unlocked.  On failure *fpp is set to NULL and
 *	an errno is returned.
 *
 *	NOTE! O_ROOTCRED not quite working yet, vn_open() asserts that the
 *	cred must match the process's cred.
 */
int
fp_open(const char *path, int flags, int mode, file_t *fpp)
{
	struct nameidata nd;
	struct thread *td;
	struct file *fp;
	int error;

	/* Allocate the file pointer first; fp_open owns it until success. */
	if ((error = falloc(NULL, fpp, NULL)) != 0)
		return (error);
	fp = *fpp;
	td = curthread;
	/*
	 * Unless the caller asked for root credentials, charge the file
	 * to the current process's credentials (if there is a process).
	 */
	if ((flags & O_ROOTCRED) == 0 && td->td_proc)
		fsetcred(fp, td->td_proc->p_ucred);

	/* 'path' is a kernel string, hence UIO_SYSSPACE for the lookup. */
	NDINIT(&nd, NAMEI_LOOKUP, 0, UIO_SYSSPACE, path, td);
	/* Convert O_* open flags to kernel F* flags. */
	flags = FFLAGS(flags);
	if ((error = vn_open(&nd, flags, mode)) == 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		/* Wire the opened vnode into the file pointer. */
		fp->f_data = (caddr_t)nd.ni_vp;
		fp->f_flag = flags;
		fp->f_ops = &vnops;
		fp->f_type = DTYPE_VNODE;
		/* vn_open() returned the vnode locked; callers want it unlocked. */
		VOP_UNLOCK(nd.ni_vp, NULL, 0, td);
	} else {
		/* Open failed: drop our only reference and report NULL. */
		fdrop(fp, td);
		*fpp = NULL;
	}
	return(error);
}


/*
 * fp_vpopen():	open a file pointer given a vnode.  The vnode must be locked.
 * The vnode will be returned unlocked whether an error occurs or not.
 *
 * Performs the vn_open()-equivalent type/permission checks by hand and,
 * on success, stores a referenced file pointer in *fpp and bumps
 * v_writecount for FWRITE opens.
 */
int
fp_vpopen(struct vnode *vp, int flags, file_t *fpp)
{
	struct thread *td;
	struct file *fp;
	int vmode;
	int error;

	*fpp = NULL;
	td = curthread;

	/*
	 * Vnode checks (from vn_open())
	 */
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto done;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto done;
	}
	/* Convert O_* open flags to kernel F* flags. */
	flags = FFLAGS(flags);
	vmode = 0;
	if (flags & (FWRITE | O_TRUNC)) {
		/* Directories may not be opened for writing. */
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto done;
		}
		/* Reject writes to read-only filesystems / text vnodes. */
		error = vn_writechk(vp);
		if (error)
			goto done;
		vmode |= VWRITE;
	}
	if (flags & FREAD)
		vmode |= VREAD;
	/* Check access against the current process's credentials. */
	if (vmode) {
		error = VOP_ACCESS(vp, vmode, td->td_proc->p_ucred, td);
		if (error)
			goto done;
	}
	error = VOP_OPEN(vp, flags, td->td_proc->p_ucred, td);
	if (error)
		goto done;
	/*
	 * Make sure that a VM object is created for VMIO support.
	 *
	 * NOTE(review): error paths from here on return without a matching
	 * VOP_CLOSE() even though VOP_OPEN() succeeded above — looks like a
	 * potential open-count leak; confirm against vn_open()'s unwind.
	 */
	if (vn_canvmio(vp) == TRUE) {
		if ((error = vfs_object_create(vp, td)) != 0)
			goto done;
	}

	/*
	 * File pointer setup
	 */
	if ((error = falloc(NULL, fpp, NULL)) != 0)
		goto done;
	fp = *fpp;
	/* Same credential policy as fp_open(). */
	if ((flags & O_ROOTCRED) == 0 && td->td_proc)
		fsetcred(fp, td->td_proc->p_ucred);
	fp->f_data = (caddr_t)vp;
	fp->f_flag = flags;
	fp->f_ops = &vnops;
	fp->f_type = DTYPE_VNODE;

	/*
	 * All done, set return value and update v_writecount now that no more
	 * errors can occur.
	 */
	*fpp = fp;
	if (flags & FWRITE)
		vp->v_writecount++;
done:
	/* Caller passed the vnode in locked; always return it unlocked. */
	VOP_UNLOCK(vp, NULL, 0, td);
	return (error);
}

/*
 * fp_*read() is meant to operate like the normal descriptor based syscalls
 * would.  Note that if 'buf' points to user memory a UIO_USERSPACE
 * transfer will be used.
202 */ 203 int 204 fp_pread(file_t fp, void *buf, size_t nbytes, off_t offset, ssize_t *res) 205 { 206 struct uio auio; 207 struct iovec aiov; 208 size_t count; 209 int error; 210 211 if (res) 212 *res = 0; 213 if (nbytes > INT_MAX) 214 return (EINVAL); 215 bzero(&auio, sizeof(auio)); 216 aiov.iov_base = (caddr_t)buf; 217 aiov.iov_len = nbytes; 218 auio.uio_iov = &aiov; 219 auio.uio_iovcnt = 1; 220 auio.uio_offset = offset; 221 auio.uio_resid = nbytes; 222 auio.uio_rw = UIO_READ; 223 if ((vm_offset_t)buf < VM_MAXUSER_ADDRESS) 224 auio.uio_segflg = UIO_USERSPACE; 225 else 226 auio.uio_segflg = UIO_SYSSPACE; 227 auio.uio_td = curthread; 228 229 count = nbytes; 230 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, auio.uio_td); 231 if (error) { 232 if (auio.uio_resid != nbytes && (error == ERESTART || error == EINTR || 233 error == EWOULDBLOCK) 234 ) { 235 error = 0; 236 } 237 } 238 count -= auio.uio_resid; 239 if (res) 240 *res = count; 241 return(error); 242 } 243 244 int 245 fp_read(file_t fp, void *buf, size_t nbytes, ssize_t *res) 246 { 247 struct uio auio; 248 struct iovec aiov; 249 size_t count; 250 int error; 251 252 if (res) 253 *res = 0; 254 if (nbytes > INT_MAX) 255 return (EINVAL); 256 bzero(&auio, sizeof(auio)); 257 aiov.iov_base = (caddr_t)buf; 258 aiov.iov_len = nbytes; 259 auio.uio_iov = &aiov; 260 auio.uio_iovcnt = 1; 261 auio.uio_offset = 0; 262 auio.uio_resid = nbytes; 263 auio.uio_rw = UIO_READ; 264 if ((vm_offset_t)buf < VM_MAXUSER_ADDRESS) 265 auio.uio_segflg = UIO_USERSPACE; 266 else 267 auio.uio_segflg = UIO_SYSSPACE; 268 auio.uio_td = curthread; 269 270 count = nbytes; 271 error = fo_read(fp, &auio, fp->f_cred, 0, auio.uio_td); 272 if (error) { 273 if (auio.uio_resid != nbytes && (error == ERESTART || error == EINTR || 274 error == EWOULDBLOCK) 275 ) { 276 error = 0; 277 } 278 } 279 count -= auio.uio_resid; 280 if (res) 281 *res = count; 282 return(error); 283 } 284 285 int 286 fp_pwrite(file_t fp, void *buf, size_t nbytes, off_t offset, 
ssize_t *res) 287 { 288 struct uio auio; 289 struct iovec aiov; 290 size_t count; 291 int error; 292 293 if (res) 294 *res = 0; 295 if (nbytes > INT_MAX) 296 return (EINVAL); 297 bzero(&auio, sizeof(auio)); 298 aiov.iov_base = (caddr_t)buf; 299 aiov.iov_len = nbytes; 300 auio.uio_iov = &aiov; 301 auio.uio_iovcnt = 1; 302 auio.uio_offset = offset; 303 auio.uio_resid = nbytes; 304 auio.uio_rw = UIO_WRITE; 305 if ((vm_offset_t)buf < VM_MAXUSER_ADDRESS) 306 auio.uio_segflg = UIO_USERSPACE; 307 else 308 auio.uio_segflg = UIO_SYSSPACE; 309 auio.uio_td = curthread; 310 311 count = nbytes; 312 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, auio.uio_td); 313 if (error) { 314 if (auio.uio_resid != nbytes && (error == ERESTART || error == EINTR || 315 error == EWOULDBLOCK) 316 ) { 317 error = 0; 318 } 319 } 320 count -= auio.uio_resid; 321 if (res) 322 *res = count; 323 return(error); 324 } 325 326 327 int 328 fp_write(file_t fp, void *buf, size_t nbytes, ssize_t *res) 329 { 330 struct uio auio; 331 struct iovec aiov; 332 size_t count; 333 int error; 334 335 if (res) 336 *res = 0; 337 if (nbytes > INT_MAX) 338 return (EINVAL); 339 bzero(&auio, sizeof(auio)); 340 aiov.iov_base = (caddr_t)buf; 341 aiov.iov_len = nbytes; 342 auio.uio_iov = &aiov; 343 auio.uio_iovcnt = 1; 344 auio.uio_offset = 0; 345 auio.uio_resid = nbytes; 346 auio.uio_rw = UIO_WRITE; 347 if ((vm_offset_t)buf < VM_MAXUSER_ADDRESS) 348 auio.uio_segflg = UIO_USERSPACE; 349 else 350 auio.uio_segflg = UIO_SYSSPACE; 351 auio.uio_td = curthread; 352 353 count = nbytes; 354 error = fo_write(fp, &auio, fp->f_cred, 0, auio.uio_td); 355 if (error) { 356 if (auio.uio_resid != nbytes && (error == ERESTART || error == EINTR || 357 error == EWOULDBLOCK) 358 ) { 359 error = 0; 360 } 361 } 362 count -= auio.uio_resid; 363 if (res) 364 *res = count; 365 return(error); 366 } 367 368 int 369 fp_stat(file_t fp, struct stat *ub) 370 { 371 int error; 372 373 error = fo_stat(fp, ub, curthread); 374 return(error); 375 } 376 377 
/*
 * non-anonymous, non-stack descriptor mappings only!
 *
 * This routine mostly snarfed from vm/vm_mmap.c
 *
 * Maps the file underlying 'fp' (which must be a DTYPE_VNODE backed by a
 * VREG or VCHR vnode) into the current process's address space.  On
 * success the chosen address is stored through *resp.
 *
 * NOTE(review): the result is stored only when 'addr_arg' is non-NULL
 * ("if (error == 0 && addr_arg)") — a caller passing addr_arg == NULL with
 * a valid 'resp' never gets the address back.  Looks like the test was
 * meant to be on 'resp'; confirm against callers before changing.
 */
int
fp_mmap(void *addr_arg, size_t size, int prot, int flags, struct file *fp,
    off_t pos, void **resp)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_size_t pageoff;
	vm_prot_t maxprot;
	vm_offset_t addr;
	void *handle;
	int error;
	vm_object_t obj;
	struct vmspace *vms = p->p_vmspace;
	struct vnode *vp;
	int disablexworkaround;

	prot &= VM_PROT_ALL;

	/* Anonymous mappings are explicitly not supported by this API. */
	if ((ssize_t)size < 0 || (flags & MAP_ANON))
		return(EINVAL);

	/* Split 'pos' into a page-aligned offset plus an in-page offset. */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t)round_page(size);	/* hi end */
	addr = (vm_offset_t)addr_arg;

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
			return (EINVAL);
#ifndef i386
		if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
			return (EINVAL);
#endif
		/* Overflow check: range must not wrap the address space. */
		if (addr + size < addr)
			return (EINVAL);
	} else if (addr == 0 ||
	    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
	     addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))
	) {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
	}

	/*
	 * Mapping file, get fp for validation.  Obtain vnode and make
	 * sure it is of appropriate type.
	 */
	if (fp->f_type != DTYPE_VNODE)
		return (EINVAL);

	/*
	 * POSIX shared-memory objects are defined to have
	 * kernel persistence, and are not defined to support
	 * read(2)/write(2) -- or even open(2).  Thus, we can
	 * use MAP_ASYNC to trade on-disk coherence for speed.
	 * The shm_open(3) library routine turns on the FPOSIXSHM
	 * flag to request this behavior.
	 */
	if (fp->f_flag & FPOSIXSHM)
		flags |= MAP_NOSYNC;
	vp = (struct vnode *) fp->f_data;
	if (vp->v_type != VREG && vp->v_type != VCHR)
		return (EINVAL);

	/*
	 * Get the proper underlying object
	 */
	if (vp->v_type == VREG) {
		if (VOP_GETVOBJECT(vp, &obj) != 0)
			return (EINVAL);
		/* Map through the object's backing vnode (may differ). */
		vp = (struct vnode*)obj->handle;
	}

	/*
	 * XXX hack to handle use of /dev/zero to map anon memory (ala
	 * SunOS).
	 */
	if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
		handle = NULL;
		maxprot = VM_PROT_ALL;
		flags |= MAP_ANON;
		pos = 0;
	} else {
		/*
		 * cdevs does not provide private mappings of any kind.
		 */
		/*
		 * However, for XIG X server to continue to work,
		 * we should allow the superuser to do it anyway.
		 * We only allow it at securelevel < 1.
		 * (Because the XIG X server writes directly to video
		 * memory via /dev/mem, it should never work at any
		 * other securelevel.
		 * XXX this will have to go
		 */
		if (securelevel >= 1)
			disablexworkaround = 1;
		else
			disablexworkaround = suser(td);
		if (vp->v_type == VCHR && disablexworkaround &&
		    (flags & (MAP_PRIVATE|MAP_COPY))) {
			error = EINVAL;
			goto done;
		}
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination? What if
		 * proc does a setuid?
		 */
		maxprot = VM_PROT_EXECUTE;	/* ??? */
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.  Check for superuser, only if
		 * we're at securelevel < 1, to allow the XIG X server
		 * to continue to work.
		 */

		if ((flags & MAP_SHARED) != 0 ||
		    (vp->v_type == VCHR && disablexworkaround)
		) {
			if ((fp->f_flag & FWRITE) != 0) {
				struct vattr va;
				/* Immutable/append-only files deny write. */
				if ((error = VOP_GETATTR(vp, &va, td))) {
					goto done;
				}
				if ((va.va_flags & (IMMUTABLE|APPEND)) == 0) {
					maxprot |= VM_PROT_WRITE;
				} else if (prot & PROT_WRITE) {
					error = EPERM;
					goto done;
				}
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else {
			/* Private mapping: COW makes writes always safe. */
			maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
	}
	error = vm_mmap(&vms->vm_map, &addr, size, prot,
			maxprot, flags, handle, pos);
	if (error == 0 && addr_arg)
		*resp = (void *)addr;
done:
	return (error);
}

/*
 * fp_close:	release the caller's reference on the file pointer,
 *		destroying it on last drop (close()-like semantics).
 */
int
fp_close(file_t fp)
{
	return(fdrop(fp, curthread));
}
