1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 39 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $ 40 * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.58 2008/06/28 17:59:49 dillon Exp $ 41 */ 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/fcntl.h> 46 #include <sys/file.h> 47 #include <sys/stat.h> 48 #include <sys/proc.h> 49 #include <sys/priv.h> 50 #include <sys/mount.h> 51 #include <sys/nlookup.h> 52 #include <sys/vnode.h> 53 #include <sys/buf.h> 54 #include <sys/filio.h> 55 #include <sys/ttycom.h> 56 #include <sys/conf.h> 57 #include <sys/sysctl.h> 58 #include <sys/syslog.h> 59 60 #include <sys/thread2.h> 61 62 static int vn_closefile (struct file *fp); 63 static int vn_ioctl (struct file *fp, u_long com, caddr_t data, 64 struct ucred *cred); 65 static int vn_read (struct file *fp, struct uio *uio, 66 struct ucred *cred, int flags); 67 static int svn_read (struct file *fp, struct uio *uio, 68 struct ucred *cred, int flags); 69 static int vn_poll (struct file *fp, int events, struct ucred *cred); 70 static int vn_kqfilter (struct file *fp, struct knote *kn); 71 static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred); 72 static int vn_write (struct file *fp, struct uio *uio, 73 struct ucred *cred, int flags); 74 static int svn_write (struct file *fp, struct uio *uio, 75 struct ucred *cred, int flags); 76 77 #ifdef SMP 78 static int read_mpsafe = 0; 79 SYSCTL_INT(_vfs, OID_AUTO, read_mpsafe, CTLFLAG_RW, &read_mpsafe, 0, ""); 80 static int write_mpsafe = 0; 81 SYSCTL_INT(_vfs, OID_AUTO, write_mpsafe, CTLFLAG_RW, &write_mpsafe, 0, ""); 82 static int getattr_mpsafe = 0; 83 SYSCTL_INT(_vfs, OID_AUTO, getattr_mpsafe, CTLFLAG_RW, &getattr_mpsafe, 0, ""); 84 #else 85 #define read_mpsafe 0 86 #define write_mpsafe 0 87 #define getattr_mpsafe 0 88 #endif 89 90 struct fileops vnode_fileops = { 91 .fo_read = vn_read, 92 .fo_write = vn_write, 93 .fo_ioctl = vn_ioctl, 94 .fo_poll = vn_poll, 95 .fo_kqfilter = vn_kqfilter, 96 .fo_stat = vn_statfile, 97 .fo_close = vn_closefile, 98 .fo_shutdown = nofo_shutdown 99 }; 100 101 struct fileops specvnode_fileops = { 102 .fo_read = svn_read, 103 .fo_write = svn_write, 104 .fo_ioctl = vn_ioctl, 105 .fo_poll = vn_poll, 106 .fo_kqfilter = vn_kqfilter, 107 .fo_stat = vn_statfile, 108 .fo_close = vn_closefile, 109 .fo_shutdown = nofo_shutdown 110 }; 111 112 /* 113 * Shortcut the device read/write. This avoids a lot of vnode junk. 114 * Basically the specfs vnops for read and write take the locked vnode, 115 * unlock it (because we can't hold the vnode locked while reading or writing 116 * a device which may block indefinitely), issues the device operation, then 117 * relock the vnode before returning, plus other junk. This bypasses all 118 * of that and just does the device operation. 119 */ 120 void 121 vn_setspecops(struct file *fp) 122 { 123 if (vfs_fastdev && fp->f_ops == &vnode_fileops) { 124 fp->f_ops = &specvnode_fileops; 125 } 126 } 127 128 /* 129 * Common code for vnode open operations. Check permissions, and call 130 * the VOP_NOPEN or VOP_NCREATE routine. 131 * 132 * The caller is responsible for setting up nd with nlookup_init() and 133 * for cleaning it up with nlookup_done(), whether we return an error 134 * or not. 135 * 136 * On success nd->nl_open_vp will hold a referenced and, if requested, 137 * locked vnode. A locked vnode is requested via NLC_LOCKVP. If fp 138 * is non-NULL the vnode will be installed in the file pointer. 139 * 140 * NOTE: The vnode is referenced just once on return whether or not it 141 * is also installed in the file pointer. 142 */ 143 int 144 vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode) 145 { 146 struct vnode *vp; 147 struct ucred *cred = nd->nl_cred; 148 struct vattr vat; 149 struct vattr *vap = &vat; 150 int error; 151 152 /* 153 * Certain combinations are illegal 154 */ 155 if ((fmode & (FWRITE | O_TRUNC)) == O_TRUNC) 156 return(EACCES); 157 158 /* 159 * Lookup the path and create or obtain the vnode. After a 160 * successful lookup a locked nd->nl_nch will be returned. 161 * 162 * The result of this section should be a locked vnode. 163 * 164 * XXX with only a little work we should be able to avoid locking 165 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set. 166 */ 167 nd->nl_flags |= NLC_OPEN; 168 if (fmode & O_APPEND) 169 nd->nl_flags |= NLC_APPEND; 170 if (fmode & O_TRUNC) 171 nd->nl_flags |= NLC_TRUNCATE; 172 if (fmode & FREAD) 173 nd->nl_flags |= NLC_READ; 174 if (fmode & FWRITE) 175 nd->nl_flags |= NLC_WRITE; 176 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) 177 nd->nl_flags |= NLC_FOLLOW; 178 179 if (fmode & O_CREAT) { 180 /* 181 * CONDITIONAL CREATE FILE CASE 182 * 183 * Setting NLC_CREATE causes a negative hit to store 184 * the negative hit ncp and not return an error. Then 185 * nc_error or nc_vp may be checked to see if the ncp 186 * represents a negative hit. NLC_CREATE also requires 187 * write permission on the governing directory or EPERM 188 * is returned. 189 */ 190 nd->nl_flags |= NLC_CREATE; 191 nd->nl_flags |= NLC_REFDVP; 192 bwillinode(1); 193 error = nlookup(nd); 194 } else { 195 /* 196 * NORMAL OPEN FILE CASE 197 */ 198 error = nlookup(nd); 199 } 200 201 if (error) 202 return (error); 203 204 /* 205 * split case to allow us to re-resolve and retry the ncp in case 206 * we get ESTALE. 207 */ 208 again: 209 if (fmode & O_CREAT) { 210 if (nd->nl_nch.ncp->nc_vp == NULL) { 211 if ((error = ncp_writechk(&nd->nl_nch)) != 0) 212 return (error); 213 VATTR_NULL(vap); 214 vap->va_type = VREG; 215 vap->va_mode = cmode; 216 if (fmode & O_EXCL) 217 vap->va_vaflags |= VA_EXCLUSIVE; 218 error = VOP_NCREATE(&nd->nl_nch, nd->nl_dvp, &vp, 219 nd->nl_cred, vap); 220 if (error) 221 return (error); 222 fmode &= ~O_TRUNC; 223 /* locked vnode is returned */ 224 } else { 225 if (fmode & O_EXCL) { 226 error = EEXIST; 227 } else { 228 error = cache_vget(&nd->nl_nch, cred, 229 LK_EXCLUSIVE, &vp); 230 } 231 if (error) 232 return (error); 233 fmode &= ~O_CREAT; 234 } 235 } else { 236 error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp); 237 if (error) 238 return (error); 239 } 240 241 /* 242 * We have a locked vnode and ncp now. Note that the ncp will 243 * be cleaned up by the caller if nd->nl_nch is left intact. 244 */ 245 if (vp->v_type == VLNK) { 246 error = EMLINK; 247 goto bad; 248 } 249 if (vp->v_type == VSOCK) { 250 error = EOPNOTSUPP; 251 goto bad; 252 } 253 if ((fmode & O_CREAT) == 0) { 254 if (fmode & (FWRITE | O_TRUNC)) { 255 if (vp->v_type == VDIR) { 256 error = EISDIR; 257 goto bad; 258 } 259 error = vn_writechk(vp, &nd->nl_nch); 260 if (error) { 261 /* 262 * Special stale handling, re-resolve the 263 * vnode. 264 */ 265 if (error == ESTALE) { 266 vput(vp); 267 vp = NULL; 268 cache_setunresolved(&nd->nl_nch); 269 error = cache_resolve(&nd->nl_nch, cred); 270 if (error == 0) 271 goto again; 272 } 273 goto bad; 274 } 275 } 276 } 277 if (fmode & O_TRUNC) { 278 vn_unlock(vp); /* XXX */ 279 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */ 280 VATTR_NULL(vap); 281 vap->va_size = 0; 282 error = VOP_SETATTR(vp, vap, cred); 283 if (error) 284 goto bad; 285 } 286 287 /* 288 * Setup the fp so VOP_OPEN can override it. No descriptor has been 289 * associated with the fp yet so we own it clean. 290 * 291 * f_nchandle inherits nl_nch. This used to be necessary only for 292 * directories but now we do it unconditionally so f*() ops 293 * such as fchmod() can access the actual namespace that was 294 * used to open the file. 295 */ 296 if (fp) { 297 if (nd->nl_flags & NLC_APPENDONLY) 298 fmode |= FAPPENDONLY; 299 fp->f_nchandle = nd->nl_nch; 300 cache_zero(&nd->nl_nch); 301 cache_unlock(&fp->f_nchandle); 302 } 303 304 /* 305 * Get rid of nl_nch. vn_open does not return it (it returns the 306 * vnode or the file pointer). Note: we can't leave nl_nch locked 307 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g. 308 * on /dev/ttyd0 309 */ 310 if (nd->nl_nch.ncp) 311 cache_put(&nd->nl_nch); 312 313 error = VOP_OPEN(vp, fmode, cred, fp); 314 if (error) { 315 /* 316 * setting f_ops to &badfileops will prevent the descriptor 317 * code from trying to close and release the vnode, since 318 * the open failed we do not want to call close. 319 */ 320 if (fp) { 321 fp->f_data = NULL; 322 fp->f_ops = &badfileops; 323 } 324 goto bad; 325 } 326 327 #if 0 328 /* 329 * Assert that VREG files have been setup for vmio. 330 */ 331 KASSERT(vp->v_type != VREG || vp->v_object != NULL, 332 ("vn_open: regular file was not VMIO enabled!")); 333 #endif 334 335 /* 336 * Return the vnode. XXX needs some cleaning up. The vnode is 337 * only returned in the fp == NULL case. 338 */ 339 if (fp == NULL) { 340 nd->nl_open_vp = vp; 341 nd->nl_vp_fmode = fmode; 342 if ((nd->nl_flags & NLC_LOCKVP) == 0) 343 vn_unlock(vp); 344 } else { 345 vput(vp); 346 } 347 return (0); 348 bad: 349 if (vp) 350 vput(vp); 351 return (error); 352 } 353 354 int 355 vn_opendisk(const char *devname, int fmode, struct vnode **vpp) 356 { 357 struct vnode *vp; 358 int error; 359 360 if (strncmp(devname, "/dev/", 5) == 0) 361 devname += 5; 362 if ((vp = getsynthvnode(devname)) == NULL) { 363 error = ENODEV; 364 } else { 365 error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL); 366 vn_unlock(vp); 367 if (error) { 368 vrele(vp); 369 vp = NULL; 370 } 371 } 372 *vpp = vp; 373 return (error); 374 } 375 376 /* 377 * Check for write permissions on the specified vnode. nch may be NULL. 378 */ 379 int 380 vn_writechk(struct vnode *vp, struct nchandle *nch) 381 { 382 /* 383 * If there's shared text associated with 384 * the vnode, try to free it up once. If 385 * we fail, we can't allow writing. 386 */ 387 if (vp->v_flag & VTEXT) 388 return (ETXTBSY); 389 390 /* 391 * If the vnode represents a regular file, check the mount 392 * point via the nch. This may be a different mount point 393 * then the one embedded in the vnode (e.g. nullfs). 394 * 395 * We can still write to non-regular files (e.g. devices) 396 * via read-only mounts. 397 */ 398 if (nch && nch->ncp && vp->v_type == VREG) 399 return (ncp_writechk(nch)); 400 return (0); 401 } 402 403 /* 404 * Check whether the underlying mount is read-only. The mount point 405 * referenced by the namecache may be different from the mount point 406 * used by the underlying vnode in the case of NULLFS, so a separate 407 * check is needed. 408 */ 409 int 410 ncp_writechk(struct nchandle *nch) 411 { 412 if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY)) 413 return (EROFS); 414 return(0); 415 } 416 417 /* 418 * Vnode close call 419 */ 420 int 421 vn_close(struct vnode *vp, int flags) 422 { 423 int error; 424 425 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 426 if (error == 0) { 427 error = VOP_CLOSE(vp, flags); 428 vn_unlock(vp); 429 } 430 vrele(vp); 431 return (error); 432 } 433 434 static __inline 435 int 436 sequential_heuristic(struct uio *uio, struct file *fp) 437 { 438 /* 439 * Sequential heuristic - detect sequential operation 440 * 441 * NOTE: SMP: We allow f_seqcount updates to race. 442 */ 443 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || 444 uio->uio_offset == fp->f_nextoff) { 445 int tmpseq = fp->f_seqcount; 446 /* 447 * XXX we assume that the filesystem block size is 448 * the default. Not true, but still gives us a pretty 449 * good indicator of how sequential the read operations 450 * are. 451 */ 452 tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; 453 if (tmpseq > IO_SEQMAX) 454 tmpseq = IO_SEQMAX; 455 fp->f_seqcount = tmpseq; 456 return(fp->f_seqcount << IO_SEQSHIFT); 457 } 458 459 /* 460 * Not sequential, quick draw-down of seqcount 461 * 462 * NOTE: SMP: We allow f_seqcount updates to race. 463 */ 464 if (fp->f_seqcount > 1) 465 fp->f_seqcount = 1; 466 else 467 fp->f_seqcount = 0; 468 return(0); 469 } 470 471 /* 472 * get - lock and return the f_offset field. 473 * set - set and unlock the f_offset field. 474 * 475 * These routines serve the dual purpose of serializing access to the 476 * f_offset field (at least on i386) and guaranteeing operational integrity 477 * when multiple read()ers and write()ers are present on the same fp. 478 */ 479 static __inline off_t 480 vn_get_fpf_offset(struct file *fp) 481 { 482 u_int flags; 483 u_int nflags; 484 485 /* 486 * Shortcut critical path. 487 */ 488 flags = fp->f_flag & ~FOFFSETLOCK; 489 if (atomic_cmpset_int(&fp->f_flag, flags, flags | FOFFSETLOCK)) 490 return(fp->f_offset); 491 492 /* 493 * The hard way 494 */ 495 for (;;) { 496 flags = fp->f_flag; 497 if (flags & FOFFSETLOCK) { 498 nflags = flags | FOFFSETWAKE; 499 tsleep_interlock(&fp->f_flag, 0); 500 if (atomic_cmpset_int(&fp->f_flag, flags, nflags)) 501 tsleep(&fp->f_flag, PINTERLOCKED, "fpoff", 0); 502 } else { 503 nflags = flags | FOFFSETLOCK; 504 if (atomic_cmpset_int(&fp->f_flag, flags, nflags)) 505 break; 506 } 507 } 508 return(fp->f_offset); 509 } 510 511 static __inline void 512 vn_set_fpf_offset(struct file *fp, off_t offset) 513 { 514 u_int flags; 515 u_int nflags; 516 517 /* 518 * We hold the lock so we can set the offset without interference. 519 */ 520 fp->f_offset = offset; 521 522 /* 523 * Normal release is already a reasonably critical path. 524 */ 525 for (;;) { 526 flags = fp->f_flag; 527 nflags = flags & ~(FOFFSETLOCK | FOFFSETWAKE); 528 if (atomic_cmpset_int(&fp->f_flag, flags, nflags)) { 529 if (flags & FOFFSETWAKE) 530 wakeup(&fp->f_flag); 531 break; 532 } 533 } 534 } 535 536 static __inline off_t 537 vn_poll_fpf_offset(struct file *fp) 538 { 539 #if defined(__amd64__) || !defined(SMP) 540 return(fp->f_offset); 541 #else 542 off_t off = vn_get_fpf_offset(fp); 543 vn_set_fpf_offset(fp, off); 544 return(off); 545 #endif 546 } 547 548 /* 549 * Package up an I/O request on a vnode into a uio and do it. 550 */ 551 int 552 vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len, 553 off_t offset, enum uio_seg segflg, int ioflg, 554 struct ucred *cred, int *aresid) 555 { 556 struct uio auio; 557 struct iovec aiov; 558 struct ccms_lock ccms_lock; 559 int error; 560 561 if ((ioflg & IO_NODELOCKED) == 0) 562 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 563 auio.uio_iov = &aiov; 564 auio.uio_iovcnt = 1; 565 aiov.iov_base = base; 566 aiov.iov_len = len; 567 auio.uio_resid = len; 568 auio.uio_offset = offset; 569 auio.uio_segflg = segflg; 570 auio.uio_rw = rw; 571 auio.uio_td = curthread; 572 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, &auio); 573 if (rw == UIO_READ) { 574 error = VOP_READ(vp, &auio, ioflg, cred); 575 } else { 576 error = VOP_WRITE(vp, &auio, ioflg, cred); 577 } 578 ccms_lock_put(&vp->v_ccms, &ccms_lock); 579 if (aresid) 580 *aresid = auio.uio_resid; 581 else 582 if (auio.uio_resid && error == 0) 583 error = EIO; 584 if ((ioflg & IO_NODELOCKED) == 0) 585 vn_unlock(vp); 586 return (error); 587 } 588 589 /* 590 * Package up an I/O request on a vnode into a uio and do it. The I/O 591 * request is split up into smaller chunks and we try to avoid saturating 592 * the buffer cache while potentially holding a vnode locked, so we 593 * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() 594 * to give other processes a chance to lock the vnode (either other processes 595 * core'ing the same binary, or unrelated processes scanning the directory). 596 */ 597 int 598 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len, 599 off_t offset, enum uio_seg segflg, int ioflg, 600 struct ucred *cred, int *aresid) 601 { 602 int error = 0; 603 604 do { 605 int chunk; 606 607 /* 608 * Force `offset' to a multiple of MAXBSIZE except possibly 609 * for the first chunk, so that filesystems only need to 610 * write full blocks except possibly for the first and last 611 * chunks. 612 */ 613 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; 614 615 if (chunk > len) 616 chunk = len; 617 if (vp->v_type == VREG) { 618 switch(rw) { 619 case UIO_READ: 620 bwillread(chunk); 621 break; 622 case UIO_WRITE: 623 bwillwrite(chunk); 624 break; 625 } 626 } 627 error = vn_rdwr(rw, vp, base, chunk, offset, segflg, 628 ioflg, cred, aresid); 629 len -= chunk; /* aresid calc already includes length */ 630 if (error) 631 break; 632 offset += chunk; 633 base += chunk; 634 uio_yield(); 635 } while (len); 636 if (aresid) 637 *aresid += len; 638 return (error); 639 } 640 641 /* 642 * MPALMOSTSAFE - acquires mplock 643 * 644 * File pointers can no longer get ripped up by revoke so 645 * we don't need to lock access to the vp. 646 * 647 * f_offset updates are not guaranteed against multiple readers 648 */ 649 static int 650 vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags) 651 { 652 struct ccms_lock ccms_lock; 653 struct vnode *vp; 654 int error, ioflag; 655 656 KASSERT(uio->uio_td == curthread, 657 ("uio_td %p is not td %p", uio->uio_td, curthread)); 658 vp = (struct vnode *)fp->f_data; 659 660 ioflag = 0; 661 if (flags & O_FBLOCKING) { 662 /* ioflag &= ~IO_NDELAY; */ 663 } else if (flags & O_FNONBLOCKING) { 664 ioflag |= IO_NDELAY; 665 } else if (fp->f_flag & FNONBLOCK) { 666 ioflag |= IO_NDELAY; 667 } 668 if (flags & O_FBUFFERED) { 669 /* ioflag &= ~IO_DIRECT; */ 670 } else if (flags & O_FUNBUFFERED) { 671 ioflag |= IO_DIRECT; 672 } else if (fp->f_flag & O_DIRECT) { 673 ioflag |= IO_DIRECT; 674 } 675 if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0) 676 uio->uio_offset = vn_get_fpf_offset(fp); 677 vn_lock(vp, LK_SHARED | LK_RETRY); 678 ioflag |= sequential_heuristic(uio, fp); 679 680 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio); 681 if (read_mpsafe && (vp->v_flag & VMP_READ)) { 682 error = VOP_READ(vp, uio, ioflag, cred); 683 } else { 684 get_mplock(); 685 error = VOP_READ(vp, uio, ioflag, cred); 686 rel_mplock(); 687 } 688 ccms_lock_put(&vp->v_ccms, &ccms_lock); 689 fp->f_nextoff = uio->uio_offset; 690 vn_unlock(vp); 691 if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0) 692 vn_set_fpf_offset(fp, uio->uio_offset); 693 return (error); 694 } 695 696 /* 697 * Device-optimized file table vnode read routine. 698 * 699 * This bypasses the VOP table and talks directly to the device. Most 700 * filesystems just route to specfs and can make this optimization. 701 * 702 * MPALMOSTSAFE - acquires mplock 703 */ 704 static int 705 svn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags) 706 { 707 struct vnode *vp; 708 int ioflag; 709 int error; 710 cdev_t dev; 711 712 get_mplock(); 713 KASSERT(uio->uio_td == curthread, 714 ("uio_td %p is not td %p", uio->uio_td, curthread)); 715 716 vp = (struct vnode *)fp->f_data; 717 if (vp == NULL || vp->v_type == VBAD) { 718 error = EBADF; 719 goto done; 720 } 721 722 if ((dev = vp->v_rdev) == NULL) { 723 error = EBADF; 724 goto done; 725 } 726 reference_dev(dev); 727 728 if (uio->uio_resid == 0) { 729 error = 0; 730 goto done; 731 } 732 if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0) 733 uio->uio_offset = vn_get_fpf_offset(fp); 734 735 ioflag = 0; 736 if (flags & O_FBLOCKING) { 737 /* ioflag &= ~IO_NDELAY; */ 738 } else if (flags & O_FNONBLOCKING) { 739 ioflag |= IO_NDELAY; 740 } else if (fp->f_flag & FNONBLOCK) { 741 ioflag |= IO_NDELAY; 742 } 743 if (flags & O_FBUFFERED) { 744 /* ioflag &= ~IO_DIRECT; */ 745 } else if (flags & O_FUNBUFFERED) { 746 ioflag |= IO_DIRECT; 747 } else if (fp->f_flag & O_DIRECT) { 748 ioflag |= IO_DIRECT; 749 } 750 ioflag |= sequential_heuristic(uio, fp); 751 752 error = dev_dread(dev, uio, ioflag); 753 754 release_dev(dev); 755 fp->f_nextoff = uio->uio_offset; 756 if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0) 757 vn_set_fpf_offset(fp, uio->uio_offset); 758 done: 759 rel_mplock(); 760 return (error); 761 } 762 763 /* 764 * MPALMOSTSAFE - acquires mplock 765 */ 766 static int 767 vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags) 768 { 769 struct ccms_lock ccms_lock; 770 struct vnode *vp; 771 int error, ioflag; 772 773 KASSERT(uio->uio_td == curthread, 774 ("uio_td %p is not p %p", uio->uio_td, curthread)); 775 vp = (struct vnode *)fp->f_data; 776 777 ioflag = IO_UNIT; 778 if (vp->v_type == VREG && 779 ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) { 780 ioflag |= IO_APPEND; 781 } 782 783 if (flags & O_FBLOCKING) { 784 /* ioflag &= ~IO_NDELAY; */ 785 } else if (flags & O_FNONBLOCKING) { 786 ioflag |= IO_NDELAY; 787 } else if (fp->f_flag & FNONBLOCK) { 788 ioflag |= IO_NDELAY; 789 } 790 if (flags & O_FBUFFERED) { 791 /* ioflag &= ~IO_DIRECT; */ 792 } else if (flags & O_FUNBUFFERED) { 793 ioflag |= IO_DIRECT; 794 } else if (fp->f_flag & O_DIRECT) { 795 ioflag |= IO_DIRECT; 796 } 797 if (flags & O_FASYNCWRITE) { 798 /* ioflag &= ~IO_SYNC; */ 799 } else if (flags & O_FSYNCWRITE) { 800 ioflag |= IO_SYNC; 801 } else if (fp->f_flag & O_FSYNC) { 802 ioflag |= IO_SYNC; 803 } 804 805 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)) 806 ioflag |= IO_SYNC; 807 if ((flags & O_FOFFSET) == 0) 808 uio->uio_offset = vn_get_fpf_offset(fp); 809 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 810 ioflag |= sequential_heuristic(uio, fp); 811 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio); 812 if (write_mpsafe && (vp->v_flag & VMP_WRITE)) { 813 error = VOP_WRITE(vp, uio, ioflag, cred); 814 } else { 815 get_mplock(); 816 error = VOP_WRITE(vp, uio, ioflag, cred); 817 rel_mplock(); 818 } 819 ccms_lock_put(&vp->v_ccms, &ccms_lock); 820 fp->f_nextoff = uio->uio_offset; 821 vn_unlock(vp); 822 if ((flags & O_FOFFSET) == 0) 823 vn_set_fpf_offset(fp, uio->uio_offset); 824 return (error); 825 } 826 827 /* 828 * Device-optimized file table vnode write routine. 829 * 830 * This bypasses the VOP table and talks directly to the device. Most 831 * filesystems just route to specfs and can make this optimization. 832 * 833 * MPALMOSTSAFE - acquires mplock 834 */ 835 static int 836 svn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags) 837 { 838 struct vnode *vp; 839 int ioflag; 840 int error; 841 cdev_t dev; 842 843 get_mplock(); 844 KASSERT(uio->uio_td == curthread, 845 ("uio_td %p is not p %p", uio->uio_td, curthread)); 846 847 vp = (struct vnode *)fp->f_data; 848 if (vp == NULL || vp->v_type == VBAD) { 849 error = EBADF; 850 goto done; 851 } 852 if (vp->v_type == VREG) 853 bwillwrite(uio->uio_resid); 854 vp = (struct vnode *)fp->f_data; /* XXX needed? */ 855 856 if ((dev = vp->v_rdev) == NULL) { 857 error = EBADF; 858 goto done; 859 } 860 reference_dev(dev); 861 862 if ((flags & O_FOFFSET) == 0) 863 uio->uio_offset = vn_get_fpf_offset(fp); 864 865 ioflag = IO_UNIT; 866 if (vp->v_type == VREG && 867 ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) { 868 ioflag |= IO_APPEND; 869 } 870 871 if (flags & O_FBLOCKING) { 872 /* ioflag &= ~IO_NDELAY; */ 873 } else if (flags & O_FNONBLOCKING) { 874 ioflag |= IO_NDELAY; 875 } else if (fp->f_flag & FNONBLOCK) { 876 ioflag |= IO_NDELAY; 877 } 878 if (flags & O_FBUFFERED) { 879 /* ioflag &= ~IO_DIRECT; */ 880 } else if (flags & O_FUNBUFFERED) { 881 ioflag |= IO_DIRECT; 882 } else if (fp->f_flag & O_DIRECT) { 883 ioflag |= IO_DIRECT; 884 } 885 if (flags & O_FASYNCWRITE) { 886 /* ioflag &= ~IO_SYNC; */ 887 } else if (flags & O_FSYNCWRITE) { 888 ioflag |= IO_SYNC; 889 } else if (fp->f_flag & O_FSYNC) { 890 ioflag |= IO_SYNC; 891 } 892 893 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)) 894 ioflag |= IO_SYNC; 895 ioflag |= sequential_heuristic(uio, fp); 896 897 error = dev_dwrite(dev, uio, ioflag); 898 899 release_dev(dev); 900 fp->f_nextoff = uio->uio_offset; 901 if ((flags & O_FOFFSET) == 0) 902 vn_set_fpf_offset(fp, uio->uio_offset); 903 done: 904 rel_mplock(); 905 return (error); 906 } 907 908 /* 909 * MPSAFE 910 */ 911 static int 912 vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred) 913 { 914 struct vnode *vp; 915 int error; 916 917 vp = (struct vnode *)fp->f_data; 918 error = vn_stat(vp, sb, cred); 919 return (error); 920 } 921 922 /* 923 * MPSAFE (if vnode has VMP_GETATTR) 924 */ 925 int 926 vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred) 927 { 928 struct vattr vattr; 929 struct vattr *vap; 930 int error; 931 u_short mode; 932 cdev_t dev; 933 934 vap = &vattr; 935 if (getattr_mpsafe && (vp->v_flag & VMP_GETATTR)) { 936 error = VOP_GETATTR(vp, vap); 937 } else { 938 get_mplock(); 939 error = VOP_GETATTR(vp, vap); 940 rel_mplock(); 941 } 942 if (error) 943 return (error); 944 945 /* 946 * Zero the spare stat fields 947 */ 948 sb->st_lspare = 0; 949 sb->st_qspare = 0; 950 951 /* 952 * Copy from vattr table 953 */ 954 if (vap->va_fsid != VNOVAL) 955 sb->st_dev = vap->va_fsid; 956 else 957 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; 958 sb->st_ino = vap->va_fileid; 959 mode = vap->va_mode; 960 switch (vap->va_type) { 961 case VREG: 962 mode |= S_IFREG; 963 break; 964 case VDATABASE: 965 mode |= S_IFDB; 966 break; 967 case VDIR: 968 mode |= S_IFDIR; 969 break; 970 case VBLK: 971 mode |= S_IFBLK; 972 break; 973 case VCHR: 974 mode |= S_IFCHR; 975 break; 976 case VLNK: 977 mode |= S_IFLNK; 978 /* This is a cosmetic change, symlinks do not have a mode. */ 979 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) 980 sb->st_mode &= ~ACCESSPERMS; /* 0000 */ 981 else 982 sb->st_mode |= ACCESSPERMS; /* 0777 */ 983 break; 984 case VSOCK: 985 mode |= S_IFSOCK; 986 break; 987 case VFIFO: 988 mode |= S_IFIFO; 989 break; 990 default: 991 return (EBADF); 992 } 993 sb->st_mode = mode; 994 if (vap->va_nlink > (nlink_t)-1) 995 sb->st_nlink = (nlink_t)-1; 996 else 997 sb->st_nlink = vap->va_nlink; 998 sb->st_uid = vap->va_uid; 999 sb->st_gid = vap->va_gid; 1000 sb->st_rdev = makeudev(vap->va_rmajor, vap->va_rminor); 1001 sb->st_size = vap->va_size; 1002 sb->st_atimespec = vap->va_atime; 1003 sb->st_mtimespec = vap->va_mtime; 1004 sb->st_ctimespec = vap->va_ctime; 1005 1006 /* 1007 * A VCHR and VBLK device may track the last access and last modified 1008 * time independantly of the filesystem. This is particularly true 1009 * because device read and write calls may bypass the filesystem. 1010 */ 1011 if (vp->v_type == VCHR || vp->v_type == VBLK) { 1012 dev = vp->v_rdev; 1013 if (dev != NULL) { 1014 if (dev->si_lastread) { 1015 sb->st_atimespec.tv_sec = dev->si_lastread; 1016 sb->st_atimespec.tv_nsec = 0; 1017 } 1018 if (dev->si_lastwrite) { 1019 sb->st_atimespec.tv_sec = dev->si_lastwrite; 1020 sb->st_atimespec.tv_nsec = 0; 1021 } 1022 } 1023 } 1024 1025 /* 1026 * According to www.opengroup.org, the meaning of st_blksize is 1027 * "a filesystem-specific preferred I/O block size for this 1028 * object. In some filesystem types, this may vary from file 1029 * to file" 1030 * Default to PAGE_SIZE after much discussion. 1031 */ 1032 1033 if (vap->va_type == VREG) { 1034 sb->st_blksize = vap->va_blocksize; 1035 } else if (vn_isdisk(vp, NULL)) { 1036 /* 1037 * XXX this is broken. If the device is not yet open (aka 1038 * stat() call, aka v_rdev == NULL), how are we supposed 1039 * to get a valid block size out of it? 1040 */ 1041 dev = vp->v_rdev; 1042 if (dev == NULL && vp->v_type == VCHR) { 1043 get_mplock(); 1044 dev = get_dev(vp->v_umajor, vp->v_uminor); 1045 rel_mplock(); 1046 } 1047 sb->st_blksize = dev->si_bsize_best; 1048 if (sb->st_blksize < dev->si_bsize_phys) 1049 sb->st_blksize = dev->si_bsize_phys; 1050 if (sb->st_blksize < BLKDEV_IOSIZE) 1051 sb->st_blksize = BLKDEV_IOSIZE; 1052 } else { 1053 sb->st_blksize = PAGE_SIZE; 1054 } 1055 1056 sb->st_flags = vap->va_flags; 1057 1058 error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0); 1059 if (error) 1060 sb->st_gen = 0; 1061 else 1062 sb->st_gen = (u_int32_t)vap->va_gen; 1063 1064 sb->st_blocks = vap->va_bytes / S_BLKSIZE; 1065 sb->st_fsmid = vap->va_fsmid; 1066 return (0); 1067 } 1068 1069 /* 1070 * MPALMOSTSAFE - acquires mplock 1071 */ 1072 static int 1073 vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred) 1074 { 1075 struct vnode *vp = ((struct vnode *)fp->f_data); 1076 struct vnode *ovp; 1077 struct vattr vattr; 1078 int error; 1079 off_t size; 1080 1081 get_mplock(); 1082 1083 switch (vp->v_type) { 1084 case VREG: 1085 case VDIR: 1086 if (com == FIONREAD) { 1087 error = VOP_GETATTR(vp, &vattr); 1088 if (error) 1089 break; 1090 size = vattr.va_size; 1091 if ((vp->v_flag & VNOTSEEKABLE) == 0) 1092 size -= vn_poll_fpf_offset(fp); 1093 if (size > 0x7FFFFFFF) 1094 size = 0x7FFFFFFF; 1095 *(int *)data = size; 1096 error = 0; 1097 break; 1098 } 1099 if (com == FIOASYNC) { /* XXX */ 1100 error = 0; /* XXX */ 1101 break; 1102 } 1103 /* fall into ... */ 1104 default: 1105 #if 0 1106 return (ENOTTY); 1107 #endif 1108 case VFIFO: 1109 case VCHR: 1110 case VBLK: 1111 if (com == FIODTYPE) { 1112 if (vp->v_type != VCHR && vp->v_type != VBLK) { 1113 error = ENOTTY; 1114 break; 1115 } 1116 *(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK; 1117 error = 0; 1118 break; 1119 } 1120 error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred); 1121 if (error == 0 && com == TIOCSCTTY) { 1122 struct proc *p = curthread->td_proc; 1123 struct session *sess; 1124 1125 if (p == NULL) { 1126 error = ENOTTY; 1127 break; 1128 } 1129 1130 sess = p->p_session; 1131 /* Do nothing if reassigning same control tty */ 1132 if (sess->s_ttyvp == vp) { 1133 error = 0; 1134 break; 1135 } 1136 1137 /* Get rid of reference to old control tty */ 1138 ovp = sess->s_ttyvp; 1139 vref(vp); 1140 sess->s_ttyvp = vp; 1141 if (ovp) 1142 vrele(ovp); 1143 } 1144 break; 1145 } 1146 rel_mplock(); 1147 return (error); 1148 } 1149 1150 /* 1151 * MPALMOSTSAFE - acquires mplock 1152 */ 1153 static int 1154 vn_poll(struct file *fp, int events, struct ucred *cred) 1155 { 1156 int error; 1157 1158 get_mplock(); 1159 error = VOP_POLL(((struct vnode *)fp->f_data), events, cred); 1160 rel_mplock(); 1161 return (error); 1162 } 1163 1164 /* 1165 * Check that the vnode is still valid, and if so 1166 * acquire requested lock. 1167 */ 1168 int 1169 #ifndef DEBUG_LOCKS 1170 vn_lock(struct vnode *vp, int flags) 1171 #else 1172 debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line) 1173 #endif 1174 { 1175 int error; 1176 1177 do { 1178 #ifdef DEBUG_LOCKS 1179 vp->filename = filename; 1180 vp->line = line; 1181 error = debuglockmgr(&vp->v_lock, flags, 1182 "vn_lock", filename, line); 1183 #else 1184 error = lockmgr(&vp->v_lock, flags); 1185 #endif 1186 if (error == 0) 1187 break; 1188 } while (flags & LK_RETRY); 1189 1190 /* 1191 * Because we (had better!) have a ref on the vnode, once it 1192 * goes to VRECLAIMED state it will not be recycled until all 1193 * refs go away. So we can just check the flag. 1194 */ 1195 if (error == 0 && (vp->v_flag & VRECLAIMED)) { 1196 lockmgr(&vp->v_lock, LK_RELEASE); 1197 error = ENOENT; 1198 } 1199 return (error); 1200 } 1201 1202 void 1203 vn_unlock(struct vnode *vp) 1204 { 1205 lockmgr(&vp->v_lock, LK_RELEASE); 1206 } 1207 1208 int 1209 vn_islocked(struct vnode *vp) 1210 { 1211 return (lockstatus(&vp->v_lock, curthread)); 1212 } 1213 1214 /* 1215 * MPALMOSTSAFE - acquires mplock 1216 */ 1217 static int 1218 vn_closefile(struct file *fp) 1219 { 1220 int error; 1221 1222 get_mplock(); 1223 fp->f_ops = &badfileops; 1224 error = vn_close(((struct vnode *)fp->f_data), fp->f_flag); 1225 rel_mplock(); 1226 return (error); 1227 } 1228 1229 /* 1230 * MPALMOSTSAFE - acquires mplock 1231 */ 1232 static int 1233 vn_kqfilter(struct file *fp, struct knote *kn) 1234 { 1235 int error; 1236 1237 get_mplock(); 1238 error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn); 1239 rel_mplock(); 1240 return (error); 1241 } 1242