1 /*- 2 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. 3 * All rights reserved. 4 * 5 * This software was developed for the FreeBSD Project by Marshall 6 * Kirk McKusick and Network Associates Laboratories, the Security 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 9 * research program 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * Copyright (c) 1982, 1986, 1989, 1993 33 * The Regents of the University of California. All rights reserved. 
34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 4. Neither the name of the University nor the names of its contributors 44 * may be used to endorse or promote products derived from this software 45 * without specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 58 * 59 * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 60 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... 
61 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 62 */ 63 64 #include <sys/cdefs.h> 65 __FBSDID("$FreeBSD$"); 66 67 #include <sys/param.h> 68 #include <sys/bio.h> 69 #include <sys/systm.h> 70 #include <sys/buf.h> 71 #include <sys/conf.h> 72 #include <sys/extattr.h> 73 #include <sys/kernel.h> 74 #include <sys/limits.h> 75 #include <sys/malloc.h> 76 #include <sys/mount.h> 77 #include <sys/priv.h> 78 #include <sys/proc.h> 79 #include <sys/resourcevar.h> 80 #include <sys/signalvar.h> 81 #include <sys/stat.h> 82 #include <sys/vmmeter.h> 83 #include <sys/vnode.h> 84 85 #include <vm/vm.h> 86 #include <vm/vm_extern.h> 87 #include <vm/vm_object.h> 88 #include <vm/vm_page.h> 89 #include <vm/vm_pager.h> 90 #include <vm/vnode_pager.h> 91 92 #include <ufs/ufs/extattr.h> 93 #include <ufs/ufs/quota.h> 94 #include <ufs/ufs/inode.h> 95 #include <ufs/ufs/ufs_extern.h> 96 #include <ufs/ufs/ufsmount.h> 97 98 #include <ufs/ffs/fs.h> 99 #include <ufs/ffs/ffs_extern.h> 100 #include "opt_directio.h" 101 #include "opt_ffs.h" 102 103 #ifdef DIRECTIO 104 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 105 #endif 106 static vop_fsync_t ffs_fsync; 107 static _vop_lock_t ffs_lock; 108 static vop_getpages_t ffs_getpages; 109 static vop_read_t ffs_read; 110 static vop_write_t ffs_write; 111 static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); 112 static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, 113 struct ucred *cred); 114 static vop_strategy_t ffsext_strategy; 115 static vop_closeextattr_t ffs_closeextattr; 116 static vop_deleteextattr_t ffs_deleteextattr; 117 static vop_getextattr_t ffs_getextattr; 118 static vop_listextattr_t ffs_listextattr; 119 static vop_openextattr_t ffs_openextattr; 120 static vop_setextattr_t ffs_setextattr; 121 static vop_vptofh_t ffs_vptofh; 122 123 124 /* Global vfs data structures for ufs. 
*/
/*
 * Vnode operations vector for UFS1 file vnodes.  Wraps the generic UFS
 * operations with FFS-specific fsync, locking, paging I/O, and block
 * reallocation.  No extended-attribute ops here -- presumably those are
 * UFS2-only (see ffs_vnodeops2 below); confirm against the mount code
 * that selects the vector.
 */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	._vop_lock =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

/* Vnode operations vector for UFS1 fifos. */
struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Vnode operations vector for UFS2 file vnodes: as above, plus the
 * extended-attribute operations.
 */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	._vop_lock =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

/* Vnode operations vector for UFS2 fifos, including extattr ops. */
struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	._vop_lock =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Synch an open file.
178 */ 179 /* ARGSUSED */ 180 static int 181 ffs_fsync(struct vop_fsync_args *ap) 182 { 183 int error; 184 185 error = ffs_syncvnode(ap->a_vp, ap->a_waitfor); 186 if (error) 187 return (error); 188 if (ap->a_waitfor == MNT_WAIT && 189 (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP)) 190 error = softdep_fsync(ap->a_vp); 191 return (error); 192 } 193 194 int 195 ffs_syncvnode(struct vnode *vp, int waitfor) 196 { 197 struct inode *ip = VTOI(vp); 198 struct buf *bp; 199 struct buf *nbp; 200 int s, error, wait, passes, skipmeta; 201 ufs_lbn_t lbn; 202 203 wait = (waitfor == MNT_WAIT); 204 lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); 205 206 /* 207 * Flush all dirty buffers associated with a vnode. 208 */ 209 passes = NIADDR + 1; 210 skipmeta = 0; 211 if (wait) 212 skipmeta = 1; 213 s = splbio(); 214 VI_LOCK(vp); 215 loop: 216 TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) 217 bp->b_vflags &= ~BV_SCANNED; 218 TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { 219 /* 220 * Reasons to skip this buffer: it has already been considered 221 * on this pass, this pass is the first time through on a 222 * synchronous flush request and the buffer being considered 223 * is metadata, the buffer has dependencies that will cause 224 * it to be redirtied and it has not already been deferred, 225 * or it is already being written. 
226 */ 227 if ((bp->b_vflags & BV_SCANNED) != 0) 228 continue; 229 bp->b_vflags |= BV_SCANNED; 230 if ((skipmeta == 1 && bp->b_lblkno < 0)) 231 continue; 232 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 233 continue; 234 VI_UNLOCK(vp); 235 if (!wait && LIST_FIRST(&bp->b_dep) != NULL && 236 (bp->b_flags & B_DEFERRED) == 0 && 237 buf_countdeps(bp, 0)) { 238 bp->b_flags |= B_DEFERRED; 239 BUF_UNLOCK(bp); 240 VI_LOCK(vp); 241 continue; 242 } 243 if ((bp->b_flags & B_DELWRI) == 0) 244 panic("ffs_fsync: not dirty"); 245 /* 246 * If this is a synchronous flush request, or it is not a 247 * file or device, start the write on this buffer immediatly. 248 */ 249 if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) { 250 251 /* 252 * On our final pass through, do all I/O synchronously 253 * so that we can find out if our flush is failing 254 * because of write errors. 255 */ 256 if (passes > 0 || !wait) { 257 if ((bp->b_flags & B_CLUSTEROK) && !wait) { 258 (void) vfs_bio_awrite(bp); 259 } else { 260 bremfree(bp); 261 splx(s); 262 (void) bawrite(bp); 263 s = splbio(); 264 } 265 } else { 266 bremfree(bp); 267 splx(s); 268 if ((error = bwrite(bp)) != 0) 269 return (error); 270 s = splbio(); 271 } 272 } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) { 273 /* 274 * If the buffer is for data that has been truncated 275 * off the file, then throw it away. 276 */ 277 bremfree(bp); 278 bp->b_flags |= B_INVAL | B_NOCACHE; 279 splx(s); 280 brelse(bp); 281 s = splbio(); 282 } else 283 vfs_bio_awrite(bp); 284 285 /* 286 * Since we may have slept during the I/O, we need 287 * to start from a known point. 288 */ 289 VI_LOCK(vp); 290 nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd); 291 } 292 /* 293 * If we were asked to do this synchronously, then go back for 294 * another pass, this time doing the metadata. 
295 */ 296 if (skipmeta) { 297 skipmeta = 0; 298 goto loop; 299 } 300 301 if (wait) { 302 bufobj_wwait(&vp->v_bufobj, 3, 0); 303 VI_UNLOCK(vp); 304 305 /* 306 * Ensure that any filesystem metatdata associated 307 * with the vnode has been written. 308 */ 309 splx(s); 310 if ((error = softdep_sync_metadata(vp)) != 0) 311 return (error); 312 s = splbio(); 313 314 VI_LOCK(vp); 315 if (vp->v_bufobj.bo_dirty.bv_cnt > 0) { 316 /* 317 * Block devices associated with filesystems may 318 * have new I/O requests posted for them even if 319 * the vnode is locked, so no amount of trying will 320 * get them clean. Thus we give block devices a 321 * good effort, then just give up. For all other file 322 * types, go around and try again until it is clean. 323 */ 324 if (passes > 0) { 325 passes -= 1; 326 goto loop; 327 } 328 #ifdef DIAGNOSTIC 329 if (!vn_isdisk(vp, NULL)) 330 vprint("ffs_fsync: dirty", vp); 331 #endif 332 } 333 } 334 VI_UNLOCK(vp); 335 splx(s); 336 return (ffs_update(vp, wait)); 337 } 338 339 static int 340 ffs_lock(ap) 341 struct _vop_lock_args /* { 342 struct vnode *a_vp; 343 int a_flags; 344 struct thread *a_td; 345 char *file; 346 int line; 347 } */ *ap; 348 { 349 #ifndef NO_FFS_SNAPSHOT 350 struct vnode *vp; 351 int flags; 352 struct lock *lkp; 353 int result; 354 355 switch (ap->a_flags & LK_TYPE_MASK) { 356 case LK_SHARED: 357 case LK_UPGRADE: 358 case LK_EXCLUSIVE: 359 vp = ap->a_vp; 360 flags = ap->a_flags; 361 for (;;) { 362 /* 363 * vnode interlock must be held to ensure that 364 * the possibly external lock isn't freed, 365 * e.g. when mutating from snapshot file vnode 366 * to regular file vnode. 
367 */ 368 if ((flags & LK_INTERLOCK) == 0) { 369 VI_LOCK(vp); 370 flags |= LK_INTERLOCK; 371 } 372 lkp = vp->v_vnlock; 373 result = _lockmgr(lkp, flags, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line); 374 if (lkp == vp->v_vnlock || result != 0) 375 break; 376 /* 377 * Apparent success, except that the vnode 378 * mutated between snapshot file vnode and 379 * regular file vnode while this process 380 * slept. The lock currently held is not the 381 * right lock. Release it, and try to get the 382 * new lock. 383 */ 384 (void) _lockmgr(lkp, LK_RELEASE, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line); 385 if ((flags & LK_TYPE_MASK) == LK_UPGRADE) 386 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; 387 flags &= ~LK_INTERLOCK; 388 } 389 break; 390 default: 391 result = _VOP_LOCK_APV(&ufs_vnodeops, ap); 392 } 393 return (result); 394 #else 395 return (_VOP_LOCK_APV(&ufs_vnodeops, ap)); 396 #endif 397 } 398 399 /* 400 * Vnode op for reading. 401 */ 402 /* ARGSUSED */ 403 static int 404 ffs_read(ap) 405 struct vop_read_args /* { 406 struct vnode *a_vp; 407 struct uio *a_uio; 408 int a_ioflag; 409 struct ucred *a_cred; 410 } */ *ap; 411 { 412 struct vnode *vp; 413 struct inode *ip; 414 struct uio *uio; 415 struct fs *fs; 416 struct buf *bp; 417 ufs_lbn_t lbn, nextlbn; 418 off_t bytesinfile; 419 long size, xfersize, blkoffset; 420 int error, orig_resid; 421 int seqcount; 422 int ioflag; 423 424 vp = ap->a_vp; 425 uio = ap->a_uio; 426 ioflag = ap->a_ioflag; 427 if (ap->a_ioflag & IO_EXT) 428 #ifdef notyet 429 return (ffs_extread(vp, uio, ioflag)); 430 #else 431 panic("ffs_read+IO_EXT"); 432 #endif 433 #ifdef DIRECTIO 434 if ((ioflag & IO_DIRECT) != 0) { 435 int workdone; 436 437 error = ffs_rawread(vp, uio, &workdone); 438 if (error != 0 || workdone != 0) 439 return error; 440 } 441 #endif 442 443 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 444 ip = VTOI(vp); 445 446 #ifdef DIAGNOSTIC 447 if (uio->uio_rw != UIO_READ) 448 panic("ffs_read: mode"); 449 450 if (vp->v_type == VLNK) { 451 
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		/* Copy this block's worth of data out to the caller. */
		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.
 * We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it.
	 * so it must have come from a 'break' statement
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	/* Mark the inode for an access-time update, unless noatime. */
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		/* Append-only files may only be written at EOF. */
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td != NULL) {
		PROC_LOCK(td->td_proc);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
			/* Exceeds RLIMIT_FSIZE: raise SIGXFSZ and fail. */
			psignal(td->td_proc, SIGXFSZ);
			PROC_UNLOCK(td->td_proc);
			return (EFBIG);
		}
		PROC_UNLOCK(td->td_proc);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	/* Encode the sequential-access hint for block allocation. */
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		/* Tell the VM the object is growing before allocating. */
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		/* Extend the file if this write goes past the old EOF. */
		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
790 */ 791 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && 792 ap->a_cred) { 793 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 794 SUSER_ALLOWJAIL)) { 795 ip->i_mode &= ~(ISUID | ISGID); 796 DIP_SET(ip, i_mode, ip->i_mode); 797 } 798 } 799 if (error) { 800 if (ioflag & IO_UNIT) { 801 (void)ffs_truncate(vp, osize, 802 IO_NORMAL | (ioflag & IO_SYNC), 803 ap->a_cred, uio->uio_td); 804 uio->uio_offset -= resid - uio->uio_resid; 805 uio->uio_resid = resid; 806 } 807 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 808 error = ffs_update(vp, 1); 809 return (error); 810 } 811 812 /* 813 * get page routine 814 */ 815 static int 816 ffs_getpages(ap) 817 struct vop_getpages_args *ap; 818 { 819 int i; 820 vm_page_t mreq; 821 int pcount; 822 823 pcount = round_page(ap->a_count) / PAGE_SIZE; 824 mreq = ap->a_m[ap->a_reqpage]; 825 826 /* 827 * if ANY DEV_BSIZE blocks are valid on a large filesystem block, 828 * then the entire page is valid. Since the page may be mapped, 829 * user programs might reference data beyond the actual end of file 830 * occuring within the page. We have to zero that data. 831 */ 832 VM_OBJECT_LOCK(mreq->object); 833 if (mreq->valid) { 834 if (mreq->valid != VM_PAGE_BITS_ALL) 835 vm_page_zero_invalid(mreq, TRUE); 836 vm_page_lock_queues(); 837 for (i = 0; i < pcount; i++) { 838 if (i != ap->a_reqpage) { 839 vm_page_free(ap->a_m[i]); 840 } 841 } 842 vm_page_unlock_queues(); 843 VM_OBJECT_UNLOCK(mreq->object); 844 return VM_PAGER_OK; 845 } 846 VM_OBJECT_UNLOCK(mreq->object); 847 848 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, 849 ap->a_count, 850 ap->a_reqpage); 851 } 852 853 854 /* 855 * Extended attribute area reading. 
856 */ 857 static int 858 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) 859 { 860 struct inode *ip; 861 struct ufs2_dinode *dp; 862 struct fs *fs; 863 struct buf *bp; 864 ufs_lbn_t lbn, nextlbn; 865 off_t bytesinfile; 866 long size, xfersize, blkoffset; 867 int error, orig_resid; 868 869 ip = VTOI(vp); 870 fs = ip->i_fs; 871 dp = ip->i_din2; 872 873 #ifdef DIAGNOSTIC 874 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) 875 panic("ffs_extread: mode"); 876 877 #endif 878 orig_resid = uio->uio_resid; 879 KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); 880 if (orig_resid == 0) 881 return (0); 882 KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); 883 884 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 885 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) 886 break; 887 lbn = lblkno(fs, uio->uio_offset); 888 nextlbn = lbn + 1; 889 890 /* 891 * size of buffer. The buffer representing the 892 * end of the file is rounded up to the size of 893 * the block type ( fragment or full block, 894 * depending ). 895 */ 896 size = sblksize(fs, dp->di_extsize, lbn); 897 blkoffset = blkoff(fs, uio->uio_offset); 898 899 /* 900 * The amount we want to transfer in this iteration is 901 * one FS block less the amount of the data before 902 * our startpoint (duh!) 903 */ 904 xfersize = fs->fs_bsize - blkoffset; 905 906 /* 907 * But if we actually want less than the block, 908 * or the file doesn't have a whole block more of data, 909 * then use the lesser number. 910 */ 911 if (uio->uio_resid < xfersize) 912 xfersize = uio->uio_resid; 913 if (bytesinfile < xfersize) 914 xfersize = bytesinfile; 915 916 if (lblktosize(fs, nextlbn) >= dp->di_extsize) { 917 /* 918 * Don't do readahead if this is the end of the info. 919 */ 920 error = bread(vp, -1 - lbn, size, NOCRED, &bp); 921 } else { 922 /* 923 * If we have a second block, then 924 * fire off a request for a readahead 925 * as well as a read. 
Note that the 4th and 5th 926 * arguments point to arrays of the size specified in 927 * the 6th argument. 928 */ 929 int nextsize = sblksize(fs, dp->di_extsize, nextlbn); 930 931 nextlbn = -1 - nextlbn; 932 error = breadn(vp, -1 - lbn, 933 size, &nextlbn, &nextsize, 1, NOCRED, &bp); 934 } 935 if (error) { 936 brelse(bp); 937 bp = NULL; 938 break; 939 } 940 941 /* 942 * If IO_DIRECT then set B_DIRECT for the buffer. This 943 * will cause us to attempt to release the buffer later on 944 * and will cause the buffer cache to attempt to free the 945 * underlying pages. 946 */ 947 if (ioflag & IO_DIRECT) 948 bp->b_flags |= B_DIRECT; 949 950 /* 951 * We should only get non-zero b_resid when an I/O error 952 * has occurred, which should cause us to break above. 953 * However, if the short read did not cause an error, 954 * then we want to ensure that we do not uiomove bad 955 * or uninitialized data. 956 */ 957 size -= bp->b_resid; 958 if (size < xfersize) { 959 if (size == 0) 960 break; 961 xfersize = size; 962 } 963 964 error = uiomove((char *)bp->b_data + blkoffset, 965 (int)xfersize, uio); 966 if (error) 967 break; 968 969 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 970 (LIST_FIRST(&bp->b_dep) == NULL)) { 971 /* 972 * If there are no dependencies, and it's VMIO, 973 * then we don't need the buf, mark it available 974 * for freeing. The VM has the data. 975 */ 976 bp->b_flags |= B_RELBUF; 977 brelse(bp); 978 } else { 979 /* 980 * Otherwise let whoever 981 * made the request take care of 982 * freeing it. We just queue 983 * it onto another list. 984 */ 985 bqrelse(bp); 986 } 987 } 988 989 /* 990 * This can only happen in the case of an error 991 * because the loop above resets bp to NULL on each iteration 992 * and on normal completion has not set a new value into it. 
 * so it must have come from a 'break' statement
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	/* Mark the inode for an access-time update, unless noatime. */
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;	/* ext area exists only in UFS2 dinodes */

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	/* The ext area is limited to the NXADDR direct blocks. */
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/* Grow the recorded ext-area size if we wrote past it. */
		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
1119 */ 1120 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { 1121 if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 1122 SUSER_ALLOWJAIL)) { 1123 ip->i_mode &= ~(ISUID | ISGID); 1124 dp->di_mode = ip->i_mode; 1125 } 1126 } 1127 if (error) { 1128 if (ioflag & IO_UNIT) { 1129 (void)ffs_truncate(vp, osize, 1130 IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td); 1131 uio->uio_offset -= resid - uio->uio_resid; 1132 uio->uio_resid = resid; 1133 } 1134 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 1135 error = ffs_update(vp, 1); 1136 return (error); 1137 } 1138 1139 1140 /* 1141 * Vnode operating to retrieve a named extended attribute. 1142 * 1143 * Locate a particular EA (nspace:name) in the area (ptr:length), and return 1144 * the length of the EA, and possibly the pointer to the entry and to the data. 1145 */ 1146 static int 1147 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac) 1148 { 1149 u_char *p, *pe, *pn, *p0; 1150 int eapad1, eapad2, ealength, ealen, nlen; 1151 uint32_t ul; 1152 1153 pe = ptr + length; 1154 nlen = strlen(name); 1155 1156 for (p = ptr; p < pe; p = pn) { 1157 p0 = p; 1158 bcopy(p, &ul, sizeof(ul)); 1159 pn = p + ul; 1160 /* make sure this entry is complete */ 1161 if (pn > pe) 1162 break; 1163 p += sizeof(uint32_t); 1164 if (*p != nspace) 1165 continue; 1166 p++; 1167 eapad2 = *p++; 1168 if (*p != nlen) 1169 continue; 1170 p++; 1171 if (bcmp(p, name, nlen)) 1172 continue; 1173 ealength = sizeof(uint32_t) + 3 + nlen; 1174 eapad1 = 8 - (ealength % 8); 1175 if (eapad1 == 8) 1176 eapad1 = 0; 1177 ealength += eapad1; 1178 ealen = ul - ealength - eapad2; 1179 p += nlen + eapad1; 1180 if (eap != NULL) 1181 *eap = p0; 1182 if (eac != NULL) 1183 *eac = p; 1184 return (ealen); 1185 } 1186 return(-1); 1187 } 1188 1189 static int 1190 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) 1191 { 1192 struct inode *ip; 1193 struct ufs2_dinode *dp; 1194 struct 
uio luio; 1195 struct iovec liovec; 1196 int easize, error; 1197 u_char *eae; 1198 1199 ip = VTOI(vp); 1200 dp = ip->i_din2; 1201 easize = dp->di_extsize; 1202 1203 eae = malloc(easize + extra, M_TEMP, M_WAITOK); 1204 1205 liovec.iov_base = eae; 1206 liovec.iov_len = easize; 1207 luio.uio_iov = &liovec; 1208 luio.uio_iovcnt = 1; 1209 luio.uio_offset = 0; 1210 luio.uio_resid = easize; 1211 luio.uio_segflg = UIO_SYSSPACE; 1212 luio.uio_rw = UIO_READ; 1213 luio.uio_td = td; 1214 1215 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); 1216 if (error) { 1217 free(eae, M_TEMP); 1218 return(error); 1219 } 1220 *p = eae; 1221 return (0); 1222 } 1223 1224 static int 1225 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) 1226 { 1227 struct inode *ip; 1228 struct ufs2_dinode *dp; 1229 int error; 1230 1231 ip = VTOI(vp); 1232 1233 if (ip->i_ea_area != NULL) 1234 return (EBUSY); 1235 dp = ip->i_din2; 1236 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); 1237 if (error) 1238 return (error); 1239 ip->i_ea_len = dp->di_extsize; 1240 ip->i_ea_error = 0; 1241 return (0); 1242 } 1243 1244 /* 1245 * Vnode extattr transaction commit/abort 1246 */ 1247 static int 1248 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) 1249 { 1250 struct inode *ip; 1251 struct uio luio; 1252 struct iovec liovec; 1253 int error; 1254 struct ufs2_dinode *dp; 1255 1256 ip = VTOI(vp); 1257 if (ip->i_ea_area == NULL) 1258 return (EINVAL); 1259 dp = ip->i_din2; 1260 error = ip->i_ea_error; 1261 if (commit && error == 0) { 1262 if (cred == NOCRED) 1263 cred = vp->v_mount->mnt_cred; 1264 liovec.iov_base = ip->i_ea_area; 1265 liovec.iov_len = ip->i_ea_len; 1266 luio.uio_iov = &liovec; 1267 luio.uio_iovcnt = 1; 1268 luio.uio_offset = 0; 1269 luio.uio_resid = ip->i_ea_len; 1270 luio.uio_segflg = UIO_SYSSPACE; 1271 luio.uio_rw = UIO_WRITE; 1272 luio.uio_td = td; 1273 /* XXX: I'm not happy about truncating to zero size */ 1274 if (ip->i_ea_len < dp->di_extsize) 1275 
error = ffs_truncate(vp, 0, IO_EXT, cred, td); 1276 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); 1277 } 1278 free(ip->i_ea_area, M_TEMP); 1279 ip->i_ea_area = NULL; 1280 ip->i_ea_len = 0; 1281 ip->i_ea_error = 0; 1282 return (error); 1283 } 1284 1285 /* 1286 * Vnode extattr strategy routine for fifos. 1287 * 1288 * We need to check for a read or write of the external attributes. 1289 * Otherwise we just fall through and do the usual thing. 1290 */ 1291 static int 1292 ffsext_strategy(struct vop_strategy_args *ap) 1293 /* 1294 struct vop_strategy_args { 1295 struct vnodeop_desc *a_desc; 1296 struct vnode *a_vp; 1297 struct buf *a_bp; 1298 }; 1299 */ 1300 { 1301 struct vnode *vp; 1302 daddr_t lbn; 1303 1304 vp = ap->a_vp; 1305 lbn = ap->a_bp->b_lblkno; 1306 if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC && 1307 lbn < 0 && lbn >= -NXADDR) 1308 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); 1309 if (vp->v_type == VFIFO) 1310 return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); 1311 panic("spec nodes went here"); 1312 } 1313 1314 /* 1315 * Vnode extattr transaction commit/abort 1316 */ 1317 static int 1318 ffs_openextattr(struct vop_openextattr_args *ap) 1319 /* 1320 struct vop_openextattr_args { 1321 struct vnodeop_desc *a_desc; 1322 struct vnode *a_vp; 1323 IN struct ucred *a_cred; 1324 IN struct thread *a_td; 1325 }; 1326 */ 1327 { 1328 struct inode *ip; 1329 struct fs *fs; 1330 1331 ip = VTOI(ap->a_vp); 1332 fs = ip->i_fs; 1333 1334 if (ap->a_vp->v_type == VCHR) 1335 return (EOPNOTSUPP); 1336 1337 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); 1338 } 1339 1340 1341 /* 1342 * Vnode extattr transaction commit/abort 1343 */ 1344 static int 1345 ffs_closeextattr(struct vop_closeextattr_args *ap) 1346 /* 1347 struct vop_closeextattr_args { 1348 struct vnodeop_desc *a_desc; 1349 struct vnode *a_vp; 1350 int a_commit; 1351 IN struct ucred *a_cred; 1352 IN struct thread *a_td; 1353 }; 1354 */ 1355 { 1356 struct inode *ip; 1357 struct fs *fs; 1358 1359 ip = 
VTOI(ap->a_vp); 1360 fs = ip->i_fs; 1361 1362 if (ap->a_vp->v_type == VCHR) 1363 return (EOPNOTSUPP); 1364 1365 if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) 1366 return (EROFS); 1367 1368 return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); 1369 } 1370 1371 /* 1372 * Vnode operation to remove a named attribute. 1373 */ 1374 static int 1375 ffs_deleteextattr(struct vop_deleteextattr_args *ap) 1376 /* 1377 vop_deleteextattr { 1378 IN struct vnode *a_vp; 1379 IN int a_attrnamespace; 1380 IN const char *a_name; 1381 IN struct ucred *a_cred; 1382 IN struct thread *a_td; 1383 }; 1384 */ 1385 { 1386 struct inode *ip; 1387 struct fs *fs; 1388 uint32_t ealength, ul; 1389 int ealen, olen, eapad1, eapad2, error, i, easize; 1390 u_char *eae, *p; 1391 int stand_alone; 1392 1393 ip = VTOI(ap->a_vp); 1394 fs = ip->i_fs; 1395 1396 if (ap->a_vp->v_type == VCHR) 1397 return (EOPNOTSUPP); 1398 1399 if (strlen(ap->a_name) == 0) 1400 return (EINVAL); 1401 1402 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1403 return (EROFS); 1404 1405 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1406 ap->a_cred, ap->a_td, IWRITE); 1407 if (error) { 1408 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1409 ip->i_ea_error = error; 1410 return (error); 1411 } 1412 1413 if (ip->i_ea_area == NULL) { 1414 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1415 if (error) 1416 return (error); 1417 stand_alone = 1; 1418 } else { 1419 stand_alone = 0; 1420 } 1421 1422 ealength = eapad1 = ealen = eapad2 = 0; 1423 1424 eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); 1425 bcopy(ip->i_ea_area, eae, ip->i_ea_len); 1426 easize = ip->i_ea_len; 1427 1428 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1429 &p, NULL); 1430 if (olen == -1) { 1431 /* delete but nonexistent */ 1432 free(eae, M_TEMP); 1433 if (stand_alone) 1434 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1435 return(ENOATTR); 1436 } 1437 bcopy(p, &ul, sizeof ul); 1438 i = p - eae + 
ul; 1439 if (ul != ealength) { 1440 bcopy(p + ul, p + ealength, easize - i); 1441 easize += (ealength - ul); 1442 } 1443 if (easize > NXADDR * fs->fs_bsize) { 1444 free(eae, M_TEMP); 1445 if (stand_alone) 1446 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1447 else if (ip->i_ea_error == 0) 1448 ip->i_ea_error = ENOSPC; 1449 return(ENOSPC); 1450 } 1451 p = ip->i_ea_area; 1452 ip->i_ea_area = eae; 1453 ip->i_ea_len = easize; 1454 free(p, M_TEMP); 1455 if (stand_alone) 1456 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); 1457 return(error); 1458 } 1459 1460 /* 1461 * Vnode operation to retrieve a named extended attribute. 1462 */ 1463 static int 1464 ffs_getextattr(struct vop_getextattr_args *ap) 1465 /* 1466 vop_getextattr { 1467 IN struct vnode *a_vp; 1468 IN int a_attrnamespace; 1469 IN const char *a_name; 1470 INOUT struct uio *a_uio; 1471 OUT size_t *a_size; 1472 IN struct ucred *a_cred; 1473 IN struct thread *a_td; 1474 }; 1475 */ 1476 { 1477 struct inode *ip; 1478 struct fs *fs; 1479 u_char *eae, *p; 1480 unsigned easize; 1481 int error, ealen, stand_alone; 1482 1483 ip = VTOI(ap->a_vp); 1484 fs = ip->i_fs; 1485 1486 if (ap->a_vp->v_type == VCHR) 1487 return (EOPNOTSUPP); 1488 1489 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1490 ap->a_cred, ap->a_td, IREAD); 1491 if (error) 1492 return (error); 1493 1494 if (ip->i_ea_area == NULL) { 1495 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1496 if (error) 1497 return (error); 1498 stand_alone = 1; 1499 } else { 1500 stand_alone = 0; 1501 } 1502 eae = ip->i_ea_area; 1503 easize = ip->i_ea_len; 1504 1505 ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, 1506 NULL, &p); 1507 if (ealen >= 0) { 1508 error = 0; 1509 if (ap->a_size != NULL) 1510 *ap->a_size = ealen; 1511 else if (ap->a_uio != NULL) 1512 error = uiomove(p, ealen, ap->a_uio); 1513 } else 1514 error = ENOATTR; 1515 if (stand_alone) 1516 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1517 return(error); 
1518 } 1519 1520 /* 1521 * Vnode operation to retrieve extended attributes on a vnode. 1522 */ 1523 static int 1524 ffs_listextattr(struct vop_listextattr_args *ap) 1525 /* 1526 vop_listextattr { 1527 IN struct vnode *a_vp; 1528 IN int a_attrnamespace; 1529 INOUT struct uio *a_uio; 1530 OUT size_t *a_size; 1531 IN struct ucred *a_cred; 1532 IN struct thread *a_td; 1533 }; 1534 */ 1535 { 1536 struct inode *ip; 1537 struct fs *fs; 1538 u_char *eae, *p, *pe, *pn; 1539 unsigned easize; 1540 uint32_t ul; 1541 int error, ealen, stand_alone; 1542 1543 ip = VTOI(ap->a_vp); 1544 fs = ip->i_fs; 1545 1546 if (ap->a_vp->v_type == VCHR) 1547 return (EOPNOTSUPP); 1548 1549 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1550 ap->a_cred, ap->a_td, IREAD); 1551 if (error) 1552 return (error); 1553 1554 if (ip->i_ea_area == NULL) { 1555 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1556 if (error) 1557 return (error); 1558 stand_alone = 1; 1559 } else { 1560 stand_alone = 0; 1561 } 1562 eae = ip->i_ea_area; 1563 easize = ip->i_ea_len; 1564 1565 error = 0; 1566 if (ap->a_size != NULL) 1567 *ap->a_size = 0; 1568 pe = eae + easize; 1569 for(p = eae; error == 0 && p < pe; p = pn) { 1570 bcopy(p, &ul, sizeof(ul)); 1571 pn = p + ul; 1572 if (pn > pe) 1573 break; 1574 p += sizeof(ul); 1575 if (*p++ != ap->a_attrnamespace) 1576 continue; 1577 p++; /* pad2 */ 1578 ealen = *p; 1579 if (ap->a_size != NULL) { 1580 *ap->a_size += ealen + 1; 1581 } else if (ap->a_uio != NULL) { 1582 error = uiomove(p, ealen + 1, ap->a_uio); 1583 } 1584 } 1585 if (stand_alone) 1586 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1587 return(error); 1588 } 1589 1590 /* 1591 * Vnode operation to set a named attribute. 
1592 */ 1593 static int 1594 ffs_setextattr(struct vop_setextattr_args *ap) 1595 /* 1596 vop_setextattr { 1597 IN struct vnode *a_vp; 1598 IN int a_attrnamespace; 1599 IN const char *a_name; 1600 INOUT struct uio *a_uio; 1601 IN struct ucred *a_cred; 1602 IN struct thread *a_td; 1603 }; 1604 */ 1605 { 1606 struct inode *ip; 1607 struct fs *fs; 1608 uint32_t ealength, ul; 1609 int ealen, olen, eapad1, eapad2, error, i, easize; 1610 u_char *eae, *p; 1611 int stand_alone; 1612 1613 ip = VTOI(ap->a_vp); 1614 fs = ip->i_fs; 1615 1616 if (ap->a_vp->v_type == VCHR) 1617 return (EOPNOTSUPP); 1618 1619 if (strlen(ap->a_name) == 0) 1620 return (EINVAL); 1621 1622 /* XXX Now unsupported API to delete EAs using NULL uio. */ 1623 if (ap->a_uio == NULL) 1624 return (EOPNOTSUPP); 1625 1626 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1627 return (EROFS); 1628 1629 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 1630 ap->a_cred, ap->a_td, IWRITE); 1631 if (error) { 1632 if (ip->i_ea_area != NULL && ip->i_ea_error == 0) 1633 ip->i_ea_error = error; 1634 return (error); 1635 } 1636 1637 if (ip->i_ea_area == NULL) { 1638 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); 1639 if (error) 1640 return (error); 1641 stand_alone = 1; 1642 } else { 1643 stand_alone = 0; 1644 } 1645 1646 ealen = ap->a_uio->uio_resid; 1647 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); 1648 eapad1 = 8 - (ealength % 8); 1649 if (eapad1 == 8) 1650 eapad1 = 0; 1651 eapad2 = 8 - (ealen % 8); 1652 if (eapad2 == 8) 1653 eapad2 = 0; 1654 ealength += eapad1 + ealen + eapad2; 1655 1656 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); 1657 bcopy(ip->i_ea_area, eae, ip->i_ea_len); 1658 easize = ip->i_ea_len; 1659 1660 olen = ffs_findextattr(eae, easize, 1661 ap->a_attrnamespace, ap->a_name, &p, NULL); 1662 if (olen == -1) { 1663 /* new, append at end */ 1664 p = eae + easize; 1665 easize += ealength; 1666 } else { 1667 bcopy(p, &ul, sizeof ul); 1668 i = p - eae + ul; 1669 if (ul != ealength) 
{ 1670 bcopy(p + ul, p + ealength, easize - i); 1671 easize += (ealength - ul); 1672 } 1673 } 1674 if (easize > NXADDR * fs->fs_bsize) { 1675 free(eae, M_TEMP); 1676 if (stand_alone) 1677 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1678 else if (ip->i_ea_error == 0) 1679 ip->i_ea_error = ENOSPC; 1680 return(ENOSPC); 1681 } 1682 bcopy(&ealength, p, sizeof(ealength)); 1683 p += sizeof(ealength); 1684 *p++ = ap->a_attrnamespace; 1685 *p++ = eapad2; 1686 *p++ = strlen(ap->a_name); 1687 strcpy(p, ap->a_name); 1688 p += strlen(ap->a_name); 1689 bzero(p, eapad1); 1690 p += eapad1; 1691 error = uiomove(p, ealen, ap->a_uio); 1692 if (error) { 1693 free(eae, M_TEMP); 1694 if (stand_alone) 1695 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); 1696 else if (ip->i_ea_error == 0) 1697 ip->i_ea_error = error; 1698 return(error); 1699 } 1700 p += ealen; 1701 bzero(p, eapad2); 1702 1703 p = ip->i_ea_area; 1704 ip->i_ea_area = eae; 1705 ip->i_ea_len = easize; 1706 free(p, M_TEMP); 1707 if (stand_alone) 1708 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); 1709 return(error); 1710 } 1711 1712 /* 1713 * Vnode pointer to File handle 1714 */ 1715 static int 1716 ffs_vptofh(struct vop_vptofh_args *ap) 1717 /* 1718 vop_vptofh { 1719 IN struct vnode *a_vp; 1720 IN struct fid *a_fhp; 1721 }; 1722 */ 1723 { 1724 struct inode *ip; 1725 struct ufid *ufhp; 1726 1727 ip = VTOI(ap->a_vp); 1728 ufhp = (struct ufid *)ap->a_fhp; 1729 ufhp->ufid_len = sizeof(struct ufid); 1730 ufhp->ufid_ino = ip->i_number; 1731 ufhp->ufid_gen = ip->i_gen; 1732 return (0); 1733 } 1734