1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 */ 34 35 #include <sys/mountctl.h> 36 #include <sys/namecache.h> 37 #include <sys/buf2.h> 38 #include <vfs/fifofs/fifo.h> 39 40 #include "hammer.h" 41 42 /* 43 * USERFS VNOPS 44 */ 45 static int hammer_vop_fsync(struct vop_fsync_args *); 46 static int hammer_vop_read(struct vop_read_args *); 47 static int hammer_vop_write(struct vop_write_args *); 48 static int hammer_vop_access(struct vop_access_args *); 49 static int hammer_vop_advlock(struct vop_advlock_args *); 50 static int hammer_vop_close(struct vop_close_args *); 51 static int hammer_vop_ncreate(struct vop_ncreate_args *); 52 static int hammer_vop_getattr(struct vop_getattr_args *); 53 static int hammer_vop_nresolve(struct vop_nresolve_args *); 54 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 55 static int hammer_vop_nlink(struct vop_nlink_args *); 56 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 57 static int hammer_vop_nmknod(struct vop_nmknod_args *); 58 static int hammer_vop_open(struct vop_open_args *); 59 static int hammer_vop_print(struct vop_print_args *); 60 static int hammer_vop_readdir(struct vop_readdir_args *); 61 static int hammer_vop_readlink(struct vop_readlink_args *); 62 static int hammer_vop_nremove(struct vop_nremove_args *); 63 static int hammer_vop_nrename(struct vop_nrename_args *); 64 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 65 static int hammer_vop_markatime(struct vop_markatime_args *); 66 static int hammer_vop_setattr(struct vop_setattr_args *); 67 static int hammer_vop_strategy(struct vop_strategy_args *); 68 static int hammer_vop_bmap(struct vop_bmap_args *ap); 69 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 70 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 71 static int hammer_vop_ioctl(struct vop_ioctl_args *); 72 static int hammer_vop_mountctl(struct vop_mountctl_args *); 73 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 74 75 static int hammer_vop_fifoclose (struct vop_close_args 
*); 76 static int hammer_vop_fiforead (struct vop_read_args *); 77 static int hammer_vop_fifowrite (struct vop_write_args *); 78 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 79 80 struct vop_ops hammer_vnode_vops = { 81 .vop_default = vop_defaultop, 82 .vop_fsync = hammer_vop_fsync, 83 .vop_getpages = vop_stdgetpages, 84 .vop_putpages = vop_stdputpages, 85 .vop_read = hammer_vop_read, 86 .vop_write = hammer_vop_write, 87 .vop_access = hammer_vop_access, 88 .vop_advlock = hammer_vop_advlock, 89 .vop_close = hammer_vop_close, 90 .vop_ncreate = hammer_vop_ncreate, 91 .vop_getattr = hammer_vop_getattr, 92 .vop_inactive = hammer_vop_inactive, 93 .vop_reclaim = hammer_vop_reclaim, 94 .vop_nresolve = hammer_vop_nresolve, 95 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 96 .vop_nlink = hammer_vop_nlink, 97 .vop_nmkdir = hammer_vop_nmkdir, 98 .vop_nmknod = hammer_vop_nmknod, 99 .vop_open = hammer_vop_open, 100 .vop_pathconf = vop_stdpathconf, 101 .vop_print = hammer_vop_print, 102 .vop_readdir = hammer_vop_readdir, 103 .vop_readlink = hammer_vop_readlink, 104 .vop_nremove = hammer_vop_nremove, 105 .vop_nrename = hammer_vop_nrename, 106 .vop_nrmdir = hammer_vop_nrmdir, 107 .vop_markatime = hammer_vop_markatime, 108 .vop_setattr = hammer_vop_setattr, 109 .vop_bmap = hammer_vop_bmap, 110 .vop_strategy = hammer_vop_strategy, 111 .vop_nsymlink = hammer_vop_nsymlink, 112 .vop_nwhiteout = hammer_vop_nwhiteout, 113 .vop_ioctl = hammer_vop_ioctl, 114 .vop_mountctl = hammer_vop_mountctl, 115 .vop_kqfilter = hammer_vop_kqfilter 116 }; 117 118 struct vop_ops hammer_spec_vops = { 119 .vop_default = vop_defaultop, 120 .vop_fsync = hammer_vop_fsync, 121 .vop_read = vop_stdnoread, 122 .vop_write = vop_stdnowrite, 123 .vop_access = hammer_vop_access, 124 .vop_close = hammer_vop_close, 125 .vop_markatime = hammer_vop_markatime, 126 .vop_getattr = hammer_vop_getattr, 127 .vop_inactive = hammer_vop_inactive, 128 .vop_reclaim = hammer_vop_reclaim, 129 .vop_setattr = 
hammer_vop_setattr 130 }; 131 132 struct vop_ops hammer_fifo_vops = { 133 .vop_default = fifo_vnoperate, 134 .vop_fsync = hammer_vop_fsync, 135 .vop_read = hammer_vop_fiforead, 136 .vop_write = hammer_vop_fifowrite, 137 .vop_access = hammer_vop_access, 138 .vop_close = hammer_vop_fifoclose, 139 .vop_markatime = hammer_vop_markatime, 140 .vop_getattr = hammer_vop_getattr, 141 .vop_inactive = hammer_vop_inactive, 142 .vop_reclaim = hammer_vop_reclaim, 143 .vop_setattr = hammer_vop_setattr, 144 .vop_kqfilter = hammer_vop_fifokqfilter 145 }; 146 147 static __inline 148 void 149 hammer_knote(struct vnode *vp, int flags) 150 { 151 if (flags) 152 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 153 } 154 155 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 156 struct vnode *dvp, struct ucred *cred, 157 int flags, int isdir); 158 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 159 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 160 161 /* 162 * hammer_vop_fsync { vp, waitfor } 163 * 164 * fsync() an inode to disk and wait for it to be completely committed 165 * such that the information would not be undone if a crash occured after 166 * return. 167 * 168 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 169 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 170 * operation. 171 * 172 * Ultimately the combination of a REDO log and use of fast storage 173 * to front-end cluster caches will make fsync fast, but it aint 174 * here yet. And, in anycase, we need real transactional 175 * all-or-nothing features which are not restricted to a single file. 
*/
/*
 * Return value: 0 when the call is satisfied early (hammer_fsync_mode 4
 * ignores the syscall, or only the UNDO/REDO FIFO had to be flushed),
 * otherwise ip->error as it stands after the full flush sequence.
 * hmp->fs_token is acquired on entry and released before every return.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);
	hammer_mount_t hmp = ip->hmp;
	int waitfor = ap->a_waitfor;
	int mode;

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Fsync rule relaxation (default is either full synchronous flush
	 * or REDO semantics with synchronous flush).
	 *
	 * Only applied when the request carries VOP_FSYNC_SYSCALL.  Modes
	 * 0 and 1 jump straight to the full flush at 'skip' and never use
	 * 'mode'; modes 2 and 3 fall back to 0 and 1 respectively when the
	 * volume version is older than HAMMER_VOL_VERSION_FOUR.
	 */
	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
		switch(hammer_fsync_mode) {
		case 0:
mode0:
			/* no REDO, full synchronous flush */
			goto skip;
		case 1:
mode1:
			/* no REDO, full asynchronous flush */
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			goto skip;
		case 2:
			/* REDO semantics, synchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode0;
			mode = HAMMER_FLUSH_UNDOS_AUTO;
			break;
		case 3:
			/* REDO semantics, relaxed asynchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode1;
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		case 4:
			/* ignore the fsync() system call */
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		default:
			/* we have to do something */
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		}

		/*
		 * Fast fsync only needs to flush the UNDO/REDO fifo if
		 * HAMMER_INODE_REDO is non-zero and the only modifications
		 * made to the file are write or write-extends.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) &&
		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) {
			++hammer_count_fsyncs;
			hammer_flusher_flush_undos(hmp, mode);
			ip->redo_count = 0;
			if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
				vclrisdirty(ip->vp);
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		}

		/*
		 * REDO is enabled by fsync(), the idea being we really only
		 * want to lay down REDO records when programs are using
		 * fsync() heavily.  The first fsync() on the file starts
		 * the gravy train going and later fsync()s keep it hot by
		 * resetting the redo_count.
		 *
		 * We weren't running REDOs before now so we have to fall
		 * through and do a full fsync of what we have.
		 */
		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
			ip->flags |= HAMMER_INODE_REDO;
			ip->redo_count = 0;
		}
	}
skip:

	/*
	 * Do a full flush sequence.
	 *
	 * Attempt to release the vnode while waiting for the inode to
	 * finish flushing.  This can really mess up inactive->reclaim
	 * sequences so only do it if the vnode is active.
	 *
	 * WARNING!  The VX lock functions must be used.  vn_lock() will
	 *	     fail when this is part of a VOP_RECLAIM sequence.
	 */
	++hammer_count_fsyncs;
	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (waitfor == MNT_WAIT) {
		int dorelock;

		/* Only drop the VX lock if the vnode is not being reclaimed */
		if ((ap->a_vp->v_flag & VRECLAIMED) == 0) {
			vx_unlock(ap->a_vp);
			dorelock = 1;
		} else {
			dorelock = 0;
		}
		hammer_wait_inode(ip);
		if (dorelock)
			vx_lock(ap->a_vp);
	}
	/* Nothing left to flush on the inode: clear the vnode dirty state */
	if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
		vclrisdirty(ip->vp);
	lwkt_reltoken(&hmp->fs_token);
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (for the cache safe does not require fs_token)
 *
 * Only VREG vnodes are accepted; any other type returns EINVAL.
 * Returns EWOULDBLOCK if IO_NRDELAY is set and a needed block is not
 * already valid (B_CACHE without B_INVAL/B_RAM) in the buffer cache.
 * Otherwise returns 0 or the first error reported by the read shortcut,
 * the signal check, the buffer read, or uiomovebp().
 *
 * A HAMMER transaction is started lazily, only once a block actually
 * has to be read in or when hammer_update_atime_quick() returns < 0.
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;
	int bigread;
	int got_trans;
	size_t resid;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	got_trans = 0;
	uio = ap->a_uio;

	/*
	 * Attempt to shortcut directly to the VM object using lwbufs.
	 * This is much faster than instantiating buffer cache buffers.
	 *
	 * Whatever the shortcut transferred is accounted to the read
	 * statistics even if it also returned an error.
	 */
	resid = uio->uio_resid;
	error = vop_helper_read_shortcut(ap);
	hammer_stats_file_read += resid - uio->uio_resid;
	if (error)
		return (error);
	if (uio->uio_resid == 0)
		goto finished;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 * The caller's heuristic arrives in the upper bits of a_ioflag.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
	ioseqcount = (ap->a_ioflag >> 16);
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigread = (uio->uio_resid > 100 * 1024 * 1024);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 *
	 * XXX Temporary hack, delay the start transaction while we remain
	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
	 *     locked-shared.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		/* Split the file offset into block base + offset in block */
		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
			break;

		/*
		 * MPSAFE
		 */
		bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
		if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
			bp->b_flags &= ~B_AGE;
			error = 0;
			goto skip;
		}
		/*
		 * The block is not cached.  With IO_NRDELAY the caller does
		 * not want to wait for I/O.  No transaction can be open at
		 * this point because this check precedes the only place the
		 * loop starts one.
		 */
		if (ap->a_ioflag & IO_NRDELAY) {
			bqrelse(bp);
			return (EWOULDBLOCK);
		}

		/*
		 * MPUNSAFE
		 */
		if (got_trans == 0) {
			hammer_start_transaction(&trans, ip->hmp);
			got_trans = 1;
		}

		/*
		 * NOTE: A valid bp has already been acquired, but was not
		 *	 B_CACHE.
		 */
		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_readx(ap->a_vp,
					     file_limit, base_offset,
					     blksize, uio->uio_resid,
					     seqcount * BKVASIZE, &bp);
		} else {
			error = breadnx(ap->a_vp, base_offset, blksize,
					NULL, NULL, 0, &bp);
		}
		if (error) {
			brelse(bp);
			break;
		}
skip:
		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
			hdkprintf("zone2_offset %016jx read file %016jx@%016jx\n",
				(intmax_t)bp->b_bio2.bio_offset,
				(intmax_t)ip->obj_id,
				(intmax_t)bp->b_loffset);
		}
		bp->b_flags &= ~B_IODEBUG;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		/*
		 * Clamp the copy length to the rest of the block, the
		 * caller's remaining request, and the end of file.
		 */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);

		/*
		 * Set B_AGE, data has a lower priority than meta-data.
		 *
		 * Use a hold/unlock/drop sequence to run the uiomove
		 * with the buffer unlocked, avoiding deadlocks against
		 * read()s on mmap()'d spaces.
		 */
		bp->b_flags |= B_AGE;
		error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
		bqrelse(bp);

		if (error)
			break;
		hammer_stats_file_read += n;
	}

finished:

	/*
	 * Try to update the atime with just the inode lock for maximum
	 * concurrency.  If we can't shortcut it we have to get the full
	 * blown transaction.
	 */
	if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) {
		hammer_start_transaction(&trans, ip->hmp);
		got_trans = 1;
	}

	/*
	 * With a transaction open, record the atime under fs_token unless
	 * the inode is read-only or the mount has MNT_NOATIME; either way
	 * the transaction is completed here.
	 */
	if (got_trans) {
		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
			lwkt_gettoken(&hmp->fs_token);
			ip->ino_data.atime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
			hammer_done_transaction(&trans);
			lwkt_reltoken(&hmp->fs_token);
		} else {
			hammer_done_transaction(&trans);
		}
	}
	return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Only VREG vnodes are accepted (EINVAL otherwise) and the inode must
 * not be flagged HAMMER_INODE_RO (EROFS).  EFBIG is returned for a
 * negative offset, for an offset + length that wraps, and for a write
 * that would exceed the process's RLIMIT_FSIZE (SIGXFSZ is also posted
 * in that last case).  Otherwise returns 0 or the first error hit in
 * the write loop.
 *
 * On exit a kqueue notification is posted with NOTE_WRITE and/or
 * NOTE_EXTEND according to what the loop actually did.
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	thread_t td;
	struct uio *uio;
	int offset;
	off_t base_offset;
	int64_t cluster_eof;
	struct buf *bp;
	int kflags;
	int error;
	int n;
	int flags;
	int seqcount;
	int bigwrite;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 * Every return from here on must finish it.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_offset assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}

	/* Enforce the per-process file size resource limit */
	if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
	    base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		hammer_done_transaction(&trans);
		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 *
	 * Preset redo_count so we stop generating REDOs earlier if the
	 * limit is exceeded.
	 *
	 * redo_count is heuristic, SMP races are ok
	 */
	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
	if ((ip->flags & HAMMER_INODE_REDO) &&
	    ip->redo_count < hammer_limit_redo) {
		ip->redo_count += uio->uio_resid;
	}

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;	/* set once the VM object was extended */
		int blksize;
		int blkmask;
		int trivial;
		int endofblk;		/* this write reaches the end of the block */
		off_t nsize;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;
		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Control the number of pending records associated with
		 * this inode.  If too many have accumulated start a
		 * flush.  Try to maintain a pipeline with the flusher.
		 *
		 * NOTE: It is possible for other sources to grow the
		 *	 records but not necessarily issue another flush,
		 *	 so use a timeout and ensure that a re-flush occurs.
		 */
		if (ip->rsv_recs >= hammer_limit_inode_recs) {
			lwkt_gettoken(&hmp->fs_token);
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
				ip->flags |= HAMMER_INODE_RECSW;
				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			}
			lwkt_reltoken(&hmp->fs_token);
		}

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid) {
			n = uio->uio_resid;
			endofblk = 0;
		} else {
			endofblk = 1;
		}
		/*
		 * Writing past the current EOF: extend the VM object first.
		 * 'trivial' is 0 only when the write starts beyond the old
		 * EOF, i.e. it leaves a gap.
		 */
		nsize = uio->uio_offset + n;
		if (nsize > ip->ino_data.size) {
			if (uio->uio_offset > ip->ino_data.size)
				trivial = 0;
			else
				trivial = 1;
			nvextendbuf(ap->a_vp,
				    ip->ino_data.size,
				    nsize,
				    hammer_blocksize(ip->ino_data.size),
				    hammer_blocksize(nsize),
				    hammer_blockoff(ip->ino_data.size),
				    hammer_blockoff(nsize),
				    trivial);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0)
			error = uiomovebp(bp, bp->b_data + offset, n, uio);

		/*
		 * fs_token is taken even when error is set; the error path
		 * below releases it after undoing the size change.
		 */
		lwkt_gettoken(&hmp->fs_token);

		/*
		 * Generate REDO records if enabled and redo_count will not
		 * exceed the limit.
		 *
		 * If redo_count exceeds the limit we stop generating records
		 * and clear HAMMER_INODE_REDO.  This will cause the next
		 * fsync() to do a full meta-data sync instead of just an
		 * UNDO/REDO fifo update.
		 *
		 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
		 * will still be tracked.  The tracks will be terminated
		 * when the related meta-data (including possible data
		 * modifications which are not tracked via REDO) is
		 * flushed.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
			if (ip->redo_count < hammer_limit_redo) {
				bp->b_flags |= B_VFSFLAG1;
				error = hammer_generate_redo(&trans, ip,
						     base_offset + offset,
						     HAMMER_REDO_WRITE,
						     bp->b_data + offset,
						     (size_t)n);
			} else {
				ip->flags &= ~HAMMER_INODE_REDO;
			}
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				nvtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size),
					  hammer_blockoff(ip->ino_data.size),
					  0);
			}
			lwkt_reltoken(&hmp->fs_token);
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;
		/* uiomovebp() advanced uio_offset; grow the inode size to match */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_SDIRTY;
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(&trans, ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		lwkt_reltoken(&hmp->fs_token);

		/*
		 * Final buffer disposition.
		 *
		 * Because meta-data updates are deferred, HAMMER is
		 * especially sensitive to excessive bdwrite()s because
		 * the I/O stream is not broken up by disk reads.  So the
		 * buffer cache simply cannot keep up.
		 *
		 * WARNING!  blksize is variable.  cluster_write() is
		 *	     expected to not blow up if it encounters
		 *	     buffers that do not match the passed blksize.
		 *
		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
		 *	  The ip->rsv_recs check should burst-flush the data.
		 *	  If we queue it immediately the buf could be left
		 *	  locked on the device queue for a very long time.
		 *
		 *	  However, failing to flush a dirty buffer out when
		 *	  issued from the pageout daemon can result in a low
		 *	  memory deadlock against bio_page_alloc(), so we
		 *	  have to bawrite() on IO_ASYNC as well.
		 *
		 * NOTE!  To avoid degenerate stalls due to mismatched block
		 *	  sizes we only honor IO_DIRECT on the write which
		 *	  abuts the end of the buffer.  However, we must
		 *	  honor IO_SYNC in case someone is silly enough to
		 *	  configure a HAMMER file as swap, or when HAMMER
		 *	  is serving NFS (for commits).  Ick ick.
		 */
		bp->b_flags |= B_AGE;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
			bawrite(bp);
		} else if (ap->a_ioflag & IO_ASYNC) {
			bawrite(bp);
		} else if (hammer_cluster_enable &&
			   !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
			if (base_offset < HAMMER_XDEMARC)
				cluster_eof = hammer_blockdemarc(base_offset,
							 ip->ino_data.size);
			else
				cluster_eof = ip->ino_data.size;
			cluster_write(bp, cluster_eof, blksize, seqcount);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);

	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * MPSAFE - does not require fs_token
 *
 * Converts the inode's stored uid/gid with hammer_to_unix_xid() and
 * hands the decision to vop_helper_access() together with the inode's
 * mode and uflags; that helper's result is returned unchanged.
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
MPSAFE - does not require fs_token 857 */ 858 static 859 int 860 hammer_vop_advlock(struct vop_advlock_args *ap) 861 { 862 hammer_inode_t ip = VTOI(ap->a_vp); 863 864 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 865 } 866 867 /* 868 * hammer_vop_close { vp, fflag } 869 * 870 * We can only sync-on-close for normal closes. XXX disabled for now. 871 */ 872 static 873 int 874 hammer_vop_close(struct vop_close_args *ap) 875 { 876 #if 0 877 struct vnode *vp = ap->a_vp; 878 hammer_inode_t ip = VTOI(vp); 879 int waitfor; 880 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 881 if (vn_islocked(vp) == LK_EXCLUSIVE && 882 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 883 if (ip->flags & HAMMER_INODE_CLOSESYNC) 884 waitfor = MNT_WAIT; 885 else 886 waitfor = MNT_NOWAIT; 887 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 888 HAMMER_INODE_CLOSEASYNC); 889 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 890 } 891 } 892 #endif 893 return (vop_stdclose(ap)); 894 } 895 896 /* 897 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 898 * 899 * The operating system has already ensured that the directory entry 900 * does not exist and done all appropriate namespace locking. 901 */ 902 static 903 int 904 hammer_vop_ncreate(struct vop_ncreate_args *ap) 905 { 906 struct hammer_transaction trans; 907 struct hammer_inode *dip; 908 struct hammer_inode *nip; 909 struct nchandle *nch; 910 hammer_mount_t hmp; 911 int error; 912 913 nch = ap->a_nch; 914 dip = VTOI(ap->a_dvp); 915 hmp = dip->hmp; 916 917 if (dip->flags & HAMMER_INODE_RO) 918 return (EROFS); 919 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 920 return (error); 921 922 /* 923 * Create a transaction to cover the operations we perform. 924 */ 925 lwkt_gettoken(&hmp->fs_token); 926 hammer_start_transaction(&trans, hmp); 927 ++hammer_stats_file_iopsw; 928 929 /* 930 * Create a new filesystem object of the requested type. 
The 931 * returned inode will be referenced and shared-locked to prevent 932 * it from being moved to the flusher. 933 */ 934 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 935 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 936 NULL, &nip); 937 if (error) { 938 hkprintf("hammer_create_inode error %d\n", error); 939 hammer_done_transaction(&trans); 940 *ap->a_vpp = NULL; 941 lwkt_reltoken(&hmp->fs_token); 942 return (error); 943 } 944 945 /* 946 * Add the new filesystem object to the directory. This will also 947 * bump the inode's link count. 948 */ 949 error = hammer_ip_add_directory(&trans, dip, 950 nch->ncp->nc_name, nch->ncp->nc_nlen, 951 nip); 952 if (error) 953 hkprintf("hammer_ip_add_directory error %d\n", error); 954 955 /* 956 * Finish up. 957 */ 958 if (error) { 959 hammer_rel_inode(nip, 0); 960 hammer_done_transaction(&trans); 961 *ap->a_vpp = NULL; 962 } else { 963 error = hammer_get_vnode(nip, ap->a_vpp); 964 hammer_done_transaction(&trans); 965 hammer_rel_inode(nip, 0); 966 if (error == 0) { 967 cache_setunresolved(ap->a_nch); 968 cache_setvp(ap->a_nch, *ap->a_vpp); 969 } 970 hammer_knote(ap->a_dvp, NOTE_WRITE); 971 } 972 lwkt_reltoken(&hmp->fs_token); 973 return (error); 974 } 975 976 /* 977 * hammer_vop_getattr { vp, vap } 978 * 979 * Retrieve an inode's attribute information. When accessing inodes 980 * historically we fake the atime field to ensure consistent results. 981 * The atime field is stored in the B-Tree element and allowed to be 982 * updated without cycling the element. 983 * 984 * MPSAFE - does not require fs_token 985 */ 986 static 987 int 988 hammer_vop_getattr(struct vop_getattr_args *ap) 989 { 990 struct hammer_inode *ip = VTOI(ap->a_vp); 991 struct vattr *vap = ap->a_vap; 992 993 /* 994 * We want the fsid to be different when accessing a filesystem 995 * with different as-of's so programs like diff don't think 996 * the files are the same. 
997 * 998 * We also want the fsid to be the same when comparing snapshots, 999 * or when comparing mirrors (which might be backed by different 1000 * physical devices). HAMMER fsids are based on the PFS's 1001 * shared_uuid field. 1002 * 1003 * XXX there is a chance of collision here. The va_fsid reported 1004 * by stat is different from the more involved fsid used in the 1005 * mount structure. 1006 */ 1007 ++hammer_stats_file_iopsr; 1008 hammer_lock_sh(&ip->lock); 1009 vap->va_fsid = ip->pfsm->fsid_udev ^ (uint32_t)ip->obj_asof ^ 1010 (uint32_t)(ip->obj_asof >> 32); 1011 1012 vap->va_fileid = ip->ino_leaf.base.obj_id; 1013 vap->va_mode = ip->ino_data.mode; 1014 vap->va_nlink = ip->ino_data.nlinks; 1015 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1016 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1017 vap->va_rmajor = 0; 1018 vap->va_rminor = 0; 1019 vap->va_size = ip->ino_data.size; 1020 1021 /* 1022 * Special case for @@PFS softlinks. The actual size of the 1023 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1024 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1025 * 1026 * Note that userspace hammer command does not allow users to 1027 * create a @@PFS softlink under an existing other PFS (id!=0) 1028 * so the ip localization here for @@PFS softlink is always 0. 1029 */ 1030 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1031 ip->ino_data.size == 10 && 1032 ip->obj_asof == HAMMER_MAX_TID && 1033 ip->obj_localization == HAMMER_DEF_LOCALIZATION && 1034 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1035 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1036 vap->va_size = 26; 1037 else 1038 vap->va_size = 10; 1039 } 1040 1041 /* 1042 * We must provide a consistent atime and mtime for snapshots 1043 * so people can do a 'tar cf - ... | md5' on them and get 1044 * consistent results. 
1045 */ 1046 if (ip->flags & HAMMER_INODE_RO) { 1047 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1048 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1049 } else { 1050 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1051 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1052 } 1053 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1054 vap->va_flags = ip->ino_data.uflags; 1055 vap->va_gen = 1; /* hammer inums are unique for all time */ 1056 vap->va_blocksize = HAMMER_BUFSIZE; 1057 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1058 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1059 ~HAMMER_XBUFMASK64; 1060 } else if (ip->ino_data.size > HAMMER_HBUFSIZE) { 1061 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1062 ~HAMMER_BUFMASK64; 1063 } else { 1064 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1065 } 1066 1067 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1068 vap->va_filerev = 0; /* XXX */ 1069 vap->va_uid_uuid = ip->ino_data.uid; 1070 vap->va_gid_uuid = ip->ino_data.gid; 1071 vap->va_fsid_uuid = ip->hmp->fsid; 1072 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1073 VA_FSID_UUID_VALID; 1074 1075 switch (ip->ino_data.obj_type) { 1076 case HAMMER_OBJTYPE_CDEV: 1077 case HAMMER_OBJTYPE_BDEV: 1078 vap->va_rmajor = ip->ino_data.rmajor; 1079 vap->va_rminor = ip->ino_data.rminor; 1080 break; 1081 default: 1082 break; 1083 } 1084 hammer_unlock(&ip->lock); 1085 return(0); 1086 } 1087 1088 /* 1089 * hammer_vop_nresolve { nch, dvp, cred } 1090 * 1091 * Locate the requested directory entry. 
*
 * The name being resolved may carry an "@@" extension.  The text after
 * the "@@" is handed to hammer_str_to_tid(), which can replace the as-of
 * TID and localization used for the lookup and can flag the name as a
 * PFS reference.  Only the part of the name before the "@@" is then
 * matched against directory entries.
 *
 * On success the vnode found is attached to the namecache handle with
 * cache_setvp() and then unlocked and released.  When the directory
 * scan ends with ENOENT a negative entry is recorded with
 * cache_setvp(nch, NULL) and ENOENT is returned.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_mount_t hmp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	uint32_t localization;
	uint32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;
	hmp = dip->hmp;

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Find the first "@@".  If the extension parses, i is left at the
	 * position of the "@@" so nlen below becomes the length of the
	 * plain name in front of it.  If it does not parse, i is forced
	 * to nlen so the whole name is looked up as an ordinary entry.
	 *
	 * NOTE(review): nc_name[i+1] is read with i up to nlen - 1, i.e.
	 * one byte past the name; presumably nc_name is NUL terminated -
	 * confirm against the namecache allocation.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			/* anything but a current (MAX_TID) view is read-only */
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	/*
	 * The cursor fields are filled in regardless of the init result;
	 * the scan itself only runs when the init succeeded (see the
	 * error == 0 test below) and hammer_done_cursor() is called on
	 * both paths.
	 */
	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization |
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			/* a match needs the same length and the same bytes */
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			hkprintf("WARNING: Missing inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* no such entry: record a negative namecache hit */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
1297 * 1298 * NOTE: as-of sequences are not linked into the directory structure. If 1299 * we are at the root with a different asof then the mount point, reload 1300 * the same directory with the mount point's asof. I'm not sure what this 1301 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1302 * get confused, but it hasn't been tested. 1303 */ 1304 static 1305 int 1306 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1307 { 1308 struct hammer_transaction trans; 1309 struct hammer_inode *dip; 1310 struct hammer_inode *ip; 1311 hammer_mount_t hmp; 1312 int64_t parent_obj_id; 1313 uint32_t parent_obj_localization; 1314 hammer_tid_t asof; 1315 int error; 1316 1317 dip = VTOI(ap->a_dvp); 1318 asof = dip->obj_asof; 1319 hmp = dip->hmp; 1320 1321 /* 1322 * Whos are parent? This could be the root of a pseudo-filesystem 1323 * whos parent is in another localization domain. 1324 */ 1325 lwkt_gettoken(&hmp->fs_token); 1326 parent_obj_id = dip->ino_data.parent_obj_id; 1327 if (dip->obj_id == HAMMER_OBJID_ROOT) 1328 parent_obj_localization = HAMMER_DEF_LOCALIZATION; 1329 else 1330 parent_obj_localization = dip->obj_localization; 1331 1332 /* 1333 * It's probably a PFS root when dip->ino_data.parent_obj_id is 0. 
1334 */ 1335 if (parent_obj_id == 0) { 1336 if (dip->obj_id == HAMMER_OBJID_ROOT && 1337 asof != hmp->asof) { 1338 parent_obj_id = dip->obj_id; 1339 asof = hmp->asof; 1340 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1341 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1342 (long long)dip->obj_asof); 1343 } else { 1344 *ap->a_vpp = NULL; 1345 lwkt_reltoken(&hmp->fs_token); 1346 return ENOENT; 1347 } 1348 } 1349 1350 hammer_simple_transaction(&trans, hmp); 1351 ++hammer_stats_file_iopsr; 1352 1353 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1354 asof, parent_obj_localization, 1355 dip->flags, &error); 1356 if (ip) { 1357 error = hammer_get_vnode(ip, ap->a_vpp); 1358 hammer_rel_inode(ip, 0); 1359 } else { 1360 *ap->a_vpp = NULL; 1361 } 1362 hammer_done_transaction(&trans); 1363 lwkt_reltoken(&hmp->fs_token); 1364 return (error); 1365 } 1366 1367 /* 1368 * hammer_vop_nlink { nch, dvp, vp, cred } 1369 */ 1370 static 1371 int 1372 hammer_vop_nlink(struct vop_nlink_args *ap) 1373 { 1374 struct hammer_transaction trans; 1375 struct hammer_inode *dip; 1376 struct hammer_inode *ip; 1377 struct nchandle *nch; 1378 hammer_mount_t hmp; 1379 int error; 1380 1381 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1382 return(EXDEV); 1383 1384 nch = ap->a_nch; 1385 dip = VTOI(ap->a_dvp); 1386 ip = VTOI(ap->a_vp); 1387 hmp = dip->hmp; 1388 1389 if (dip->obj_localization != ip->obj_localization) 1390 return(EXDEV); 1391 1392 if (dip->flags & HAMMER_INODE_RO) 1393 return (EROFS); 1394 if (ip->flags & HAMMER_INODE_RO) 1395 return (EROFS); 1396 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1397 return (error); 1398 1399 /* 1400 * Create a transaction to cover the operations we perform. 1401 */ 1402 lwkt_gettoken(&hmp->fs_token); 1403 hammer_start_transaction(&trans, hmp); 1404 ++hammer_stats_file_iopsw; 1405 1406 /* 1407 * Add the filesystem object to the directory. Note that neither 1408 * dip nor ip are referenced or locked, but their vnodes are 1409 * referenced. 
This function will bump the inode's link count. 1410 */ 1411 error = hammer_ip_add_directory(&trans, dip, 1412 nch->ncp->nc_name, nch->ncp->nc_nlen, 1413 ip); 1414 1415 /* 1416 * Finish up. 1417 */ 1418 if (error == 0) { 1419 cache_setunresolved(nch); 1420 cache_setvp(nch, ap->a_vp); 1421 } 1422 hammer_done_transaction(&trans); 1423 hammer_knote(ap->a_vp, NOTE_LINK); 1424 hammer_knote(ap->a_dvp, NOTE_WRITE); 1425 lwkt_reltoken(&hmp->fs_token); 1426 return (error); 1427 } 1428 1429 /* 1430 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1431 * 1432 * The operating system has already ensured that the directory entry 1433 * does not exist and done all appropriate namespace locking. 1434 */ 1435 static 1436 int 1437 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1438 { 1439 struct hammer_transaction trans; 1440 struct hammer_inode *dip; 1441 struct hammer_inode *nip; 1442 struct nchandle *nch; 1443 hammer_mount_t hmp; 1444 int error; 1445 1446 nch = ap->a_nch; 1447 dip = VTOI(ap->a_dvp); 1448 hmp = dip->hmp; 1449 1450 if (dip->flags & HAMMER_INODE_RO) 1451 return (EROFS); 1452 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1453 return (error); 1454 1455 /* 1456 * Create a transaction to cover the operations we perform. 1457 */ 1458 lwkt_gettoken(&hmp->fs_token); 1459 hammer_start_transaction(&trans, hmp); 1460 ++hammer_stats_file_iopsw; 1461 1462 /* 1463 * Create a new filesystem object of the requested type. The 1464 * returned inode will be referenced but not locked. 1465 */ 1466 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1467 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1468 NULL, &nip); 1469 if (error) { 1470 hkprintf("hammer_mkdir error %d\n", error); 1471 hammer_done_transaction(&trans); 1472 *ap->a_vpp = NULL; 1473 lwkt_reltoken(&hmp->fs_token); 1474 return (error); 1475 } 1476 /* 1477 * Add the new filesystem object to the directory. This will also 1478 * bump the inode's link count. 
1479 */ 1480 error = hammer_ip_add_directory(&trans, dip, 1481 nch->ncp->nc_name, nch->ncp->nc_nlen, 1482 nip); 1483 if (error) 1484 hkprintf("hammer_mkdir (add) error %d\n", error); 1485 1486 /* 1487 * Finish up. 1488 */ 1489 if (error) { 1490 hammer_rel_inode(nip, 0); 1491 *ap->a_vpp = NULL; 1492 } else { 1493 error = hammer_get_vnode(nip, ap->a_vpp); 1494 hammer_rel_inode(nip, 0); 1495 if (error == 0) { 1496 cache_setunresolved(ap->a_nch); 1497 cache_setvp(ap->a_nch, *ap->a_vpp); 1498 } 1499 } 1500 hammer_done_transaction(&trans); 1501 if (error == 0) 1502 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1503 lwkt_reltoken(&hmp->fs_token); 1504 return (error); 1505 } 1506 1507 /* 1508 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1509 * 1510 * The operating system has already ensured that the directory entry 1511 * does not exist and done all appropriate namespace locking. 1512 */ 1513 static 1514 int 1515 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1516 { 1517 struct hammer_transaction trans; 1518 struct hammer_inode *dip; 1519 struct hammer_inode *nip; 1520 struct nchandle *nch; 1521 hammer_mount_t hmp; 1522 int error; 1523 1524 nch = ap->a_nch; 1525 dip = VTOI(ap->a_dvp); 1526 hmp = dip->hmp; 1527 1528 if (dip->flags & HAMMER_INODE_RO) 1529 return (EROFS); 1530 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1531 return (error); 1532 1533 /* 1534 * Create a transaction to cover the operations we perform. 1535 */ 1536 lwkt_gettoken(&hmp->fs_token); 1537 hammer_start_transaction(&trans, hmp); 1538 ++hammer_stats_file_iopsw; 1539 1540 /* 1541 * Create a new filesystem object of the requested type. The 1542 * returned inode will be referenced but not locked. 1543 * 1544 * If mknod specifies a directory a pseudo-fs is created. 
1545 */ 1546 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1547 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1548 NULL, &nip); 1549 if (error) { 1550 hammer_done_transaction(&trans); 1551 *ap->a_vpp = NULL; 1552 lwkt_reltoken(&hmp->fs_token); 1553 return (error); 1554 } 1555 1556 /* 1557 * Add the new filesystem object to the directory. This will also 1558 * bump the inode's link count. 1559 */ 1560 error = hammer_ip_add_directory(&trans, dip, 1561 nch->ncp->nc_name, nch->ncp->nc_nlen, 1562 nip); 1563 1564 /* 1565 * Finish up. 1566 */ 1567 if (error) { 1568 hammer_rel_inode(nip, 0); 1569 *ap->a_vpp = NULL; 1570 } else { 1571 error = hammer_get_vnode(nip, ap->a_vpp); 1572 hammer_rel_inode(nip, 0); 1573 if (error == 0) { 1574 cache_setunresolved(ap->a_nch); 1575 cache_setvp(ap->a_nch, *ap->a_vpp); 1576 } 1577 } 1578 hammer_done_transaction(&trans); 1579 if (error == 0) 1580 hammer_knote(ap->a_dvp, NOTE_WRITE); 1581 lwkt_reltoken(&hmp->fs_token); 1582 return (error); 1583 } 1584 1585 /* 1586 * hammer_vop_open { vp, mode, cred, fp } 1587 * 1588 * MPSAFE (does not require fs_token) 1589 */ 1590 static 1591 int 1592 hammer_vop_open(struct vop_open_args *ap) 1593 { 1594 hammer_inode_t ip; 1595 1596 ++hammer_stats_file_iopsr; 1597 ip = VTOI(ap->a_vp); 1598 1599 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1600 return (EROFS); 1601 return(vop_stdopen(ap)); 1602 } 1603 1604 /* 1605 * hammer_vop_print { vp } 1606 */ 1607 static 1608 int 1609 hammer_vop_print(struct vop_print_args *ap) 1610 { 1611 return EOPNOTSUPP; 1612 } 1613 1614 /* 1615 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1616 */ 1617 static 1618 int 1619 hammer_vop_readdir(struct vop_readdir_args *ap) 1620 { 1621 struct hammer_transaction trans; 1622 struct hammer_cursor cursor; 1623 struct hammer_inode *ip; 1624 hammer_mount_t hmp; 1625 struct uio *uio; 1626 hammer_base_elm_t base; 1627 int error; 1628 int cookie_index; 1629 int ncookies; 1630 off_t 
*cookies; 1631 off_t saveoff; 1632 int r; 1633 int dtype; 1634 1635 ++hammer_stats_file_iopsr; 1636 ip = VTOI(ap->a_vp); 1637 uio = ap->a_uio; 1638 saveoff = uio->uio_offset; 1639 hmp = ip->hmp; 1640 1641 if (ap->a_ncookies) { 1642 ncookies = uio->uio_resid / 16 + 1; 1643 if (ncookies > 1024) 1644 ncookies = 1024; 1645 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1646 cookie_index = 0; 1647 } else { 1648 ncookies = -1; 1649 cookies = NULL; 1650 cookie_index = 0; 1651 } 1652 1653 lwkt_gettoken(&hmp->fs_token); 1654 hammer_simple_transaction(&trans, hmp); 1655 1656 /* 1657 * Handle artificial entries 1658 * 1659 * It should be noted that the minimum value for a directory 1660 * hash key on-media is 0x0000000100000000, so we can use anything 1661 * less then that to represent our 'special' key space. 1662 */ 1663 error = 0; 1664 if (saveoff == 0) { 1665 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1666 if (r) 1667 goto done; 1668 if (cookies) 1669 cookies[cookie_index] = saveoff; 1670 ++saveoff; 1671 ++cookie_index; 1672 if (cookie_index == ncookies) 1673 goto done; 1674 } 1675 if (saveoff == 1) { 1676 if (ip->ino_data.parent_obj_id) { 1677 r = vop_write_dirent(&error, uio, 1678 ip->ino_data.parent_obj_id, 1679 DT_DIR, 2, ".."); 1680 } else { 1681 r = vop_write_dirent(&error, uio, 1682 ip->obj_id, DT_DIR, 2, ".."); 1683 } 1684 if (r) 1685 goto done; 1686 if (cookies) 1687 cookies[cookie_index] = saveoff; 1688 ++saveoff; 1689 ++cookie_index; 1690 if (cookie_index == ncookies) 1691 goto done; 1692 } 1693 1694 /* 1695 * Key range (begin and end inclusive) to scan. Directory keys 1696 * directly translate to a 64 bit 'seek' position. 
1697 */ 1698 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1699 cursor.key_beg.localization = ip->obj_localization | 1700 hammer_dir_localization(ip); 1701 cursor.key_beg.obj_id = ip->obj_id; 1702 cursor.key_beg.create_tid = 0; 1703 cursor.key_beg.delete_tid = 0; 1704 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1705 cursor.key_beg.obj_type = 0; 1706 cursor.key_beg.key = saveoff; 1707 1708 cursor.key_end = cursor.key_beg; 1709 cursor.key_end.key = HAMMER_MAX_KEY; 1710 cursor.asof = ip->obj_asof; 1711 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1712 1713 error = hammer_ip_first(&cursor); 1714 1715 while (error == 0) { 1716 error = hammer_ip_resolve_data(&cursor); 1717 if (error) 1718 break; 1719 base = &cursor.leaf->base; 1720 saveoff = base->key; 1721 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1722 1723 if (base->obj_id != ip->obj_id) 1724 hpanic("bad record at %p", cursor.node); 1725 1726 /* 1727 * Convert pseudo-filesystems into softlinks 1728 */ 1729 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1730 r = vop_write_dirent( 1731 &error, uio, cursor.data->entry.obj_id, 1732 dtype, 1733 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1734 (void *)cursor.data->entry.name); 1735 if (r) 1736 break; 1737 ++saveoff; 1738 if (cookies) 1739 cookies[cookie_index] = base->key; 1740 ++cookie_index; 1741 if (cookie_index == ncookies) 1742 break; 1743 error = hammer_ip_next(&cursor); 1744 } 1745 hammer_done_cursor(&cursor); 1746 1747 done: 1748 hammer_done_transaction(&trans); 1749 1750 if (ap->a_eofflag) 1751 *ap->a_eofflag = (error == ENOENT); 1752 uio->uio_offset = saveoff; 1753 if (error && cookie_index == 0) { 1754 if (error == ENOENT) 1755 error = 0; 1756 if (cookies) { 1757 kfree(cookies, M_TEMP); 1758 *ap->a_ncookies = 0; 1759 *ap->a_cookies = NULL; 1760 } 1761 } else { 1762 if (error == ENOENT) 1763 error = 0; 1764 if (cookies) { 1765 *ap->a_ncookies = cookie_index; 1766 *ap->a_cookies = cookies; 1767 } 1768 } 
1769 lwkt_reltoken(&hmp->fs_token); 1770 return(error); 1771 } 1772 1773 /* 1774 * hammer_vop_readlink { vp, uio, cred } 1775 */ 1776 static 1777 int 1778 hammer_vop_readlink(struct vop_readlink_args *ap) 1779 { 1780 struct hammer_transaction trans; 1781 struct hammer_cursor cursor; 1782 struct hammer_inode *ip; 1783 hammer_mount_t hmp; 1784 char buf[32]; 1785 uint32_t localization; 1786 hammer_pseudofs_inmem_t pfsm; 1787 int error; 1788 1789 ip = VTOI(ap->a_vp); 1790 hmp = ip->hmp; 1791 1792 lwkt_gettoken(&hmp->fs_token); 1793 1794 /* 1795 * Shortcut if the symlink data was stuffed into ino_data. 1796 * 1797 * Also expand special "@@PFS%05d" softlinks (expansion only 1798 * occurs for non-historical (current) accesses made from the 1799 * primary filesystem). 1800 * 1801 * Note that userspace hammer command does not allow users to 1802 * create a @@PFS softlink under an existing other PFS (id!=0) 1803 * so the ip localization here for @@PFS softlink is always 0. 1804 */ 1805 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1806 char *ptr; 1807 int bytes; 1808 1809 ptr = ip->ino_data.ext.symlink; 1810 bytes = (int)ip->ino_data.size; 1811 if (bytes == 10 && 1812 ip->obj_asof == HAMMER_MAX_TID && 1813 ip->obj_localization == HAMMER_DEF_LOCALIZATION && 1814 strncmp(ptr, "@@PFS", 5) == 0) { 1815 hammer_simple_transaction(&trans, hmp); 1816 bcopy(ptr + 5, buf, 5); 1817 buf[5] = 0; 1818 localization = pfs_to_lo(strtoul(buf, NULL, 10)); 1819 pfsm = hammer_load_pseudofs(&trans, localization, 1820 &error); 1821 if (error == 0) { 1822 if (pfsm->pfsd.mirror_flags & 1823 HAMMER_PFSD_SLAVE) { 1824 /* vap->va_size == 26 */ 1825 ksnprintf(buf, sizeof(buf), 1826 "@@0x%016llx:%05d", 1827 (long long)pfsm->pfsd.sync_end_tid, 1828 lo_to_pfs(localization)); 1829 } else { 1830 /* vap->va_size == 10 */ 1831 ksnprintf(buf, sizeof(buf), 1832 "@@-1:%05d", 1833 lo_to_pfs(localization)); 1834 } 1835 ptr = buf; 1836 bytes = strlen(buf); 1837 } 1838 if (pfsm) 1839 hammer_rel_pseudofs(hmp, 
pfsm); 1840 hammer_done_transaction(&trans); 1841 } 1842 error = uiomove(ptr, bytes, ap->a_uio); 1843 lwkt_reltoken(&hmp->fs_token); 1844 return(error); 1845 } 1846 1847 /* 1848 * Long version 1849 */ 1850 hammer_simple_transaction(&trans, hmp); 1851 ++hammer_stats_file_iopsr; 1852 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1853 1854 /* 1855 * Key range (begin and end inclusive) to scan. Directory keys 1856 * directly translate to a 64 bit 'seek' position. 1857 */ 1858 cursor.key_beg.localization = ip->obj_localization | 1859 HAMMER_LOCALIZE_MISC; 1860 cursor.key_beg.obj_id = ip->obj_id; 1861 cursor.key_beg.create_tid = 0; 1862 cursor.key_beg.delete_tid = 0; 1863 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1864 cursor.key_beg.obj_type = 0; 1865 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1866 cursor.asof = ip->obj_asof; 1867 cursor.flags |= HAMMER_CURSOR_ASOF; 1868 1869 error = hammer_ip_lookup(&cursor); 1870 if (error == 0) { 1871 error = hammer_ip_resolve_data(&cursor); 1872 if (error == 0) { 1873 KKASSERT(cursor.leaf->data_len >= 1874 HAMMER_SYMLINK_NAME_OFF); 1875 error = uiomove(cursor.data->symlink.name, 1876 cursor.leaf->data_len - 1877 HAMMER_SYMLINK_NAME_OFF, 1878 ap->a_uio); 1879 } 1880 } 1881 hammer_done_cursor(&cursor); 1882 hammer_done_transaction(&trans); 1883 lwkt_reltoken(&hmp->fs_token); 1884 return(error); 1885 } 1886 1887 /* 1888 * hammer_vop_nremove { nch, dvp, cred } 1889 */ 1890 static 1891 int 1892 hammer_vop_nremove(struct vop_nremove_args *ap) 1893 { 1894 struct hammer_transaction trans; 1895 struct hammer_inode *dip; 1896 hammer_mount_t hmp; 1897 int error; 1898 1899 dip = VTOI(ap->a_dvp); 1900 hmp = dip->hmp; 1901 1902 if (hammer_nohistory(dip) == 0 && 1903 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1904 return (error); 1905 } 1906 1907 lwkt_gettoken(&hmp->fs_token); 1908 hammer_start_transaction(&trans, hmp); 1909 ++hammer_stats_file_iopsw; 1910 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 
ap->a_cred, 0, 0); 1911 hammer_done_transaction(&trans); 1912 if (error == 0) 1913 hammer_knote(ap->a_dvp, NOTE_WRITE); 1914 lwkt_reltoken(&hmp->fs_token); 1915 return (error); 1916 } 1917 1918 /* 1919 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1920 */ 1921 static 1922 int 1923 hammer_vop_nrename(struct vop_nrename_args *ap) 1924 { 1925 struct hammer_transaction trans; 1926 struct namecache *fncp; 1927 struct namecache *tncp; 1928 struct hammer_inode *fdip; 1929 struct hammer_inode *tdip; 1930 struct hammer_inode *ip; 1931 hammer_mount_t hmp; 1932 struct hammer_cursor cursor; 1933 int64_t namekey; 1934 uint32_t max_iterations; 1935 int nlen, error; 1936 1937 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1938 return(EXDEV); 1939 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1940 return(EXDEV); 1941 1942 fdip = VTOI(ap->a_fdvp); 1943 tdip = VTOI(ap->a_tdvp); 1944 fncp = ap->a_fnch->ncp; 1945 tncp = ap->a_tnch->ncp; 1946 ip = VTOI(fncp->nc_vp); 1947 KKASSERT(ip != NULL); 1948 1949 hmp = ip->hmp; 1950 1951 if (fdip->obj_localization != tdip->obj_localization) 1952 return(EXDEV); 1953 if (fdip->obj_localization != ip->obj_localization) 1954 return(EXDEV); 1955 1956 if (fdip->flags & HAMMER_INODE_RO) 1957 return (EROFS); 1958 if (tdip->flags & HAMMER_INODE_RO) 1959 return (EROFS); 1960 if (ip->flags & HAMMER_INODE_RO) 1961 return (EROFS); 1962 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1963 return (error); 1964 1965 lwkt_gettoken(&hmp->fs_token); 1966 hammer_start_transaction(&trans, hmp); 1967 ++hammer_stats_file_iopsw; 1968 1969 /* 1970 * Remove tncp from the target directory and then link ip as 1971 * tncp. XXX pass trans to dounlink 1972 * 1973 * Force the inode sync-time to match the transaction so it is 1974 * in-sync with the creation of the target directory entry. 
1975 */ 1976 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1977 ap->a_cred, 0, -1); 1978 if (error == 0 || error == ENOENT) { 1979 error = hammer_ip_add_directory(&trans, tdip, 1980 tncp->nc_name, tncp->nc_nlen, 1981 ip); 1982 if (error == 0) { 1983 ip->ino_data.parent_obj_id = tdip->obj_id; 1984 ip->ino_data.ctime = trans.time; 1985 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 1986 } 1987 } 1988 if (error) 1989 goto failed; /* XXX */ 1990 1991 /* 1992 * Locate the record in the originating directory and remove it. 1993 * 1994 * Calculate the namekey and setup the key range for the scan. This 1995 * works kinda like a chained hash table where the lower 32 bits 1996 * of the namekey synthesize the chain. 1997 * 1998 * The key range is inclusive of both key_beg and key_end. 1999 */ 2000 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 2001 &max_iterations); 2002 retry: 2003 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 2004 cursor.key_beg.localization = fdip->obj_localization | 2005 hammer_dir_localization(fdip); 2006 cursor.key_beg.obj_id = fdip->obj_id; 2007 cursor.key_beg.key = namekey; 2008 cursor.key_beg.create_tid = 0; 2009 cursor.key_beg.delete_tid = 0; 2010 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2011 cursor.key_beg.obj_type = 0; 2012 2013 cursor.key_end = cursor.key_beg; 2014 cursor.key_end.key += max_iterations; 2015 cursor.asof = fdip->obj_asof; 2016 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2017 2018 /* 2019 * Scan all matching records (the chain), locate the one matching 2020 * the requested path component. 2021 * 2022 * The hammer_ip_*() functions merge in-memory records with on-disk 2023 * records for the purposes of the search. 
2024 */ 2025 error = hammer_ip_first(&cursor); 2026 while (error == 0) { 2027 if (hammer_ip_resolve_data(&cursor) != 0) 2028 break; 2029 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2030 KKASSERT(nlen > 0); 2031 if (fncp->nc_nlen == nlen && 2032 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2033 break; 2034 } 2035 error = hammer_ip_next(&cursor); 2036 } 2037 2038 /* 2039 * If all is ok we have to get the inode so we can adjust nlinks. 2040 * 2041 * WARNING: hammer_ip_del_directory() may have to terminate the 2042 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2043 * twice. 2044 */ 2045 if (error == 0) 2046 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 2047 2048 /* 2049 * XXX A deadlock here will break rename's atomicy for the purposes 2050 * of crash recovery. 2051 */ 2052 if (error == EDEADLK) { 2053 hammer_done_cursor(&cursor); 2054 goto retry; 2055 } 2056 2057 /* 2058 * Cleanup and tell the kernel that the rename succeeded. 2059 * 2060 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2061 * without formally acquiring the vp since the vp might 2062 * have zero refs on it, or in the middle of a reclaim, 2063 * etc. 
2064 */ 2065 hammer_done_cursor(&cursor); 2066 if (error == 0) { 2067 cache_rename(ap->a_fnch, ap->a_tnch); 2068 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2069 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2070 while (ip->vp) { 2071 struct vnode *vp; 2072 2073 error = hammer_get_vnode(ip, &vp); 2074 if (error == 0 && vp) { 2075 vn_unlock(vp); 2076 hammer_knote(ip->vp, NOTE_RENAME); 2077 vrele(vp); 2078 break; 2079 } 2080 hdkprintf("ip/vp race2 avoided\n"); 2081 } 2082 } 2083 2084 failed: 2085 hammer_done_transaction(&trans); 2086 lwkt_reltoken(&hmp->fs_token); 2087 return (error); 2088 } 2089 2090 /* 2091 * hammer_vop_nrmdir { nch, dvp, cred } 2092 */ 2093 static 2094 int 2095 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2096 { 2097 struct hammer_transaction trans; 2098 struct hammer_inode *dip; 2099 hammer_mount_t hmp; 2100 int error; 2101 2102 dip = VTOI(ap->a_dvp); 2103 hmp = dip->hmp; 2104 2105 if (hammer_nohistory(dip) == 0 && 2106 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2107 return (error); 2108 } 2109 2110 lwkt_gettoken(&hmp->fs_token); 2111 hammer_start_transaction(&trans, hmp); 2112 ++hammer_stats_file_iopsw; 2113 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2114 hammer_done_transaction(&trans); 2115 if (error == 0) 2116 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2117 lwkt_reltoken(&hmp->fs_token); 2118 return (error); 2119 } 2120 2121 /* 2122 * hammer_vop_markatime { vp, cred } 2123 */ 2124 static 2125 int 2126 hammer_vop_markatime(struct vop_markatime_args *ap) 2127 { 2128 struct hammer_transaction trans; 2129 struct hammer_inode *ip; 2130 hammer_mount_t hmp; 2131 2132 ip = VTOI(ap->a_vp); 2133 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2134 return (EROFS); 2135 if (ip->flags & HAMMER_INODE_RO) 2136 return (EROFS); 2137 hmp = ip->hmp; 2138 if (hmp->mp->mnt_flag & MNT_NOATIME) 2139 return (0); 2140 lwkt_gettoken(&hmp->fs_token); 2141 hammer_start_transaction(&trans, hmp); 2142 ++hammer_stats_file_iopsw; 
2143 2144 ip->ino_data.atime = trans.time; 2145 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2146 hammer_done_transaction(&trans); 2147 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2148 lwkt_reltoken(&hmp->fs_token); 2149 return (0); 2150 } 2151 2152 /* 2153 * hammer_vop_setattr { vp, vap, cred } 2154 */ 2155 static 2156 int 2157 hammer_vop_setattr(struct vop_setattr_args *ap) 2158 { 2159 struct hammer_transaction trans; 2160 struct hammer_inode *ip; 2161 struct vattr *vap; 2162 hammer_mount_t hmp; 2163 int modflags; 2164 int error; 2165 int truncating; 2166 int blksize; 2167 int kflags; 2168 #if 0 2169 int64_t aligned_size; 2170 #endif 2171 uint32_t flags; 2172 2173 vap = ap->a_vap; 2174 ip = ap->a_vp->v_data; 2175 modflags = 0; 2176 kflags = 0; 2177 hmp = ip->hmp; 2178 2179 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2180 return(EROFS); 2181 if (ip->flags & HAMMER_INODE_RO) 2182 return (EROFS); 2183 if (hammer_nohistory(ip) == 0 && 2184 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2185 return (error); 2186 } 2187 2188 lwkt_gettoken(&hmp->fs_token); 2189 hammer_start_transaction(&trans, hmp); 2190 ++hammer_stats_file_iopsw; 2191 error = 0; 2192 2193 if (vap->va_flags != VNOVAL) { 2194 flags = ip->ino_data.uflags; 2195 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2196 hammer_to_unix_xid(&ip->ino_data.uid), 2197 ap->a_cred); 2198 if (error == 0) { 2199 if (ip->ino_data.uflags != flags) { 2200 ip->ino_data.uflags = flags; 2201 ip->ino_data.ctime = trans.time; 2202 modflags |= HAMMER_INODE_DDIRTY; 2203 kflags |= NOTE_ATTRIB; 2204 } 2205 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2206 error = 0; 2207 goto done; 2208 } 2209 } 2210 goto done; 2211 } 2212 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2213 error = EPERM; 2214 goto done; 2215 } 2216 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2217 mode_t cur_mode = ip->ino_data.mode; 2218 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2219 gid_t 
cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2220 uuid_t uuid_uid; 2221 uuid_t uuid_gid; 2222 2223 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2224 ap->a_cred, 2225 &cur_uid, &cur_gid, &cur_mode); 2226 if (error == 0) { 2227 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2228 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2229 if (bcmp(&uuid_uid, &ip->ino_data.uid, 2230 sizeof(uuid_uid)) || 2231 bcmp(&uuid_gid, &ip->ino_data.gid, 2232 sizeof(uuid_gid)) || 2233 ip->ino_data.mode != cur_mode) { 2234 ip->ino_data.uid = uuid_uid; 2235 ip->ino_data.gid = uuid_gid; 2236 ip->ino_data.mode = cur_mode; 2237 ip->ino_data.ctime = trans.time; 2238 modflags |= HAMMER_INODE_DDIRTY; 2239 } 2240 kflags |= NOTE_ATTRIB; 2241 } 2242 } 2243 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2244 switch(ap->a_vp->v_type) { 2245 case VREG: 2246 if (vap->va_size == ip->ino_data.size) 2247 break; 2248 2249 /* 2250 * Log the operation if in fast-fsync mode or if 2251 * there are unterminated redo write records present. 2252 * 2253 * The second check is needed so the recovery code 2254 * properly truncates write redos even if nominal 2255 * REDO operations is turned off due to excessive 2256 * writes, because the related records might be 2257 * destroyed and never lay down a TERM_WRITE. 2258 */ 2259 if ((ip->flags & HAMMER_INODE_REDO) || 2260 (ip->flags & HAMMER_INODE_RDIRTY)) { 2261 error = hammer_generate_redo(&trans, ip, 2262 vap->va_size, 2263 HAMMER_REDO_TRUNC, 2264 NULL, 0); 2265 } 2266 blksize = hammer_blocksize(vap->va_size); 2267 2268 /* 2269 * XXX break atomicy, we can deadlock the backend 2270 * if we do not release the lock. Probably not a 2271 * big deal here. 
2272 */ 2273 if (vap->va_size < ip->ino_data.size) { 2274 nvtruncbuf(ap->a_vp, vap->va_size, 2275 blksize, 2276 hammer_blockoff(vap->va_size), 2277 0); 2278 truncating = 1; 2279 kflags |= NOTE_WRITE; 2280 } else { 2281 nvextendbuf(ap->a_vp, 2282 ip->ino_data.size, 2283 vap->va_size, 2284 hammer_blocksize(ip->ino_data.size), 2285 hammer_blocksize(vap->va_size), 2286 hammer_blockoff(ip->ino_data.size), 2287 hammer_blockoff(vap->va_size), 2288 0); 2289 truncating = 0; 2290 kflags |= NOTE_WRITE | NOTE_EXTEND; 2291 } 2292 ip->ino_data.size = vap->va_size; 2293 ip->ino_data.mtime = trans.time; 2294 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2295 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2296 2297 /* 2298 * On-media truncation is cached in the inode until 2299 * the inode is synchronized. We must immediately 2300 * handle any frontend records. 2301 */ 2302 if (truncating) { 2303 hammer_ip_frontend_trunc(ip, vap->va_size); 2304 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2305 ip->flags |= HAMMER_INODE_TRUNCATED; 2306 ip->trunc_off = vap->va_size; 2307 hammer_inode_dirty(ip); 2308 } else if (ip->trunc_off > vap->va_size) { 2309 ip->trunc_off = vap->va_size; 2310 } 2311 } 2312 2313 #if 0 2314 /* 2315 * When truncating, nvtruncbuf() may have cleaned out 2316 * a portion of the last block on-disk in the buffer 2317 * cache. We must clean out any frontend records 2318 * for blocks beyond the new last block. 
2319 */ 2320 aligned_size = (vap->va_size + (blksize - 1)) & 2321 ~(int64_t)(blksize - 1); 2322 if (truncating && vap->va_size < aligned_size) { 2323 aligned_size -= blksize; 2324 hammer_ip_frontend_trunc(ip, aligned_size); 2325 } 2326 #endif 2327 break; 2328 case VDATABASE: 2329 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2330 ip->flags |= HAMMER_INODE_TRUNCATED; 2331 ip->trunc_off = vap->va_size; 2332 hammer_inode_dirty(ip); 2333 } else if (ip->trunc_off > vap->va_size) { 2334 ip->trunc_off = vap->va_size; 2335 } 2336 hammer_ip_frontend_trunc(ip, vap->va_size); 2337 ip->ino_data.size = vap->va_size; 2338 ip->ino_data.mtime = trans.time; 2339 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2340 kflags |= NOTE_ATTRIB; 2341 break; 2342 default: 2343 error = EINVAL; 2344 goto done; 2345 } 2346 break; 2347 } 2348 if (vap->va_atime.tv_sec != VNOVAL) { 2349 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2350 modflags |= HAMMER_INODE_ATIME; 2351 kflags |= NOTE_ATTRIB; 2352 } 2353 if (vap->va_mtime.tv_sec != VNOVAL) { 2354 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2355 modflags |= HAMMER_INODE_MTIME; 2356 kflags |= NOTE_ATTRIB; 2357 } 2358 if (vap->va_mode != (mode_t)VNOVAL) { 2359 mode_t cur_mode = ip->ino_data.mode; 2360 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2361 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2362 2363 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2364 cur_uid, cur_gid, &cur_mode); 2365 if (error == 0 && ip->ino_data.mode != cur_mode) { 2366 ip->ino_data.mode = cur_mode; 2367 ip->ino_data.ctime = trans.time; 2368 modflags |= HAMMER_INODE_DDIRTY; 2369 kflags |= NOTE_ATTRIB; 2370 } 2371 } 2372 done: 2373 if (error == 0) 2374 hammer_modify_inode(&trans, ip, modflags); 2375 hammer_done_transaction(&trans); 2376 hammer_knote(ap->a_vp, kflags); 2377 lwkt_reltoken(&hmp->fs_token); 2378 return (error); 2379 } 2380 2381 /* 2382 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, 
target } 2383 */ 2384 static 2385 int 2386 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2387 { 2388 struct hammer_transaction trans; 2389 struct hammer_inode *dip; 2390 struct hammer_inode *nip; 2391 hammer_record_t record; 2392 struct nchandle *nch; 2393 hammer_mount_t hmp; 2394 int error; 2395 int bytes; 2396 2397 ap->a_vap->va_type = VLNK; 2398 2399 nch = ap->a_nch; 2400 dip = VTOI(ap->a_dvp); 2401 hmp = dip->hmp; 2402 2403 if (dip->flags & HAMMER_INODE_RO) 2404 return (EROFS); 2405 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2406 return (error); 2407 2408 /* 2409 * Create a transaction to cover the operations we perform. 2410 */ 2411 lwkt_gettoken(&hmp->fs_token); 2412 hammer_start_transaction(&trans, hmp); 2413 ++hammer_stats_file_iopsw; 2414 2415 /* 2416 * Create a new filesystem object of the requested type. The 2417 * returned inode will be referenced but not locked. 2418 */ 2419 2420 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2421 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2422 NULL, &nip); 2423 if (error) { 2424 hammer_done_transaction(&trans); 2425 *ap->a_vpp = NULL; 2426 lwkt_reltoken(&hmp->fs_token); 2427 return (error); 2428 } 2429 2430 /* 2431 * Add a record representing the symlink. symlink stores the link 2432 * as pure data, not a string, and is no \0 terminated. 
2433 */ 2434 if (error == 0) { 2435 bytes = strlen(ap->a_target); 2436 2437 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2438 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2439 } else { 2440 record = hammer_alloc_mem_record(nip, bytes); 2441 record->type = HAMMER_MEM_RECORD_GENERAL; 2442 2443 record->leaf.base.localization = nip->obj_localization | 2444 HAMMER_LOCALIZE_MISC; 2445 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2446 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2447 record->leaf.data_len = bytes; 2448 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2449 bcopy(ap->a_target, record->data->symlink.name, bytes); 2450 error = hammer_ip_add_record(&trans, record); 2451 } 2452 2453 /* 2454 * Set the file size to the length of the link. 2455 */ 2456 if (error == 0) { 2457 nip->ino_data.size = bytes; 2458 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2459 } 2460 } 2461 if (error == 0) 2462 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2463 nch->ncp->nc_nlen, nip); 2464 2465 /* 2466 * Finish up. 
2467 */ 2468 if (error) { 2469 hammer_rel_inode(nip, 0); 2470 *ap->a_vpp = NULL; 2471 } else { 2472 error = hammer_get_vnode(nip, ap->a_vpp); 2473 hammer_rel_inode(nip, 0); 2474 if (error == 0) { 2475 cache_setunresolved(ap->a_nch); 2476 cache_setvp(ap->a_nch, *ap->a_vpp); 2477 hammer_knote(ap->a_dvp, NOTE_WRITE); 2478 } 2479 } 2480 hammer_done_transaction(&trans); 2481 lwkt_reltoken(&hmp->fs_token); 2482 return (error); 2483 } 2484 2485 /* 2486 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2487 */ 2488 static 2489 int 2490 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2491 { 2492 struct hammer_transaction trans; 2493 struct hammer_inode *dip; 2494 hammer_mount_t hmp; 2495 int error; 2496 2497 dip = VTOI(ap->a_dvp); 2498 hmp = dip->hmp; 2499 2500 if (hammer_nohistory(dip) == 0 && 2501 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2502 return (error); 2503 } 2504 2505 lwkt_gettoken(&hmp->fs_token); 2506 hammer_start_transaction(&trans, hmp); 2507 ++hammer_stats_file_iopsw; 2508 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2509 ap->a_cred, ap->a_flags, -1); 2510 hammer_done_transaction(&trans); 2511 lwkt_reltoken(&hmp->fs_token); 2512 2513 return (error); 2514 } 2515 2516 /* 2517 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2518 */ 2519 static 2520 int 2521 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2522 { 2523 struct hammer_inode *ip = ap->a_vp->v_data; 2524 hammer_mount_t hmp = ip->hmp; 2525 int error; 2526 2527 ++hammer_stats_file_iopsr; 2528 lwkt_gettoken(&hmp->fs_token); 2529 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2530 ap->a_fflag, ap->a_cred); 2531 lwkt_reltoken(&hmp->fs_token); 2532 return (error); 2533 } 2534 2535 static 2536 int 2537 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2538 { 2539 static const struct mountctl_opt extraopt[] = { 2540 { HMNT_NOHISTORY, "nohistory" }, 2541 { HMNT_MASTERID, "master" }, 2542 { HMNT_NOMIRROR, "nomirror" }, 2543 { 0, NULL} 2544 2545 }; 2546 struct hammer_mount 
*hmp; 2547 struct mount *mp; 2548 int usedbytes; 2549 int error; 2550 2551 error = 0; 2552 usedbytes = 0; 2553 mp = ap->a_head.a_ops->head.vv_mount; 2554 KKASSERT(mp->mnt_data != NULL); 2555 hmp = (struct hammer_mount *)mp->mnt_data; 2556 2557 lwkt_gettoken(&hmp->fs_token); 2558 2559 switch(ap->a_op) { 2560 case MOUNTCTL_SET_EXPORT: 2561 if (ap->a_ctllen != sizeof(struct export_args)) 2562 error = EINVAL; 2563 else 2564 error = hammer_vfs_export(mp, ap->a_op, 2565 (const struct export_args *)ap->a_ctl); 2566 break; 2567 case MOUNTCTL_MOUNTFLAGS: 2568 /* 2569 * Call standard mountctl VOP function 2570 * so we get user mount flags. 2571 */ 2572 error = vop_stdmountctl(ap); 2573 if (error) 2574 break; 2575 2576 usedbytes = *ap->a_res; 2577 2578 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2579 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2580 ap->a_buf, 2581 ap->a_buflen - usedbytes, 2582 &error); 2583 } 2584 2585 *ap->a_res += usedbytes; 2586 break; 2587 default: 2588 error = vop_stdmountctl(ap); 2589 break; 2590 } 2591 lwkt_reltoken(&hmp->fs_token); 2592 return(error); 2593 } 2594 2595 /* 2596 * hammer_vop_strategy { vp, bio } 2597 * 2598 * Strategy call, used for regular file read & write only. Note that the 2599 * bp may represent a cluster. 2600 * 2601 * To simplify operation and allow better optimizations in the future, 2602 * this code does not make any assumptions with regards to buffer alignment 2603 * or size. 
2604 */ 2605 static 2606 int 2607 hammer_vop_strategy(struct vop_strategy_args *ap) 2608 { 2609 struct buf *bp; 2610 int error; 2611 2612 bp = ap->a_bio->bio_buf; 2613 2614 switch(bp->b_cmd) { 2615 case BUF_CMD_READ: 2616 error = hammer_vop_strategy_read(ap); 2617 break; 2618 case BUF_CMD_WRITE: 2619 error = hammer_vop_strategy_write(ap); 2620 break; 2621 default: 2622 bp->b_error = error = EINVAL; 2623 bp->b_flags |= B_ERROR; 2624 biodone(ap->a_bio); 2625 break; 2626 } 2627 2628 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2629 2630 return (error); 2631 } 2632 2633 /* 2634 * Read from a regular file. Iterate the related records and fill in the 2635 * BIO/BUF. Gaps are zero-filled. 2636 * 2637 * The support code in hammer_object.c should be used to deal with mixed 2638 * in-memory and on-disk records. 2639 * 2640 * NOTE: Can be called from the cluster code with an oversized buf. 2641 * 2642 * XXX atime update 2643 */ 2644 static 2645 int 2646 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2647 { 2648 struct hammer_transaction trans; 2649 struct hammer_inode *ip; 2650 struct hammer_inode *dip; 2651 hammer_mount_t hmp; 2652 struct hammer_cursor cursor; 2653 hammer_base_elm_t base; 2654 hammer_off_t disk_offset; 2655 struct bio *bio; 2656 struct bio *nbio; 2657 struct buf *bp; 2658 int64_t rec_offset; 2659 int64_t ran_end; 2660 int64_t tmp64; 2661 int error; 2662 int boff; 2663 int roff; 2664 int n; 2665 int isdedupable; 2666 2667 bio = ap->a_bio; 2668 bp = bio->bio_buf; 2669 ip = ap->a_vp->v_data; 2670 hmp = ip->hmp; 2671 2672 /* 2673 * The zone-2 disk offset may have been set by the cluster code via 2674 * a BMAP operation, or else should be NOOFFSET. 2675 * 2676 * Checking the high bits for a match against zone-2 should suffice. 2677 * 2678 * In cases where a lot of data duplication is present it may be 2679 * more beneficial to drop through and doubule-buffer through the 2680 * device. 
2681 */ 2682 nbio = push_bio(bio); 2683 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2684 HAMMER_ZONE_LARGE_DATA) { 2685 if (hammer_double_buffer == 0) { 2686 lwkt_gettoken(&hmp->fs_token); 2687 error = hammer_io_direct_read(hmp, nbio, NULL); 2688 lwkt_reltoken(&hmp->fs_token); 2689 return (error); 2690 } 2691 2692 /* 2693 * Try to shortcut requests for double_buffer mode too. 2694 * Since this mode runs through the device buffer cache 2695 * only compatible buffer sizes (meaning those generated 2696 * by normal filesystem buffers) are legal. 2697 */ 2698 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) { 2699 lwkt_gettoken(&hmp->fs_token); 2700 error = hammer_io_indirect_read(hmp, nbio, NULL); 2701 lwkt_reltoken(&hmp->fs_token); 2702 return (error); 2703 } 2704 } 2705 2706 /* 2707 * Well, that sucked. Do it the hard way. If all the stars are 2708 * aligned we may still be able to issue a direct-read. 2709 */ 2710 lwkt_gettoken(&hmp->fs_token); 2711 hammer_simple_transaction(&trans, hmp); 2712 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2713 2714 /* 2715 * Key range (begin and end inclusive) to scan. Note that the key's 2716 * stored in the actual records represent BASE+LEN, not BASE. The 2717 * first record containing bio_offset will have a key > bio_offset. 
2718 */ 2719 cursor.key_beg.localization = ip->obj_localization | 2720 HAMMER_LOCALIZE_MISC; 2721 cursor.key_beg.obj_id = ip->obj_id; 2722 cursor.key_beg.create_tid = 0; 2723 cursor.key_beg.delete_tid = 0; 2724 cursor.key_beg.obj_type = 0; 2725 cursor.key_beg.key = bio->bio_offset + 1; 2726 cursor.asof = ip->obj_asof; 2727 cursor.flags |= HAMMER_CURSOR_ASOF; 2728 2729 cursor.key_end = cursor.key_beg; 2730 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2731 #if 0 2732 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2733 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2734 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2735 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2736 } else 2737 #endif 2738 { 2739 ran_end = bio->bio_offset + bp->b_bufsize; 2740 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2741 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2742 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2743 if (tmp64 < ran_end) 2744 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2745 else 2746 cursor.key_end.key = ran_end + MAXPHYS + 1; 2747 } 2748 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2749 2750 /* 2751 * Set NOSWAPCACHE for cursor data extraction if double buffering 2752 * is disabled or (if the file is not marked cacheable via chflags 2753 * and vm.swapcache_use_chflags is enabled). 2754 */ 2755 if (hammer_double_buffer == 0 || 2756 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 && 2757 vm_swapcache_use_chflags)) { 2758 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE; 2759 } 2760 2761 error = hammer_ip_first(&cursor); 2762 boff = 0; 2763 2764 while (error == 0) { 2765 /* 2766 * Get the base file offset of the record. The key for 2767 * data records is (base + bytes) rather then (base). 2768 */ 2769 base = &cursor.leaf->base; 2770 rec_offset = base->key - cursor.leaf->data_len; 2771 2772 /* 2773 * Calculate the gap, if any, and zero-fill it. 2774 * 2775 * n is the offset of the start of the record verses our 2776 * current seek offset in the bio. 
2777 */ 2778 n = (int)(rec_offset - (bio->bio_offset + boff)); 2779 if (n > 0) { 2780 if (n > bp->b_bufsize - boff) 2781 n = bp->b_bufsize - boff; 2782 bzero((char *)bp->b_data + boff, n); 2783 boff += n; 2784 n = 0; 2785 } 2786 2787 /* 2788 * Calculate the data offset in the record and the number 2789 * of bytes we can copy. 2790 * 2791 * There are two degenerate cases. First, boff may already 2792 * be at bp->b_bufsize. Secondly, the data offset within 2793 * the record may exceed the record's size. 2794 */ 2795 roff = -n; 2796 rec_offset += roff; 2797 n = cursor.leaf->data_len - roff; 2798 if (n <= 0) { 2799 hdkprintf("bad n=%d roff=%d\n", n, roff); 2800 n = 0; 2801 } else if (n > bp->b_bufsize - boff) { 2802 n = bp->b_bufsize - boff; 2803 } 2804 2805 /* 2806 * Deal with cached truncations. This cool bit of code 2807 * allows truncate()/ftruncate() to avoid having to sync 2808 * the file. 2809 * 2810 * If the frontend is truncated then all backend records are 2811 * subject to the frontend's truncation. 2812 * 2813 * If the backend is truncated then backend records on-disk 2814 * (but not in-memory) are subject to the backend's 2815 * truncation. In-memory records owned by the backend 2816 * represent data written after the truncation point on the 2817 * backend and must not be truncated. 2818 * 2819 * Truncate operations deal with frontend buffer cache 2820 * buffers and frontend-owned in-memory records synchronously. 
2821 */ 2822 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2823 if (hammer_cursor_ondisk(&cursor)/* || 2824 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2825 if (ip->trunc_off <= rec_offset) 2826 n = 0; 2827 else if (ip->trunc_off < rec_offset + n) 2828 n = (int)(ip->trunc_off - rec_offset); 2829 } 2830 } 2831 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2832 if (hammer_cursor_ondisk(&cursor)) { 2833 if (ip->sync_trunc_off <= rec_offset) 2834 n = 0; 2835 else if (ip->sync_trunc_off < rec_offset + n) 2836 n = (int)(ip->sync_trunc_off - rec_offset); 2837 } 2838 } 2839 2840 /* 2841 * Try to issue a direct read into our bio if possible, 2842 * otherwise resolve the element data into a hammer_buffer 2843 * and copy. 2844 * 2845 * The buffer on-disk should be zerod past any real 2846 * truncation point, but may not be for any synthesized 2847 * truncation point from above. 2848 * 2849 * NOTE: disk_offset is only valid if the cursor data is 2850 * on-disk. 2851 */ 2852 disk_offset = cursor.leaf->data_offset + roff; 2853 isdedupable = (boff == 0 && n == bp->b_bufsize && 2854 hammer_cursor_ondisk(&cursor) && 2855 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2856 2857 if (isdedupable && hammer_double_buffer == 0) { 2858 /* 2859 * Direct read case 2860 */ 2861 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2862 HAMMER_ZONE_LARGE_DATA); 2863 nbio->bio_offset = disk_offset; 2864 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2865 if (hammer_live_dedup && error == 0) 2866 hammer_dedup_cache_add(ip, cursor.leaf); 2867 goto done; 2868 } else if (isdedupable) { 2869 /* 2870 * Async I/O case for reading from backing store 2871 * and copying the data to the filesystem buffer. 2872 * live-dedup has to verify the data anyway if it 2873 * gets a hit later so we can just add the entry 2874 * now. 
2875 */ 2876 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2877 HAMMER_ZONE_LARGE_DATA); 2878 nbio->bio_offset = disk_offset; 2879 if (hammer_live_dedup) 2880 hammer_dedup_cache_add(ip, cursor.leaf); 2881 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf); 2882 goto done; 2883 } else if (n) { 2884 error = hammer_ip_resolve_data(&cursor); 2885 if (error == 0) { 2886 if (hammer_live_dedup && isdedupable) 2887 hammer_dedup_cache_add(ip, cursor.leaf); 2888 bcopy((char *)cursor.data + roff, 2889 (char *)bp->b_data + boff, n); 2890 } 2891 } 2892 if (error) 2893 break; 2894 2895 /* 2896 * We have to be sure that the only elements added to the 2897 * dedup cache are those which are already on-media. 2898 */ 2899 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) 2900 hammer_dedup_cache_add(ip, cursor.leaf); 2901 2902 /* 2903 * Iterate until we have filled the request. 2904 */ 2905 boff += n; 2906 if (boff == bp->b_bufsize) 2907 break; 2908 error = hammer_ip_next(&cursor); 2909 } 2910 2911 /* 2912 * There may have been a gap after the last record 2913 */ 2914 if (error == ENOENT) 2915 error = 0; 2916 if (error == 0 && boff != bp->b_bufsize) { 2917 KKASSERT(boff < bp->b_bufsize); 2918 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2919 /* boff = bp->b_bufsize; */ 2920 } 2921 2922 /* 2923 * Disallow swapcache operation on the vnode buffer if double 2924 * buffering is enabled, the swapcache will get the data via 2925 * the block device buffer. 2926 */ 2927 if (hammer_double_buffer) 2928 bp->b_flags |= B_NOTMETA; 2929 2930 /* 2931 * Cleanup 2932 */ 2933 bp->b_resid = 0; 2934 bp->b_error = error; 2935 if (error) 2936 bp->b_flags |= B_ERROR; 2937 biodone(ap->a_bio); 2938 2939 done: 2940 /* 2941 * Cache the b-tree node for the last data read in cache[1]. 
2942 * 2943 * If we hit the file EOF then also cache the node in the 2944 * governing directory's cache[3], it will be used to initialize 2945 * the new inode's cache[1] for any inodes looked up via the directory. 2946 * 2947 * This doesn't reduce disk accesses since the B-Tree chain is 2948 * likely cached, but it does reduce cpu overhead when looking 2949 * up file offsets for cpdup/tar/cpio style iterations. 2950 */ 2951 if (cursor.node) 2952 hammer_cache_node(&ip->cache[1], cursor.node); 2953 if (ran_end >= ip->ino_data.size) { 2954 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 2955 ip->obj_asof, ip->obj_localization); 2956 if (dip) { 2957 hammer_cache_node(&dip->cache[3], cursor.node); 2958 hammer_rel_inode(dip, 0); 2959 } 2960 } 2961 hammer_done_cursor(&cursor); 2962 hammer_done_transaction(&trans); 2963 lwkt_reltoken(&hmp->fs_token); 2964 return(error); 2965 } 2966 2967 /* 2968 * BMAP operation - used to support cluster_read() only. 2969 * 2970 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 2971 * 2972 * This routine may return EOPNOTSUPP if the opration is not supported for 2973 * the specified offset. The contents of the pointer arguments do not 2974 * need to be initialized in that case. 2975 * 2976 * If a disk address is available and properly aligned return 0 with 2977 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 2978 * to the run-length relative to that offset. Callers may assume that 2979 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 2980 * large, so return EOPNOTSUPP if it is not sufficiently large. 
2981 */ 2982 static 2983 int 2984 hammer_vop_bmap(struct vop_bmap_args *ap) 2985 { 2986 struct hammer_transaction trans; 2987 struct hammer_inode *ip; 2988 hammer_mount_t hmp; 2989 struct hammer_cursor cursor; 2990 hammer_base_elm_t base; 2991 int64_t rec_offset; 2992 int64_t ran_end; 2993 int64_t tmp64; 2994 int64_t base_offset; 2995 int64_t base_disk_offset; 2996 int64_t last_offset; 2997 hammer_off_t last_disk_offset; 2998 hammer_off_t disk_offset; 2999 int rec_len; 3000 int error; 3001 int blksize; 3002 3003 ++hammer_stats_file_iopsr; 3004 ip = ap->a_vp->v_data; 3005 hmp = ip->hmp; 3006 3007 /* 3008 * We can only BMAP regular files. We can't BMAP database files, 3009 * directories, etc. 3010 */ 3011 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 3012 return(EOPNOTSUPP); 3013 3014 /* 3015 * bmap is typically called with runp/runb both NULL when used 3016 * for writing. We do not support BMAP for writing atm. 3017 */ 3018 if (ap->a_cmd != BUF_CMD_READ) 3019 return(EOPNOTSUPP); 3020 3021 /* 3022 * Scan the B-Tree to acquire blockmap addresses, then translate 3023 * to raw addresses. 3024 */ 3025 lwkt_gettoken(&hmp->fs_token); 3026 hammer_simple_transaction(&trans, hmp); 3027 3028 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 3029 3030 /* 3031 * Key range (begin and end inclusive) to scan. Note that the key's 3032 * stored in the actual records represent BASE+LEN, not BASE. The 3033 * first record containing bio_offset will have a key > bio_offset. 
3034 */ 3035 cursor.key_beg.localization = ip->obj_localization | 3036 HAMMER_LOCALIZE_MISC; 3037 cursor.key_beg.obj_id = ip->obj_id; 3038 cursor.key_beg.create_tid = 0; 3039 cursor.key_beg.delete_tid = 0; 3040 cursor.key_beg.obj_type = 0; 3041 if (ap->a_runb) 3042 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 3043 else 3044 cursor.key_beg.key = ap->a_loffset + 1; 3045 if (cursor.key_beg.key < 0) 3046 cursor.key_beg.key = 0; 3047 cursor.asof = ip->obj_asof; 3048 cursor.flags |= HAMMER_CURSOR_ASOF; 3049 3050 cursor.key_end = cursor.key_beg; 3051 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 3052 3053 ran_end = ap->a_loffset + MAXPHYS; 3054 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 3055 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 3056 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 3057 if (tmp64 < ran_end) 3058 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 3059 else 3060 cursor.key_end.key = ran_end + MAXPHYS + 1; 3061 3062 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 3063 3064 error = hammer_ip_first(&cursor); 3065 base_offset = last_offset = 0; 3066 base_disk_offset = last_disk_offset = 0; 3067 3068 while (error == 0) { 3069 /* 3070 * Get the base file offset of the record. The key for 3071 * data records is (base + bytes) rather then (base). 3072 * 3073 * NOTE: rec_offset + rec_len may exceed the end-of-file. 3074 * The extra bytes should be zero on-disk and the BMAP op 3075 * should still be ok. 3076 */ 3077 base = &cursor.leaf->base; 3078 rec_offset = base->key - cursor.leaf->data_len; 3079 rec_len = cursor.leaf->data_len; 3080 3081 /* 3082 * Incorporate any cached truncation. 3083 * 3084 * NOTE: Modifications to rec_len based on synthesized 3085 * truncation points remove the guarantee that any extended 3086 * data on disk is zero (since the truncations may not have 3087 * taken place on-media yet). 
3088 */ 3089 if (ip->flags & HAMMER_INODE_TRUNCATED) { 3090 if (hammer_cursor_ondisk(&cursor) || 3091 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 3092 if (ip->trunc_off <= rec_offset) 3093 rec_len = 0; 3094 else if (ip->trunc_off < rec_offset + rec_len) 3095 rec_len = (int)(ip->trunc_off - rec_offset); 3096 } 3097 } 3098 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 3099 if (hammer_cursor_ondisk(&cursor)) { 3100 if (ip->sync_trunc_off <= rec_offset) 3101 rec_len = 0; 3102 else if (ip->sync_trunc_off < rec_offset + rec_len) 3103 rec_len = (int)(ip->sync_trunc_off - rec_offset); 3104 } 3105 } 3106 3107 /* 3108 * Accumulate information. If we have hit a discontiguous 3109 * block reset base_offset unless we are already beyond the 3110 * requested offset. If we are, that's it, we stop. 3111 */ 3112 if (error) 3113 break; 3114 if (hammer_cursor_ondisk(&cursor)) { 3115 disk_offset = cursor.leaf->data_offset; 3116 if (rec_offset != last_offset || 3117 disk_offset != last_disk_offset) { 3118 if (rec_offset > ap->a_loffset) 3119 break; 3120 base_offset = rec_offset; 3121 base_disk_offset = disk_offset; 3122 } 3123 last_offset = rec_offset + rec_len; 3124 last_disk_offset = disk_offset + rec_len; 3125 3126 if (hammer_live_dedup) 3127 hammer_dedup_cache_add(ip, cursor.leaf); 3128 } 3129 3130 error = hammer_ip_next(&cursor); 3131 } 3132 3133 if (cursor.node) 3134 hammer_cache_node(&ip->cache[1], cursor.node); 3135 3136 hammer_done_cursor(&cursor); 3137 hammer_done_transaction(&trans); 3138 lwkt_reltoken(&hmp->fs_token); 3139 3140 /* 3141 * If we couldn't find any records or the records we did find were 3142 * all behind the requested offset, return failure. A forward 3143 * truncation can leave a hole w/ no on-disk records. 
3144 */ 3145 if (last_offset == 0 || last_offset < ap->a_loffset) 3146 return (EOPNOTSUPP); 3147 3148 /* 3149 * Figure out the block size at the requested offset and adjust 3150 * our limits so the cluster_read() does not create inappropriately 3151 * sized buffer cache buffers. 3152 */ 3153 blksize = hammer_blocksize(ap->a_loffset); 3154 if (hammer_blocksize(base_offset) != blksize) { 3155 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 3156 } 3157 if (last_offset != ap->a_loffset && 3158 hammer_blocksize(last_offset - 1) != blksize) { 3159 last_offset = hammer_blockdemarc(ap->a_loffset, 3160 last_offset - 1); 3161 } 3162 3163 /* 3164 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 3165 * from occuring. 3166 */ 3167 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 3168 3169 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 3170 /* 3171 * Only large-data zones can be direct-IOd 3172 */ 3173 error = EOPNOTSUPP; 3174 } else if ((disk_offset & HAMMER_BUFMASK) || 3175 (last_offset - ap->a_loffset) < blksize) { 3176 /* 3177 * doffsetp is not aligned or the forward run size does 3178 * not cover a whole buffer, disallow the direct I/O. 3179 */ 3180 error = EOPNOTSUPP; 3181 } else { 3182 /* 3183 * We're good. 3184 */ 3185 *ap->a_doffsetp = disk_offset; 3186 if (ap->a_runb) { 3187 *ap->a_runb = ap->a_loffset - base_offset; 3188 KKASSERT(*ap->a_runb >= 0); 3189 } 3190 if (ap->a_runp) { 3191 *ap->a_runp = last_offset - ap->a_loffset; 3192 KKASSERT(*ap->a_runp >= 0); 3193 } 3194 error = 0; 3195 } 3196 return(error); 3197 } 3198 3199 /* 3200 * Write to a regular file. Because this is a strategy call the OS is 3201 * trying to actually get data onto the media. 
3202 */ 3203 static 3204 int 3205 hammer_vop_strategy_write(struct vop_strategy_args *ap) 3206 { 3207 hammer_record_t record; 3208 hammer_mount_t hmp; 3209 hammer_inode_t ip; 3210 struct bio *bio; 3211 struct buf *bp; 3212 int blksize __debugvar; 3213 int bytes; 3214 int error; 3215 3216 bio = ap->a_bio; 3217 bp = bio->bio_buf; 3218 ip = ap->a_vp->v_data; 3219 hmp = ip->hmp; 3220 3221 blksize = hammer_blocksize(bio->bio_offset); 3222 KKASSERT(bp->b_bufsize == blksize); 3223 3224 if (ip->flags & HAMMER_INODE_RO) { 3225 bp->b_error = EROFS; 3226 bp->b_flags |= B_ERROR; 3227 biodone(ap->a_bio); 3228 return(EROFS); 3229 } 3230 3231 lwkt_gettoken(&hmp->fs_token); 3232 3233 /* 3234 * Disallow swapcache operation on the vnode buffer if double 3235 * buffering is enabled, the swapcache will get the data via 3236 * the block device buffer. 3237 */ 3238 if (hammer_double_buffer) 3239 bp->b_flags |= B_NOTMETA; 3240 3241 /* 3242 * Interlock with inode destruction (no in-kernel or directory 3243 * topology visibility). If we queue new IO while trying to 3244 * destroy the inode we can deadlock the vtrunc call in 3245 * hammer_inode_unloadable_check(). 3246 * 3247 * Besides, there's no point flushing a bp associated with an 3248 * inode that is being destroyed on-media and has no kernel 3249 * references. 3250 */ 3251 if ((ip->flags | ip->sync_flags) & 3252 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 3253 bp->b_resid = 0; 3254 biodone(ap->a_bio); 3255 lwkt_reltoken(&hmp->fs_token); 3256 return(0); 3257 } 3258 3259 /* 3260 * Reserve space and issue a direct-write from the front-end. 3261 * NOTE: The direct_io code will hammer_bread/bcopy smaller 3262 * allocations. 3263 * 3264 * An in-memory record will be installed to reference the storage 3265 * until the flusher can get to it. 3266 * 3267 * Since we own the high level bio the front-end will not try to 3268 * do a direct-read until the write completes. 
3269 * 3270 * NOTE: The only time we do not reserve a full-sized buffers 3271 * worth of data is if the file is small. We do not try to 3272 * allocate a fragment (from the small-data zone) at the end of 3273 * an otherwise large file as this can lead to wildly separated 3274 * data. 3275 */ 3276 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 3277 KKASSERT(bio->bio_offset < ip->ino_data.size); 3278 if (bio->bio_offset || ip->ino_data.size > HAMMER_HBUFSIZE) 3279 bytes = bp->b_bufsize; 3280 else 3281 bytes = ((int)ip->ino_data.size + 15) & ~15; 3282 3283 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 3284 bytes, &error); 3285 3286 /* 3287 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated 3288 * in hammer_vop_write(). We must flag the record so the proper 3289 * REDO_TERM_WRITE entry is generated during the flush. 3290 */ 3291 if (record) { 3292 if (bp->b_flags & B_VFSFLAG1) { 3293 record->flags |= HAMMER_RECF_REDO; 3294 bp->b_flags &= ~B_VFSFLAG1; 3295 } 3296 if (record->flags & HAMMER_RECF_DEDUPED) { 3297 bp->b_resid = 0; 3298 hammer_ip_replace_bulk(hmp, record); 3299 biodone(ap->a_bio); 3300 } else { 3301 hammer_io_direct_write(hmp, bio, record); 3302 } 3303 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 3304 hammer_flush_inode(ip, 0); 3305 } else { 3306 bp->b_bio2.bio_offset = NOOFFSET; 3307 bp->b_error = error; 3308 bp->b_flags |= B_ERROR; 3309 biodone(ap->a_bio); 3310 } 3311 lwkt_reltoken(&hmp->fs_token); 3312 return(error); 3313 } 3314 3315 /* 3316 * dounlink - disconnect a directory entry 3317 * 3318 * XXX whiteout support not really in yet 3319 */ 3320 static int 3321 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 3322 struct vnode *dvp, struct ucred *cred, 3323 int flags, int isdir) 3324 { 3325 struct namecache *ncp; 3326 hammer_inode_t dip; 3327 hammer_inode_t ip; 3328 hammer_mount_t hmp; 3329 struct hammer_cursor cursor; 3330 int64_t namekey; 3331 uint32_t max_iterations; 3332 int nlen, error; 
3333 3334 /* 3335 * Calculate the namekey and setup the key range for the scan. This 3336 * works kinda like a chained hash table where the lower 32 bits 3337 * of the namekey synthesize the chain. 3338 * 3339 * The key range is inclusive of both key_beg and key_end. 3340 */ 3341 dip = VTOI(dvp); 3342 ncp = nch->ncp; 3343 hmp = dip->hmp; 3344 3345 if (dip->flags & HAMMER_INODE_RO) 3346 return (EROFS); 3347 3348 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3349 &max_iterations); 3350 retry: 3351 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3352 cursor.key_beg.localization = dip->obj_localization | 3353 hammer_dir_localization(dip); 3354 cursor.key_beg.obj_id = dip->obj_id; 3355 cursor.key_beg.key = namekey; 3356 cursor.key_beg.create_tid = 0; 3357 cursor.key_beg.delete_tid = 0; 3358 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3359 cursor.key_beg.obj_type = 0; 3360 3361 cursor.key_end = cursor.key_beg; 3362 cursor.key_end.key += max_iterations; 3363 cursor.asof = dip->obj_asof; 3364 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3365 3366 /* 3367 * Scan all matching records (the chain), locate the one matching 3368 * the requested path component. info->last_error contains the 3369 * error code on search termination and could be 0, ENOENT, or 3370 * something else. 3371 * 3372 * The hammer_ip_*() functions merge in-memory records with on-disk 3373 * records for the purposes of the search. 3374 */ 3375 error = hammer_ip_first(&cursor); 3376 3377 while (error == 0) { 3378 error = hammer_ip_resolve_data(&cursor); 3379 if (error) 3380 break; 3381 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3382 KKASSERT(nlen > 0); 3383 if (ncp->nc_nlen == nlen && 3384 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3385 break; 3386 } 3387 error = hammer_ip_next(&cursor); 3388 } 3389 3390 /* 3391 * If all is ok we have to get the inode so we can adjust nlinks. 
3392 * To avoid a deadlock with the flusher we must release the inode 3393 * lock on the directory when acquiring the inode for the entry. 3394 * 3395 * If the target is a directory, it must be empty. 3396 */ 3397 if (error == 0) { 3398 hammer_unlock(&cursor.ip->lock); 3399 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3400 hmp->asof, 3401 cursor.data->entry.localization, 3402 0, &error); 3403 hammer_lock_sh(&cursor.ip->lock); 3404 if (error == ENOENT) { 3405 hkprintf("WARNING: Removing dirent w/missing inode " 3406 "\"%s\"\n" 3407 "\tobj_id = %016llx\n", 3408 ncp->nc_name, 3409 (long long)cursor.data->entry.obj_id); 3410 error = 0; 3411 } 3412 3413 /* 3414 * If isdir >= 0 we validate that the entry is or is not a 3415 * directory. If isdir < 0 we don't care. 3416 */ 3417 if (error == 0 && isdir >= 0 && ip) { 3418 if (isdir && 3419 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3420 error = ENOTDIR; 3421 } else if (isdir == 0 && 3422 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3423 error = EISDIR; 3424 } 3425 } 3426 3427 /* 3428 * If we are trying to remove a directory the directory must 3429 * be empty. 3430 * 3431 * The check directory code can loop and deadlock/retry. Our 3432 * own cursor's node locks must be released to avoid a 3-way 3433 * deadlock with the flusher if the check directory code 3434 * blocks. 3435 * 3436 * If any changes whatsoever have been made to the cursor 3437 * set EDEADLK and retry. 3438 * 3439 * WARNING: See warnings in hammer_unlock_cursor() 3440 * function. 3441 */ 3442 if (error == 0 && ip && ip->ino_data.obj_type == 3443 HAMMER_OBJTYPE_DIRECTORY) { 3444 hammer_unlock_cursor(&cursor); 3445 error = hammer_ip_check_directory_empty(trans, ip); 3446 hammer_lock_cursor(&cursor); 3447 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3448 hkprintf("Warning: avoided deadlock " 3449 "on rmdir '%s'\n", 3450 ncp->nc_name); 3451 error = EDEADLK; 3452 } 3453 } 3454 3455 /* 3456 * Delete the directory entry. 
3457 * 3458 * WARNING: hammer_ip_del_directory() may have to terminate 3459 * the cursor to avoid a deadlock. It is ok to call 3460 * hammer_done_cursor() twice. 3461 */ 3462 if (error == 0) { 3463 error = hammer_ip_del_directory(trans, &cursor, 3464 dip, ip); 3465 } 3466 hammer_done_cursor(&cursor); 3467 if (error == 0) { 3468 /* 3469 * Tell the namecache that we are now unlinked. 3470 */ 3471 cache_unlink(nch); 3472 3473 /* 3474 * NOTE: ip->vp, if non-NULL, cannot be directly 3475 * referenced without formally acquiring the 3476 * vp since the vp might have zero refs on it, 3477 * or in the middle of a reclaim, etc. 3478 * 3479 * NOTE: The cache_setunresolved() can rip the vp 3480 * out from under us since the vp may not have 3481 * any refs, in which case ip->vp will be NULL 3482 * from the outset. 3483 */ 3484 while (ip && ip->vp) { 3485 struct vnode *vp; 3486 3487 error = hammer_get_vnode(ip, &vp); 3488 if (error == 0 && vp) { 3489 vn_unlock(vp); 3490 hammer_knote(ip->vp, NOTE_DELETE); 3491 #if 0 3492 /* 3493 * Don't do this, it can deadlock 3494 * on concurrent rm's of hardlinks. 3495 * Shouldn't be needed any more. 
3496 */ 3497 cache_inval_vp(ip->vp, CINV_DESTROY); 3498 #endif 3499 vrele(vp); 3500 break; 3501 } 3502 hdkprintf("ip/vp race1 avoided\n"); 3503 } 3504 } 3505 if (ip) 3506 hammer_rel_inode(ip, 0); 3507 } else { 3508 hammer_done_cursor(&cursor); 3509 } 3510 if (error == EDEADLK) 3511 goto retry; 3512 3513 return (error); 3514 } 3515 3516 /************************************************************************ 3517 * FIFO AND SPECFS OPS * 3518 ************************************************************************ 3519 * 3520 */ 3521 static int 3522 hammer_vop_fifoclose (struct vop_close_args *ap) 3523 { 3524 /* XXX update itimes */ 3525 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3526 } 3527 3528 static int 3529 hammer_vop_fiforead (struct vop_read_args *ap) 3530 { 3531 int error; 3532 3533 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3534 /* XXX update access time */ 3535 return (error); 3536 } 3537 3538 static int 3539 hammer_vop_fifowrite (struct vop_write_args *ap) 3540 { 3541 int error; 3542 3543 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3544 /* XXX update access time */ 3545 return (error); 3546 } 3547 3548 static 3549 int 3550 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3551 { 3552 int error; 3553 3554 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3555 if (error) 3556 error = hammer_vop_kqfilter(ap); 3557 return(error); 3558 } 3559 3560 /************************************************************************ 3561 * KQFILTER OPS * 3562 ************************************************************************ 3563 * 3564 */ 3565 static void filt_hammerdetach(struct knote *kn); 3566 static int filt_hammerread(struct knote *kn, long hint); 3567 static int filt_hammerwrite(struct knote *kn, long hint); 3568 static int filt_hammervnode(struct knote *kn, long hint); 3569 3570 static struct filterops hammerread_filtops = 3571 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3572 NULL, filt_hammerdetach, filt_hammerread }; 3573 static struct filterops 
hammerwrite_filtops = 3574 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3575 NULL, filt_hammerdetach, filt_hammerwrite }; 3576 static struct filterops hammervnode_filtops = 3577 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3578 NULL, filt_hammerdetach, filt_hammervnode }; 3579 3580 static 3581 int 3582 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3583 { 3584 struct vnode *vp = ap->a_vp; 3585 struct knote *kn = ap->a_kn; 3586 3587 switch (kn->kn_filter) { 3588 case EVFILT_READ: 3589 kn->kn_fop = &hammerread_filtops; 3590 break; 3591 case EVFILT_WRITE: 3592 kn->kn_fop = &hammerwrite_filtops; 3593 break; 3594 case EVFILT_VNODE: 3595 kn->kn_fop = &hammervnode_filtops; 3596 break; 3597 default: 3598 return (EOPNOTSUPP); 3599 } 3600 3601 kn->kn_hook = (caddr_t)vp; 3602 3603 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3604 3605 return(0); 3606 } 3607 3608 static void 3609 filt_hammerdetach(struct knote *kn) 3610 { 3611 struct vnode *vp = (void *)kn->kn_hook; 3612 3613 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3614 } 3615 3616 static int 3617 filt_hammerread(struct knote *kn, long hint) 3618 { 3619 struct vnode *vp = (void *)kn->kn_hook; 3620 hammer_inode_t ip = VTOI(vp); 3621 hammer_mount_t hmp = ip->hmp; 3622 off_t off; 3623 3624 if (hint == NOTE_REVOKE) { 3625 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3626 return(1); 3627 } 3628 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3629 off = ip->ino_data.size - kn->kn_fp->f_offset; 3630 kn->kn_data = (off < INTPTR_MAX) ? 
off : INTPTR_MAX; 3631 lwkt_reltoken(&hmp->fs_token); 3632 if (kn->kn_sfflags & NOTE_OLDAPI) 3633 return(1); 3634 return (kn->kn_data != 0); 3635 } 3636 3637 static int 3638 filt_hammerwrite(struct knote *kn, long hint) 3639 { 3640 if (hint == NOTE_REVOKE) 3641 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3642 kn->kn_data = 0; 3643 return (1); 3644 } 3645 3646 static int 3647 filt_hammervnode(struct knote *kn, long hint) 3648 { 3649 if (kn->kn_sfflags & hint) 3650 kn->kn_fflags |= hint; 3651 if (hint == NOTE_REVOKE) { 3652 kn->kn_flags |= (EV_EOF | EV_NODATA); 3653 return (1); 3654 } 3655 return (kn->kn_fflags != 0); 3656 } 3657 3658