1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/mountctl.h> 36 #include <sys/namecache.h> 37 #include <sys/buf2.h> 38 #include <vfs/fifofs/fifo.h> 39 40 #include "hammer.h" 41 42 /* 43 * USERFS VNOPS 44 */ 45 static int hammer_vop_fsync(struct vop_fsync_args *); 46 static int hammer_vop_read(struct vop_read_args *); 47 static int hammer_vop_write(struct vop_write_args *); 48 static int hammer_vop_access(struct vop_access_args *); 49 static int hammer_vop_advlock(struct vop_advlock_args *); 50 static int hammer_vop_close(struct vop_close_args *); 51 static int hammer_vop_ncreate(struct vop_ncreate_args *); 52 static int hammer_vop_getattr(struct vop_getattr_args *); 53 static int hammer_vop_nresolve(struct vop_nresolve_args *); 54 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 55 static int hammer_vop_nlink(struct vop_nlink_args *); 56 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 57 static int hammer_vop_nmknod(struct vop_nmknod_args *); 58 static int hammer_vop_open(struct vop_open_args *); 59 static int hammer_vop_print(struct vop_print_args *); 60 static int hammer_vop_readdir(struct vop_readdir_args *); 61 static int hammer_vop_readlink(struct vop_readlink_args *); 62 static int hammer_vop_nremove(struct vop_nremove_args *); 63 static int hammer_vop_nrename(struct vop_nrename_args *); 64 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 65 static int hammer_vop_markatime(struct vop_markatime_args *); 66 static int hammer_vop_setattr(struct vop_setattr_args *); 67 static int hammer_vop_strategy(struct vop_strategy_args *); 68 static int hammer_vop_bmap(struct vop_bmap_args *ap); 69 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 70 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 71 static int hammer_vop_ioctl(struct vop_ioctl_args *); 72 static int hammer_vop_mountctl(struct vop_mountctl_args *); 73 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 74 75 static int hammer_vop_fifoclose (struct vop_close_args *); 76 static int hammer_vop_fiforead (struct vop_read_args *); 77 static int hammer_vop_fifowrite (struct vop_write_args *); 78 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 79 80 struct vop_ops hammer_vnode_vops = { 81 .vop_default = vop_defaultop, 82 .vop_fsync = hammer_vop_fsync, 83 .vop_getpages = vop_stdgetpages, 84 .vop_putpages = vop_stdputpages, 85 .vop_read = hammer_vop_read, 86 .vop_write = hammer_vop_write, 87 .vop_access = hammer_vop_access, 88 .vop_advlock = hammer_vop_advlock, 89 .vop_close = hammer_vop_close, 90 .vop_ncreate = hammer_vop_ncreate, 91 .vop_getattr = hammer_vop_getattr, 92 .vop_inactive = hammer_vop_inactive, 93 .vop_reclaim = hammer_vop_reclaim, 94 .vop_nresolve = hammer_vop_nresolve, 95 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 96 .vop_nlink = hammer_vop_nlink, 97 .vop_nmkdir = hammer_vop_nmkdir, 98 .vop_nmknod = hammer_vop_nmknod, 99 .vop_open = hammer_vop_open, 100 .vop_pathconf = vop_stdpathconf, 101 .vop_print = hammer_vop_print, 102 .vop_readdir = hammer_vop_readdir, 103 .vop_readlink = hammer_vop_readlink, 104 .vop_nremove = hammer_vop_nremove, 105 .vop_nrename = hammer_vop_nrename, 106 .vop_nrmdir = hammer_vop_nrmdir, 107 .vop_markatime = hammer_vop_markatime, 108 .vop_setattr = hammer_vop_setattr, 109 .vop_bmap = hammer_vop_bmap, 110 .vop_strategy = hammer_vop_strategy, 111 .vop_nsymlink = hammer_vop_nsymlink, 112 .vop_nwhiteout = hammer_vop_nwhiteout, 113 .vop_ioctl = hammer_vop_ioctl, 114 .vop_mountctl = hammer_vop_mountctl, 115 .vop_kqfilter = hammer_vop_kqfilter 116 }; 117 118 struct vop_ops hammer_spec_vops = { 119 .vop_default = vop_defaultop, 120 .vop_fsync = hammer_vop_fsync, 121 .vop_read = vop_stdnoread, 122 .vop_write = vop_stdnowrite, 123 .vop_access = hammer_vop_access, 124 .vop_close = hammer_vop_close, 125 .vop_markatime = hammer_vop_markatime, 126 .vop_getattr = hammer_vop_getattr, 127 .vop_inactive = hammer_vop_inactive, 128 .vop_reclaim = hammer_vop_reclaim, 129 .vop_setattr = hammer_vop_setattr 130 }; 131 132 struct vop_ops hammer_fifo_vops = { 133 .vop_default = fifo_vnoperate, 134 .vop_fsync = hammer_vop_fsync, 135 .vop_read = hammer_vop_fiforead, 136 .vop_write = hammer_vop_fifowrite, 137 .vop_access = hammer_vop_access, 138 .vop_close = hammer_vop_fifoclose, 139 .vop_markatime = hammer_vop_markatime, 140 .vop_getattr = hammer_vop_getattr, 141 .vop_inactive = hammer_vop_inactive, 142 .vop_reclaim = hammer_vop_reclaim, 143 .vop_setattr = hammer_vop_setattr, 144 .vop_kqfilter = hammer_vop_fifokqfilter 145 }; 146 147 static __inline 148 void 149 hammer_knote(struct vnode *vp, int flags) 150 { 151 if (flags) 152 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 153 } 154 155 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 156 struct vnode *dvp, struct ucred *cred, 157 int flags, int isdir); 158 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 159 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 160 161 /* 162 * hammer_vop_fsync { vp, waitfor } 163 * 164 * fsync() an inode to disk and wait for it to be completely committed 165 * such that the information would not be undone if a crash occured after 166 * return. 167 * 168 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 169 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 170 * operation. 171 * 172 * Ultimately the combination of a REDO log and use of fast storage 173 * to front-end cluster caches will make fsync fast, but it aint 174 * here yet. And, in anycase, we need real transactional 175 * all-or-nothing features which are not restricted to a single file. 176 */ 177 static 178 int 179 hammer_vop_fsync(struct vop_fsync_args *ap) 180 { 181 hammer_inode_t ip = VTOI(ap->a_vp); 182 hammer_mount_t hmp = ip->hmp; 183 int waitfor = ap->a_waitfor; 184 int mode; 185 186 lwkt_gettoken(&hmp->fs_token); 187 188 /* 189 * Fsync rule relaxation (default is either full synchronous flush 190 * or REDO semantics with synchronous flush). 191 */ 192 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 193 switch(hammer_fsync_mode) { 194 case 0: 195 mode0: 196 /* no REDO, full synchronous flush */ 197 goto skip; 198 case 1: 199 mode1: 200 /* no REDO, full asynchronous flush */ 201 if (waitfor == MNT_WAIT) 202 waitfor = MNT_NOWAIT; 203 goto skip; 204 case 2: 205 /* REDO semantics, synchronous flush */ 206 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 207 goto mode0; 208 mode = HAMMER_FLUSH_UNDOS_AUTO; 209 break; 210 case 3: 211 /* REDO semantics, relaxed asynchronous flush */ 212 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 213 goto mode1; 214 mode = HAMMER_FLUSH_UNDOS_RELAXED; 215 if (waitfor == MNT_WAIT) 216 waitfor = MNT_NOWAIT; 217 break; 218 case 4: 219 /* ignore the fsync() system call */ 220 lwkt_reltoken(&hmp->fs_token); 221 return(0); 222 default: 223 /* we have to do something */ 224 mode = HAMMER_FLUSH_UNDOS_RELAXED; 225 if (waitfor == MNT_WAIT) 226 waitfor = MNT_NOWAIT; 227 break; 228 } 229 230 /* 231 * Fast fsync only needs to flush the UNDO/REDO fifo if 232 * HAMMER_INODE_REDO is non-zero and the only modifications 233 * made to the file are write or write-extends. 234 */ 235 if ((ip->flags & HAMMER_INODE_REDO) && 236 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) { 237 ++hammer_count_fsyncs; 238 hammer_flusher_flush_undos(hmp, mode); 239 ip->redo_count = 0; 240 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 241 vclrisdirty(ip->vp); 242 lwkt_reltoken(&hmp->fs_token); 243 return(0); 244 } 245 246 /* 247 * REDO is enabled by fsync(), the idea being we really only 248 * want to lay down REDO records when programs are using 249 * fsync() heavily. The first fsync() on the file starts 250 * the gravy train going and later fsync()s keep it hot by 251 * resetting the redo_count. 252 * 253 * We weren't running REDOs before now so we have to fall 254 * through and do a full fsync of what we have. 255 */ 256 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 257 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 258 ip->flags |= HAMMER_INODE_REDO; 259 ip->redo_count = 0; 260 } 261 } 262 skip: 263 264 /* 265 * Do a full flush sequence. 266 * 267 * Attempt to release the vnode while waiting for the inode to 268 * finish flushing. This can really mess up inactive->reclaim 269 * sequences so only do it if the vnode is active. 270 * 271 * WARNING! The VX lock functions must be used. vn_lock() will 272 * fail when this is part of a VOP_RECLAIM sequence. 273 */ 274 ++hammer_count_fsyncs; 275 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 276 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 277 if (waitfor == MNT_WAIT) { 278 int dorelock; 279 280 if ((ap->a_vp->v_flag & VRECLAIMED) == 0) { 281 vx_unlock(ap->a_vp); 282 dorelock = 1; 283 } else { 284 dorelock = 0; 285 } 286 hammer_wait_inode(ip); 287 if (dorelock) 288 vx_lock(ap->a_vp); 289 } 290 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 291 vclrisdirty(ip->vp); 292 lwkt_reltoken(&hmp->fs_token); 293 return (ip->error); 294 } 295 296 /* 297 * hammer_vop_read { vp, uio, ioflag, cred } 298 * 299 * MPSAFE (for the cache safe does not require fs_token) 300 */ 301 static 302 int 303 hammer_vop_read(struct vop_read_args *ap) 304 { 305 struct hammer_transaction trans; 306 hammer_inode_t ip; 307 hammer_mount_t hmp; 308 off_t offset; 309 struct buf *bp; 310 struct uio *uio; 311 int error; 312 int n; 313 int seqcount; 314 int ioseqcount; 315 int blksize; 316 int bigread; 317 int got_trans; 318 size_t resid; 319 320 if (ap->a_vp->v_type != VREG) 321 return (EINVAL); 322 ip = VTOI(ap->a_vp); 323 hmp = ip->hmp; 324 error = 0; 325 got_trans = 0; 326 uio = ap->a_uio; 327 328 /* 329 * Attempt to shortcut directly to the VM object using lwbufs. 330 * This is much faster than instantiating buffer cache buffers. 331 */ 332 resid = uio->uio_resid; 333 error = vop_helper_read_shortcut(ap); 334 hammer_stats_file_read += resid - uio->uio_resid; 335 if (error) 336 return (error); 337 if (uio->uio_resid == 0) 338 goto finished; 339 340 /* 341 * Allow the UIO's size to override the sequential heuristic. 342 */ 343 blksize = hammer_blocksize(uio->uio_offset); 344 seqcount = (uio->uio_resid + (MAXBSIZE - 1)) / MAXBSIZE; 345 ioseqcount = (ap->a_ioflag >> 16); 346 if (seqcount < ioseqcount) 347 seqcount = ioseqcount; 348 349 /* 350 * If reading or writing a huge amount of data we have to break 351 * atomicy and allow the operation to be interrupted by a signal 352 * or it can DOS the machine. 353 */ 354 bigread = (uio->uio_resid > 100 * 1024 * 1024); 355 356 /* 357 * Access the data typically in HAMMER_BUFSIZE blocks via the 358 * buffer cache, but HAMMER may use a variable block size based 359 * on the offset. 360 * 361 * XXX Temporary hack, delay the start transaction while we remain 362 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 363 * locked-shared. 364 */ 365 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 366 int64_t base_offset; 367 int64_t file_limit; 368 369 blksize = hammer_blocksize(uio->uio_offset); 370 offset = (int)uio->uio_offset & (blksize - 1); 371 base_offset = uio->uio_offset - offset; 372 373 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 374 break; 375 376 /* 377 * MPSAFE 378 */ 379 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 380 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 381 bp->b_flags &= ~B_AGE; 382 error = 0; 383 goto skip; 384 } 385 if (ap->a_ioflag & IO_NRDELAY) { 386 bqrelse(bp); 387 return (EWOULDBLOCK); 388 } 389 390 /* 391 * MPUNSAFE 392 */ 393 if (got_trans == 0) { 394 hammer_start_transaction(&trans, ip->hmp); 395 got_trans = 1; 396 } 397 398 /* 399 * NOTE: A valid bp has already been acquired, but was not 400 * B_CACHE. 401 */ 402 if (hammer_cluster_enable) { 403 /* 404 * Use file_limit to prevent cluster_read() from 405 * creating buffers of the wrong block size past 406 * the demarc. 407 */ 408 file_limit = ip->ino_data.size; 409 if (base_offset < HAMMER_XDEMARC && 410 file_limit > HAMMER_XDEMARC) { 411 file_limit = HAMMER_XDEMARC; 412 } 413 error = cluster_readx(ap->a_vp, 414 file_limit, base_offset, 415 blksize, uio->uio_resid, 416 seqcount * MAXBSIZE, &bp); 417 } else { 418 error = breadnx(ap->a_vp, base_offset, blksize, 419 NULL, NULL, 0, &bp); 420 } 421 if (error) { 422 brelse(bp); 423 break; 424 } 425 skip: 426 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IOISSUED)) { 427 hdkprintf("zone2_offset %016jx read file %016jx@%016jx\n", 428 (intmax_t)bp->b_bio2.bio_offset, 429 (intmax_t)ip->obj_id, 430 (intmax_t)bp->b_loffset); 431 } 432 bp->b_flags &= ~B_IOISSUED; 433 if (blksize == HAMMER_XBUFSIZE) 434 bp->b_flags |= B_CLUSTEROK; 435 436 n = blksize - offset; 437 if (n > uio->uio_resid) 438 n = uio->uio_resid; 439 if (n > ip->ino_data.size - uio->uio_offset) 440 n = (int)(ip->ino_data.size - uio->uio_offset); 441 442 /* 443 * Set B_AGE, data has a lower priority than meta-data. 444 * 445 * Use a hold/unlock/drop sequence to run the uiomove 446 * with the buffer unlocked, avoiding deadlocks against 447 * read()s on mmap()'d spaces. 448 */ 449 bp->b_flags |= B_AGE; 450 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio); 451 bqrelse(bp); 452 453 if (error) 454 break; 455 hammer_stats_file_read += n; 456 } 457 458 finished: 459 460 /* 461 * Try to update the atime with just the inode lock for maximum 462 * concurrency. If we can't shortcut it we have to get the full 463 * blown transaction. 464 */ 465 if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) { 466 hammer_start_transaction(&trans, ip->hmp); 467 got_trans = 1; 468 } 469 470 if (got_trans) { 471 if ((ip->flags & HAMMER_INODE_RO) == 0 && 472 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 473 lwkt_gettoken(&hmp->fs_token); 474 ip->ino_data.atime = trans.time; 475 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 476 hammer_done_transaction(&trans); 477 lwkt_reltoken(&hmp->fs_token); 478 } else { 479 hammer_done_transaction(&trans); 480 } 481 } 482 return (error); 483 } 484 485 /* 486 * hammer_vop_write { vp, uio, ioflag, cred } 487 */ 488 static 489 int 490 hammer_vop_write(struct vop_write_args *ap) 491 { 492 struct hammer_transaction trans; 493 hammer_inode_t ip; 494 hammer_mount_t hmp; 495 thread_t td; 496 struct uio *uio; 497 int offset; 498 off_t base_offset; 499 int64_t cluster_eof; 500 struct buf *bp; 501 int kflags; 502 int error; 503 int n; 504 int flags; 505 int seqcount; 506 int bigwrite; 507 508 if (ap->a_vp->v_type != VREG) 509 return (EINVAL); 510 ip = VTOI(ap->a_vp); 511 hmp = ip->hmp; 512 error = 0; 513 kflags = 0; 514 seqcount = ap->a_ioflag >> 16; 515 516 if (ip->flags & HAMMER_INODE_RO) 517 return (EROFS); 518 519 /* 520 * Create a transaction to cover the operations we perform. 521 */ 522 hammer_start_transaction(&trans, hmp); 523 uio = ap->a_uio; 524 525 /* 526 * Check append mode 527 */ 528 if (ap->a_ioflag & IO_APPEND) 529 uio->uio_offset = ip->ino_data.size; 530 531 /* 532 * Check for illegal write offsets. Valid range is 0...2^63-1. 533 * 534 * NOTE: the base_off assignment is required to work around what 535 * I consider to be a GCC-4 optimization bug. 536 */ 537 if (uio->uio_offset < 0) { 538 hammer_done_transaction(&trans); 539 return (EFBIG); 540 } 541 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 542 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 543 hammer_done_transaction(&trans); 544 return (EFBIG); 545 } 546 547 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 548 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 549 hammer_done_transaction(&trans); 550 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 551 return (EFBIG); 552 } 553 554 /* 555 * If reading or writing a huge amount of data we have to break 556 * atomicy and allow the operation to be interrupted by a signal 557 * or it can DOS the machine. 558 * 559 * Preset redo_count so we stop generating REDOs earlier if the 560 * limit is exceeded. 561 * 562 * redo_count is heuristical, SMP races are ok 563 */ 564 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 565 if ((ip->flags & HAMMER_INODE_REDO) && 566 ip->redo_count < hammer_limit_redo) { 567 ip->redo_count += uio->uio_resid; 568 } 569 570 /* 571 * Access the data typically in HAMMER_BUFSIZE blocks via the 572 * buffer cache, but HAMMER may use a variable block size based 573 * on the offset. 574 */ 575 while (uio->uio_resid > 0) { 576 int fixsize = 0; 577 int blksize; 578 int blkmask; 579 int trivial; 580 int endofblk; 581 off_t nsize; 582 583 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 584 break; 585 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 586 break; 587 588 blksize = hammer_blocksize(uio->uio_offset); 589 590 /* 591 * Control the number of pending records associated with 592 * this inode. If too many have accumulated start a 593 * flush. Try to maintain a pipeline with the flusher. 594 * 595 * NOTE: It is possible for other sources to grow the 596 * records but not necessarily issue another flush, 597 * so use a timeout and ensure that a re-flush occurs. 598 */ 599 if (ip->rsv_recs >= hammer_limit_inode_recs) { 600 lwkt_gettoken(&hmp->fs_token); 601 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 602 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 603 ip->flags |= HAMMER_INODE_RECSW; 604 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 605 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 606 } 607 lwkt_reltoken(&hmp->fs_token); 608 } 609 610 /* 611 * Do not allow HAMMER to blow out the buffer cache. Very 612 * large UIOs can lockout other processes due to bwillwrite() 613 * mechanics. 614 * 615 * The hammer inode is not locked during these operations. 616 * The vnode is locked which can interfere with the pageout 617 * daemon for non-UIO_NOCOPY writes but should not interfere 618 * with the buffer cache. Even so, we cannot afford to 619 * allow the pageout daemon to build up too many dirty buffer 620 * cache buffers. 621 * 622 * Only call this if we aren't being recursively called from 623 * a virtual disk device (vn), else we may deadlock. 624 */ 625 if ((ap->a_ioflag & IO_RECURSE) == 0) 626 bwillwrite(blksize); 627 628 /* 629 * Calculate the blocksize at the current offset and figure 630 * out how much we can actually write. 631 */ 632 blkmask = blksize - 1; 633 offset = (int)uio->uio_offset & blkmask; 634 base_offset = uio->uio_offset & ~(int64_t)blkmask; 635 n = blksize - offset; 636 if (n > uio->uio_resid) { 637 n = uio->uio_resid; 638 endofblk = 0; 639 } else { 640 endofblk = 1; 641 } 642 nsize = uio->uio_offset + n; 643 if (nsize > ip->ino_data.size) { 644 if (uio->uio_offset > ip->ino_data.size) 645 trivial = 0; 646 else 647 trivial = 1; 648 nvextendbuf(ap->a_vp, 649 ip->ino_data.size, 650 nsize, 651 hammer_blocksize(ip->ino_data.size), 652 hammer_blocksize(nsize), 653 hammer_blockoff(ip->ino_data.size), 654 hammer_blockoff(nsize), 655 trivial); 656 fixsize = 1; 657 kflags |= NOTE_EXTEND; 658 } 659 660 if (uio->uio_segflg == UIO_NOCOPY) { 661 /* 662 * Issuing a write with the same data backing the 663 * buffer. Instantiate the buffer to collect the 664 * backing vm pages, then read-in any missing bits. 665 * 666 * This case is used by vop_stdputpages(). 667 */ 668 bp = getblk(ap->a_vp, base_offset, 669 blksize, GETBLK_BHEAVY, 0); 670 if ((bp->b_flags & B_CACHE) == 0) { 671 bqrelse(bp); 672 error = bread(ap->a_vp, base_offset, 673 blksize, &bp); 674 } 675 } else if (offset == 0 && uio->uio_resid >= blksize) { 676 /* 677 * Even though we are entirely overwriting the buffer 678 * we may still have to zero it out to avoid a 679 * mmap/write visibility issue. 680 */ 681 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 682 if ((bp->b_flags & B_CACHE) == 0) 683 vfs_bio_clrbuf(bp); 684 } else if (base_offset >= ip->ino_data.size) { 685 /* 686 * If the base offset of the buffer is beyond the 687 * file EOF, we don't have to issue a read. 688 */ 689 bp = getblk(ap->a_vp, base_offset, 690 blksize, GETBLK_BHEAVY, 0); 691 vfs_bio_clrbuf(bp); 692 } else { 693 /* 694 * Partial overwrite, read in any missing bits then 695 * replace the portion being written. 696 */ 697 error = bread(ap->a_vp, base_offset, blksize, &bp); 698 if (error == 0) 699 bheavy(bp); 700 } 701 if (error == 0) 702 error = uiomovebp(bp, bp->b_data + offset, n, uio); 703 704 lwkt_gettoken(&hmp->fs_token); 705 706 /* 707 * Generate REDO records if enabled and redo_count will not 708 * exceeded the limit. 709 * 710 * If redo_count exceeds the limit we stop generating records 711 * and clear HAMMER_INODE_REDO. This will cause the next 712 * fsync() to do a full meta-data sync instead of just an 713 * UNDO/REDO fifo update. 714 * 715 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 716 * will still be tracked. The tracks will be terminated 717 * when the related meta-data (including possible data 718 * modifications which are not tracked via REDO) is 719 * flushed. 720 */ 721 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 722 if (ip->redo_count < hammer_limit_redo) { 723 bp->b_flags |= B_VFSFLAG1; 724 error = hammer_generate_redo(&trans, ip, 725 base_offset + offset, 726 HAMMER_REDO_WRITE, 727 bp->b_data + offset, 728 (size_t)n); 729 } else { 730 ip->flags &= ~HAMMER_INODE_REDO; 731 } 732 } 733 734 /* 735 * If we screwed up we have to undo any VM size changes we 736 * made. 737 */ 738 if (error) { 739 brelse(bp); 740 if (fixsize) { 741 nvtruncbuf(ap->a_vp, ip->ino_data.size, 742 hammer_blocksize(ip->ino_data.size), 743 hammer_blockoff(ip->ino_data.size), 744 0); 745 } 746 lwkt_reltoken(&hmp->fs_token); 747 break; 748 } 749 kflags |= NOTE_WRITE; 750 hammer_stats_file_write += n; 751 if (blksize == HAMMER_XBUFSIZE) 752 bp->b_flags |= B_CLUSTEROK; 753 if (ip->ino_data.size < uio->uio_offset) { 754 ip->ino_data.size = uio->uio_offset; 755 flags = HAMMER_INODE_SDIRTY; 756 } else { 757 flags = 0; 758 } 759 ip->ino_data.mtime = trans.time; 760 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 761 hammer_modify_inode(&trans, ip, flags); 762 763 /* 764 * Once we dirty the buffer any cached zone-X offset 765 * becomes invalid. HAMMER NOTE: no-history mode cannot 766 * allow overwriting over the same data sector unless 767 * we provide UNDOs for the old data, which we don't. 768 */ 769 bp->b_bio2.bio_offset = NOOFFSET; 770 771 lwkt_reltoken(&hmp->fs_token); 772 773 /* 774 * Final buffer disposition. 775 * 776 * Because meta-data updates are deferred, HAMMER is 777 * especially sensitive to excessive bdwrite()s because 778 * the I/O stream is not broken up by disk reads. So the 779 * buffer cache simply cannot keep up. 780 * 781 * WARNING! blksize is variable. cluster_write() is 782 * expected to not blow up if it encounters 783 * buffers that do not match the passed blksize. 784 * 785 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 786 * The ip->rsv_recs check should burst-flush the data. 787 * If we queue it immediately the buf could be left 788 * locked on the device queue for a very long time. 789 * 790 * However, failing to flush a dirty buffer out when 791 * issued from the pageout daemon can result in a low 792 * memory deadlock against bio_page_alloc(), so we 793 * have to bawrite() on IO_ASYNC as well. 794 * 795 * NOTE! To avoid degenerate stalls due to mismatched block 796 * sizes we only honor IO_DIRECT on the write which 797 * abuts the end of the buffer. However, we must 798 * honor IO_SYNC in case someone is silly enough to 799 * configure a HAMMER file as swap, or when HAMMER 800 * is serving NFS (for commits). Ick ick. 801 */ 802 bp->b_flags |= B_AGE; 803 if (blksize == HAMMER_XBUFSIZE) 804 bp->b_flags |= B_CLUSTEROK; 805 806 if (ap->a_ioflag & IO_SYNC) { 807 bwrite(bp); 808 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 809 bawrite(bp); 810 } else if (ap->a_ioflag & IO_ASYNC) { 811 bawrite(bp); 812 } else if (hammer_cluster_enable && 813 !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 814 if (base_offset < HAMMER_XDEMARC) 815 cluster_eof = hammer_blockdemarc(base_offset, 816 ip->ino_data.size); 817 else 818 cluster_eof = ip->ino_data.size; 819 cluster_write(bp, cluster_eof, blksize, seqcount); 820 } else { 821 bdwrite(bp); 822 } 823 } 824 hammer_done_transaction(&trans); 825 hammer_knote(ap->a_vp, kflags); 826 827 return (error); 828 } 829 830 /* 831 * hammer_vop_access { vp, mode, cred } 832 * 833 * MPSAFE - does not require fs_token 834 */ 835 static 836 int 837 hammer_vop_access(struct vop_access_args *ap) 838 { 839 hammer_inode_t ip = VTOI(ap->a_vp); 840 uid_t uid; 841 gid_t gid; 842 int error; 843 844 uid = hammer_to_unix_xid(&ip->ino_data.uid); 845 gid = hammer_to_unix_xid(&ip->ino_data.gid); 846 847 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 848 ip->ino_data.uflags); 849 return (error); 850 } 851 852 /* 853 * hammer_vop_advlock { vp, id, op, fl, flags } 854 * 855 * MPSAFE - does not require fs_token 856 */ 857 static 858 int 859 hammer_vop_advlock(struct vop_advlock_args *ap) 860 { 861 hammer_inode_t ip = VTOI(ap->a_vp); 862 863 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 864 } 865 866 /* 867 * hammer_vop_close { vp, fflag } 868 * 869 * We can only sync-on-close for normal closes. XXX disabled for now. 870 */ 871 static 872 int 873 hammer_vop_close(struct vop_close_args *ap) 874 { 875 #if 0 876 struct vnode *vp = ap->a_vp; 877 hammer_inode_t ip = VTOI(vp); 878 int waitfor; 879 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 880 if (vn_islocked(vp) == LK_EXCLUSIVE && 881 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 882 if (ip->flags & HAMMER_INODE_CLOSESYNC) 883 waitfor = MNT_WAIT; 884 else 885 waitfor = MNT_NOWAIT; 886 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 887 HAMMER_INODE_CLOSEASYNC); 888 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 889 } 890 } 891 #endif 892 return (vop_stdclose(ap)); 893 } 894 895 /* 896 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 897 * 898 * The operating system has already ensured that the directory entry 899 * does not exist and done all appropriate namespace locking. 900 */ 901 static 902 int 903 hammer_vop_ncreate(struct vop_ncreate_args *ap) 904 { 905 struct hammer_transaction trans; 906 hammer_inode_t dip; 907 hammer_inode_t nip; 908 struct nchandle *nch; 909 hammer_mount_t hmp; 910 int error; 911 912 nch = ap->a_nch; 913 dip = VTOI(ap->a_dvp); 914 hmp = dip->hmp; 915 916 if (dip->flags & HAMMER_INODE_RO) 917 return (EROFS); 918 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 919 return (error); 920 921 /* 922 * Create a transaction to cover the operations we perform. 923 */ 924 lwkt_gettoken(&hmp->fs_token); 925 hammer_start_transaction(&trans, hmp); 926 927 /* 928 * Create a new filesystem object of the requested type. The 929 * returned inode will be referenced and shared-locked to prevent 930 * it from being moved to the flusher. 931 */ 932 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 933 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 934 NULL, &nip); 935 if (error) { 936 hkprintf("hammer_create_inode error %d\n", error); 937 hammer_done_transaction(&trans); 938 *ap->a_vpp = NULL; 939 lwkt_reltoken(&hmp->fs_token); 940 return (error); 941 } 942 943 /* 944 * Add the new filesystem object to the directory. This will also 945 * bump the inode's link count. 946 */ 947 error = hammer_ip_add_direntry(&trans, dip, 948 nch->ncp->nc_name, nch->ncp->nc_nlen, 949 nip); 950 if (error) 951 hkprintf("hammer_ip_add_direntry error %d\n", error); 952 953 /* 954 * Finish up. 955 */ 956 if (error) { 957 hammer_rel_inode(nip, 0); 958 hammer_done_transaction(&trans); 959 *ap->a_vpp = NULL; 960 } else { 961 error = hammer_get_vnode(nip, ap->a_vpp); 962 hammer_done_transaction(&trans); 963 hammer_rel_inode(nip, 0); 964 if (error == 0) { 965 cache_setunresolved(ap->a_nch); 966 cache_setvp(ap->a_nch, *ap->a_vpp); 967 } 968 hammer_knote(ap->a_dvp, NOTE_WRITE); 969 } 970 lwkt_reltoken(&hmp->fs_token); 971 return (error); 972 } 973 974 /* 975 * hammer_vop_getattr { vp, vap } 976 * 977 * Retrieve an inode's attribute information. When accessing inodes 978 * historically we fake the atime field to ensure consistent results. 979 * The atime field is stored in the B-Tree element and allowed to be 980 * updated without cycling the element. 981 * 982 * MPSAFE - does not require fs_token 983 */ 984 static 985 int 986 hammer_vop_getattr(struct vop_getattr_args *ap) 987 { 988 hammer_inode_t ip = VTOI(ap->a_vp); 989 struct vattr *vap = ap->a_vap; 990 991 /* 992 * We want the fsid to be different when accessing a filesystem 993 * with different as-of's so programs like diff don't think 994 * the files are the same. 995 * 996 * We also want the fsid to be the same when comparing snapshots, 997 * or when comparing mirrors (which might be backed by different 998 * physical devices). HAMMER fsids are based on the PFS's 999 * shared_uuid field. 1000 * 1001 * XXX there is a chance of collision here. The va_fsid reported 1002 * by stat is different from the more involved fsid used in the 1003 * mount structure. 1004 */ 1005 hammer_lock_sh(&ip->lock); 1006 vap->va_fsid = ip->pfsm->fsid_udev ^ (uint32_t)ip->obj_asof ^ 1007 (uint32_t)(ip->obj_asof >> 32); 1008 1009 vap->va_fileid = ip->ino_leaf.base.obj_id; 1010 vap->va_mode = ip->ino_data.mode; 1011 vap->va_nlink = ip->ino_data.nlinks; 1012 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1013 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1014 vap->va_rmajor = 0; 1015 vap->va_rminor = 0; 1016 vap->va_size = ip->ino_data.size; 1017 1018 /* 1019 * Special case for @@PFS softlinks. The actual size of the 1020 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1021 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1022 * 1023 * Note that userspace hammer command does not allow users to 1024 * create a @@PFS softlink under an existing other PFS (id!=0) 1025 * so the ip localization here for @@PFS softlink is always 0. 1026 */ 1027 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1028 ip->ino_data.size == 10 && 1029 ip->obj_asof == HAMMER_MAX_TID && 1030 ip->obj_localization == HAMMER_DEF_LOCALIZATION && 1031 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1032 if (hammer_is_pfs_slave(&ip->pfsm->pfsd)) 1033 vap->va_size = 26; 1034 else 1035 vap->va_size = 10; 1036 } 1037 1038 /* 1039 * We must provide a consistent atime and mtime for snapshots 1040 * so people can do a 'tar cf - ... | md5' on them and get 1041 * consistent results. 1042 */ 1043 if (ip->flags & HAMMER_INODE_RO) { 1044 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1045 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1046 } else { 1047 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1048 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1049 } 1050 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1051 vap->va_flags = ip->ino_data.uflags; 1052 vap->va_gen = 1; /* hammer inums are unique for all time */ 1053 vap->va_blocksize = HAMMER_BUFSIZE; 1054 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1055 vap->va_bytes = HAMMER_XBUFSIZE64_DOALIGN(ip->ino_data.size); 1056 } else if (ip->ino_data.size > HAMMER_HBUFSIZE) { 1057 vap->va_bytes = HAMMER_BUFSIZE64_DOALIGN(ip->ino_data.size); 1058 } else { 1059 vap->va_bytes = HAMMER_DATA_DOALIGN(ip->ino_data.size); 1060 } 1061 1062 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1063 vap->va_filerev = 0; /* XXX */ 1064 vap->va_uid_uuid = ip->ino_data.uid; 1065 vap->va_gid_uuid = ip->ino_data.gid; 1066 vap->va_fsid_uuid = ip->hmp->fsid; 1067 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1068 VA_FSID_UUID_VALID; 1069 1070 switch (ip->ino_data.obj_type) { 1071 case HAMMER_OBJTYPE_CDEV: 1072 case HAMMER_OBJTYPE_BDEV: 1073 vap->va_rmajor = ip->ino_data.rmajor; 1074 vap->va_rminor = ip->ino_data.rminor; 1075 break; 1076 default: 1077 break; 1078 } 1079 hammer_unlock(&ip->lock); 1080 return(0); 1081 } 1082 1083 /* 1084 * hammer_vop_nresolve { nch, dvp, cred } 1085 * 1086 * Locate the requested directory entry. 1087 */ 1088 static 1089 int 1090 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1091 { 1092 struct hammer_transaction trans; 1093 struct namecache *ncp; 1094 hammer_mount_t hmp; 1095 hammer_inode_t dip; 1096 hammer_inode_t ip; 1097 hammer_tid_t asof; 1098 struct hammer_cursor cursor; 1099 struct vnode *vp; 1100 int64_t namekey; 1101 int error; 1102 int i; 1103 int nlen; 1104 int flags; 1105 int ispfs; 1106 int64_t obj_id; 1107 uint32_t localization; 1108 uint32_t max_iterations; 1109 1110 /* 1111 * Misc initialization, plus handle as-of name extensions. Look for 1112 * the '@@' extension. Note that as-of files and directories cannot 1113 * be modified. 1114 */ 1115 dip = VTOI(ap->a_dvp); 1116 ncp = ap->a_nch->ncp; 1117 asof = dip->obj_asof; 1118 localization = dip->obj_localization; /* for code consistency */ 1119 nlen = ncp->nc_nlen; 1120 flags = dip->flags & HAMMER_INODE_RO; 1121 ispfs = 0; 1122 hmp = dip->hmp; 1123 1124 lwkt_gettoken(&hmp->fs_token); 1125 hammer_simple_transaction(&trans, hmp); 1126 1127 for (i = 0; i < nlen; ++i) { 1128 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1129 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1130 &ispfs, &asof, &localization); 1131 if (error != 0) { 1132 i = nlen; 1133 break; 1134 } 1135 if (asof != HAMMER_MAX_TID) 1136 flags |= HAMMER_INODE_RO; 1137 break; 1138 } 1139 } 1140 nlen = i; 1141 1142 /* 1143 * If this is a PFS softlink we dive into the PFS 1144 */ 1145 if (ispfs && nlen == 0) { 1146 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1147 asof, localization, 1148 flags, &error); 1149 if (error == 0) { 1150 error = hammer_get_vnode(ip, &vp); 1151 hammer_rel_inode(ip, 0); 1152 } else { 1153 vp = NULL; 1154 } 1155 if (error == 0) { 1156 vn_unlock(vp); 1157 cache_setvp(ap->a_nch, vp); 1158 vrele(vp); 1159 } 1160 goto done; 1161 } 1162 1163 /* 1164 * If there is no path component the time extension is relative to dip. 1165 * e.g. "fubar/@@<snapshot>" 1166 * 1167 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1168 * e.g. "fubar/.@@<snapshot>" 1169 * 1170 * ".." is handled by the kernel. We do not currently handle 1171 * "..@<snapshot>". 1172 */ 1173 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1174 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1175 asof, dip->obj_localization, 1176 flags, &error); 1177 if (error == 0) { 1178 error = hammer_get_vnode(ip, &vp); 1179 hammer_rel_inode(ip, 0); 1180 } else { 1181 vp = NULL; 1182 } 1183 if (error == 0) { 1184 vn_unlock(vp); 1185 cache_setvp(ap->a_nch, vp); 1186 vrele(vp); 1187 } 1188 goto done; 1189 } 1190 1191 /* 1192 * Calculate the namekey and setup the key range for the scan. This 1193 * works kinda like a chained hash table where the lower 32 bits 1194 * of the namekey synthesize the chain. 1195 * 1196 * The key range is inclusive of both key_beg and key_end. 1197 */ 1198 namekey = hammer_direntry_namekey(dip, ncp->nc_name, nlen, 1199 &max_iterations); 1200 1201 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1202 cursor.key_beg.localization = dip->obj_localization | 1203 hammer_dir_localization(dip); 1204 cursor.key_beg.obj_id = dip->obj_id; 1205 cursor.key_beg.key = namekey; 1206 cursor.key_beg.create_tid = 0; 1207 cursor.key_beg.delete_tid = 0; 1208 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1209 cursor.key_beg.obj_type = 0; 1210 1211 cursor.key_end = cursor.key_beg; 1212 cursor.key_end.key += max_iterations; 1213 cursor.asof = asof; 1214 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1215 1216 /* 1217 * Scan all matching records (the chain), locate the one matching 1218 * the requested path component. 1219 * 1220 * The hammer_ip_*() functions merge in-memory records with on-disk 1221 * records for the purposes of the search. 1222 */ 1223 obj_id = 0; 1224 localization = HAMMER_DEF_LOCALIZATION; 1225 1226 if (error == 0) { 1227 error = hammer_ip_first(&cursor); 1228 while (error == 0) { 1229 error = hammer_ip_resolve_data(&cursor); 1230 if (error) 1231 break; 1232 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1233 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1234 obj_id = cursor.data->entry.obj_id; 1235 localization = cursor.data->entry.localization; 1236 break; 1237 } 1238 error = hammer_ip_next(&cursor); 1239 } 1240 } 1241 hammer_done_cursor(&cursor); 1242 1243 /* 1244 * Lookup the obj_id. This should always succeed. If it does not 1245 * the filesystem may be damaged and we return a dummy inode. 1246 */ 1247 if (error == 0) { 1248 ip = hammer_get_inode(&trans, dip, obj_id, 1249 asof, localization, 1250 flags, &error); 1251 if (error == ENOENT) { 1252 hkprintf("WARNING: Missing inode for dirent \"%s\"\n" 1253 "\tobj_id = %016jx, asof=%016jx, lo=%08x\n", 1254 ncp->nc_name, 1255 (intmax_t)obj_id, (intmax_t)asof, 1256 localization); 1257 error = 0; 1258 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1259 asof, localization, 1260 flags, &error); 1261 } 1262 if (error == 0) { 1263 error = hammer_get_vnode(ip, &vp); 1264 hammer_rel_inode(ip, 0); 1265 } else { 1266 vp = NULL; 1267 } 1268 if (error == 0) { 1269 vn_unlock(vp); 1270 cache_setvp(ap->a_nch, vp); 1271 vrele(vp); 1272 } 1273 } else if (error == ENOENT) { 1274 cache_setvp(ap->a_nch, NULL); 1275 } 1276 done: 1277 hammer_done_transaction(&trans); 1278 lwkt_reltoken(&hmp->fs_token); 1279 return (error); 1280 } 1281 1282 /* 1283 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1284 * 1285 * Locate the parent directory of a directory vnode. 1286 * 1287 * dvp is referenced but not locked. *vpp must be returned referenced and 1288 * locked. A parent_obj_id of 0 indicates that we are at the root. 1289 * 1290 * NOTE: as-of sequences are not linked into the directory structure. If 1291 * we are at the root with a different asof then the mount point, reload 1292 * the same directory with the mount point's asof. I'm not sure what this 1293 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1294 * get confused, but it hasn't been tested. 1295 */ 1296 static 1297 int 1298 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1299 { 1300 struct hammer_transaction trans; 1301 hammer_inode_t dip; 1302 hammer_inode_t ip; 1303 hammer_mount_t hmp; 1304 int64_t parent_obj_id; 1305 uint32_t parent_obj_localization; 1306 hammer_tid_t asof; 1307 int error; 1308 1309 dip = VTOI(ap->a_dvp); 1310 asof = dip->obj_asof; 1311 hmp = dip->hmp; 1312 1313 /* 1314 * Whos are parent? This could be the root of a pseudo-filesystem 1315 * whos parent is in another localization domain. 1316 */ 1317 lwkt_gettoken(&hmp->fs_token); 1318 parent_obj_id = dip->ino_data.parent_obj_id; 1319 if (dip->obj_id == HAMMER_OBJID_ROOT) 1320 parent_obj_localization = HAMMER_DEF_LOCALIZATION; 1321 else 1322 parent_obj_localization = dip->obj_localization; 1323 1324 /* 1325 * It's probably a PFS root when dip->ino_data.parent_obj_id is 0. 1326 */ 1327 if (parent_obj_id == 0) { 1328 if (dip->obj_id == HAMMER_OBJID_ROOT && 1329 asof != hmp->asof) { 1330 parent_obj_id = dip->obj_id; 1331 asof = hmp->asof; 1332 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1333 ksnprintf(*ap->a_fakename, 19, "0x%016jx", 1334 (intmax_t)dip->obj_asof); 1335 } else { 1336 *ap->a_vpp = NULL; 1337 lwkt_reltoken(&hmp->fs_token); 1338 return ENOENT; 1339 } 1340 } 1341 1342 hammer_simple_transaction(&trans, hmp); 1343 1344 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1345 asof, parent_obj_localization, 1346 dip->flags, &error); 1347 if (ip) { 1348 error = hammer_get_vnode(ip, ap->a_vpp); 1349 hammer_rel_inode(ip, 0); 1350 } else { 1351 *ap->a_vpp = NULL; 1352 } 1353 hammer_done_transaction(&trans); 1354 lwkt_reltoken(&hmp->fs_token); 1355 return (error); 1356 } 1357 1358 /* 1359 * hammer_vop_nlink { nch, dvp, vp, cred } 1360 */ 1361 static 1362 int 1363 hammer_vop_nlink(struct vop_nlink_args *ap) 1364 { 1365 struct hammer_transaction trans; 1366 hammer_inode_t dip; 1367 hammer_inode_t ip; 1368 struct nchandle *nch; 1369 hammer_mount_t hmp; 1370 int error; 1371 1372 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1373 return(EXDEV); 1374 1375 nch = ap->a_nch; 1376 dip = VTOI(ap->a_dvp); 1377 ip = VTOI(ap->a_vp); 1378 hmp = dip->hmp; 1379 1380 if (dip->obj_localization != ip->obj_localization) 1381 return(EXDEV); 1382 1383 if (dip->flags & HAMMER_INODE_RO) 1384 return (EROFS); 1385 if (ip->flags & HAMMER_INODE_RO) 1386 return (EROFS); 1387 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1388 return (error); 1389 1390 /* 1391 * Create a transaction to cover the operations we perform. 1392 */ 1393 lwkt_gettoken(&hmp->fs_token); 1394 hammer_start_transaction(&trans, hmp); 1395 1396 /* 1397 * Add the filesystem object to the directory. Note that neither 1398 * dip nor ip are referenced or locked, but their vnodes are 1399 * referenced. This function will bump the inode's link count. 1400 */ 1401 error = hammer_ip_add_direntry(&trans, dip, 1402 nch->ncp->nc_name, nch->ncp->nc_nlen, 1403 ip); 1404 1405 /* 1406 * Finish up. 1407 */ 1408 if (error == 0) { 1409 cache_setunresolved(nch); 1410 cache_setvp(nch, ap->a_vp); 1411 } 1412 hammer_done_transaction(&trans); 1413 hammer_knote(ap->a_vp, NOTE_LINK); 1414 hammer_knote(ap->a_dvp, NOTE_WRITE); 1415 lwkt_reltoken(&hmp->fs_token); 1416 return (error); 1417 } 1418 1419 /* 1420 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1421 * 1422 * The operating system has already ensured that the directory entry 1423 * does not exist and done all appropriate namespace locking. 1424 */ 1425 static 1426 int 1427 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1428 { 1429 struct hammer_transaction trans; 1430 hammer_inode_t dip; 1431 hammer_inode_t nip; 1432 struct nchandle *nch; 1433 hammer_mount_t hmp; 1434 int error; 1435 1436 nch = ap->a_nch; 1437 dip = VTOI(ap->a_dvp); 1438 hmp = dip->hmp; 1439 1440 if (dip->flags & HAMMER_INODE_RO) 1441 return (EROFS); 1442 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1443 return (error); 1444 1445 /* 1446 * Create a transaction to cover the operations we perform. 1447 */ 1448 lwkt_gettoken(&hmp->fs_token); 1449 hammer_start_transaction(&trans, hmp); 1450 1451 /* 1452 * Create a new filesystem object of the requested type. The 1453 * returned inode will be referenced but not locked. 1454 */ 1455 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1456 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1457 NULL, &nip); 1458 if (error) { 1459 hammer_done_transaction(&trans); 1460 *ap->a_vpp = NULL; 1461 lwkt_reltoken(&hmp->fs_token); 1462 return (error); 1463 } 1464 /* 1465 * Add the new filesystem object to the directory. This will also 1466 * bump the inode's link count. 1467 */ 1468 error = hammer_ip_add_direntry(&trans, dip, 1469 nch->ncp->nc_name, nch->ncp->nc_nlen, 1470 nip); 1471 if (error) 1472 hkprintf("hammer_mkdir (add) error %d\n", error); 1473 1474 /* 1475 * Finish up. 1476 */ 1477 if (error) { 1478 hammer_rel_inode(nip, 0); 1479 *ap->a_vpp = NULL; 1480 } else { 1481 error = hammer_get_vnode(nip, ap->a_vpp); 1482 hammer_rel_inode(nip, 0); 1483 if (error == 0) { 1484 cache_setunresolved(ap->a_nch); 1485 cache_setvp(ap->a_nch, *ap->a_vpp); 1486 } 1487 } 1488 hammer_done_transaction(&trans); 1489 if (error == 0) 1490 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1491 lwkt_reltoken(&hmp->fs_token); 1492 return (error); 1493 } 1494 1495 /* 1496 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1497 * 1498 * The operating system has already ensured that the directory entry 1499 * does not exist and done all appropriate namespace locking. 1500 */ 1501 static 1502 int 1503 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1504 { 1505 struct hammer_transaction trans; 1506 hammer_inode_t dip; 1507 hammer_inode_t nip; 1508 struct nchandle *nch; 1509 hammer_mount_t hmp; 1510 int error; 1511 1512 nch = ap->a_nch; 1513 dip = VTOI(ap->a_dvp); 1514 hmp = dip->hmp; 1515 1516 if (dip->flags & HAMMER_INODE_RO) 1517 return (EROFS); 1518 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1519 return (error); 1520 1521 /* 1522 * Create a transaction to cover the operations we perform. 1523 */ 1524 lwkt_gettoken(&hmp->fs_token); 1525 hammer_start_transaction(&trans, hmp); 1526 1527 /* 1528 * Create a new filesystem object of the requested type. The 1529 * returned inode will be referenced but not locked. 1530 * 1531 * If mknod specifies a directory a pseudo-fs is created. 1532 */ 1533 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1534 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1535 NULL, &nip); 1536 if (error) { 1537 hammer_done_transaction(&trans); 1538 *ap->a_vpp = NULL; 1539 lwkt_reltoken(&hmp->fs_token); 1540 return (error); 1541 } 1542 1543 /* 1544 * Add the new filesystem object to the directory. This will also 1545 * bump the inode's link count. 1546 */ 1547 error = hammer_ip_add_direntry(&trans, dip, 1548 nch->ncp->nc_name, nch->ncp->nc_nlen, 1549 nip); 1550 1551 /* 1552 * Finish up. 1553 */ 1554 if (error) { 1555 hammer_rel_inode(nip, 0); 1556 *ap->a_vpp = NULL; 1557 } else { 1558 error = hammer_get_vnode(nip, ap->a_vpp); 1559 hammer_rel_inode(nip, 0); 1560 if (error == 0) { 1561 cache_setunresolved(ap->a_nch); 1562 cache_setvp(ap->a_nch, *ap->a_vpp); 1563 } 1564 } 1565 hammer_done_transaction(&trans); 1566 if (error == 0) 1567 hammer_knote(ap->a_dvp, NOTE_WRITE); 1568 lwkt_reltoken(&hmp->fs_token); 1569 return (error); 1570 } 1571 1572 /* 1573 * hammer_vop_open { vp, mode, cred, fp } 1574 * 1575 * MPSAFE (does not require fs_token) 1576 */ 1577 static 1578 int 1579 hammer_vop_open(struct vop_open_args *ap) 1580 { 1581 hammer_inode_t ip; 1582 1583 ip = VTOI(ap->a_vp); 1584 1585 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1586 return (EROFS); 1587 return(vop_stdopen(ap)); 1588 } 1589 1590 /* 1591 * hammer_vop_print { vp } 1592 */ 1593 static 1594 int 1595 hammer_vop_print(struct vop_print_args *ap) 1596 { 1597 return EOPNOTSUPP; 1598 } 1599 1600 /* 1601 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1602 */ 1603 static 1604 int 1605 hammer_vop_readdir(struct vop_readdir_args *ap) 1606 { 1607 struct hammer_transaction trans; 1608 struct hammer_cursor cursor; 1609 hammer_inode_t ip; 1610 hammer_mount_t hmp; 1611 struct uio *uio; 1612 hammer_base_elm_t base; 1613 int error; 1614 int cookie_index; 1615 int ncookies; 1616 off_t *cookies; 1617 off_t saveoff; 1618 int r; 1619 int dtype; 1620 1621 ip = VTOI(ap->a_vp); 1622 uio = ap->a_uio; 1623 saveoff = uio->uio_offset; 1624 hmp = ip->hmp; 1625 1626 if (ap->a_ncookies) { 1627 ncookies = uio->uio_resid / 16 + 1; 1628 if (ncookies > 1024) 1629 ncookies = 1024; 1630 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1631 cookie_index = 0; 1632 } else { 1633 ncookies = -1; 1634 cookies = NULL; 1635 cookie_index = 0; 1636 } 1637 1638 lwkt_gettoken(&hmp->fs_token); 1639 hammer_simple_transaction(&trans, hmp); 1640 1641 /* 1642 * Handle artificial entries 1643 * 1644 * It should be noted that the minimum value for a directory 1645 * hash key on-media is 0x0000000100000000, so we can use anything 1646 * less then that to represent our 'special' key space. 1647 */ 1648 error = 0; 1649 if (saveoff == 0) { 1650 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1651 if (r) 1652 goto done; 1653 if (cookies) 1654 cookies[cookie_index] = saveoff; 1655 ++saveoff; 1656 ++cookie_index; 1657 if (cookie_index == ncookies) 1658 goto done; 1659 } 1660 if (saveoff == 1) { 1661 if (ip->ino_data.parent_obj_id) { 1662 r = vop_write_dirent(&error, uio, 1663 ip->ino_data.parent_obj_id, 1664 DT_DIR, 2, ".."); 1665 } else { 1666 r = vop_write_dirent(&error, uio, 1667 ip->obj_id, DT_DIR, 2, ".."); 1668 } 1669 if (r) 1670 goto done; 1671 if (cookies) 1672 cookies[cookie_index] = saveoff; 1673 ++saveoff; 1674 ++cookie_index; 1675 if (cookie_index == ncookies) 1676 goto done; 1677 } 1678 1679 /* 1680 * Key range (begin and end inclusive) to scan. Directory keys 1681 * directly translate to a 64 bit 'seek' position. 1682 */ 1683 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1684 cursor.key_beg.localization = ip->obj_localization | 1685 hammer_dir_localization(ip); 1686 cursor.key_beg.obj_id = ip->obj_id; 1687 cursor.key_beg.create_tid = 0; 1688 cursor.key_beg.delete_tid = 0; 1689 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1690 cursor.key_beg.obj_type = 0; 1691 cursor.key_beg.key = saveoff; 1692 1693 cursor.key_end = cursor.key_beg; 1694 cursor.key_end.key = HAMMER_MAX_KEY; 1695 cursor.asof = ip->obj_asof; 1696 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1697 1698 error = hammer_ip_first(&cursor); 1699 1700 while (error == 0) { 1701 error = hammer_ip_resolve_data(&cursor); 1702 if (error) 1703 break; 1704 base = &cursor.leaf->base; 1705 saveoff = base->key; 1706 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1707 1708 if (base->obj_id != ip->obj_id) 1709 hpanic("bad record at %p", cursor.node); 1710 1711 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1712 r = vop_write_dirent( 1713 &error, uio, cursor.data->entry.obj_id, 1714 dtype, 1715 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1716 (void *)cursor.data->entry.name); 1717 if (r) 1718 break; 1719 ++saveoff; 1720 if (cookies) 1721 cookies[cookie_index] = base->key; 1722 ++cookie_index; 1723 if (cookie_index == ncookies) 1724 break; 1725 error = hammer_ip_next(&cursor); 1726 } 1727 hammer_done_cursor(&cursor); 1728 1729 done: 1730 hammer_done_transaction(&trans); 1731 1732 if (ap->a_eofflag) 1733 *ap->a_eofflag = (error == ENOENT); 1734 uio->uio_offset = saveoff; 1735 if (error && cookie_index == 0) { 1736 if (error == ENOENT) 1737 error = 0; 1738 if (cookies) { 1739 kfree(cookies, M_TEMP); 1740 *ap->a_ncookies = 0; 1741 *ap->a_cookies = NULL; 1742 } 1743 } else { 1744 if (error == ENOENT) 1745 error = 0; 1746 if (cookies) { 1747 *ap->a_ncookies = cookie_index; 1748 *ap->a_cookies = cookies; 1749 } 1750 } 1751 lwkt_reltoken(&hmp->fs_token); 1752 return(error); 1753 } 1754 1755 /* 1756 * hammer_vop_readlink { vp, uio, cred } 1757 */ 1758 static 1759 int 1760 hammer_vop_readlink(struct vop_readlink_args *ap) 1761 { 1762 struct hammer_transaction trans; 1763 struct hammer_cursor cursor; 1764 hammer_inode_t ip; 1765 hammer_mount_t hmp; 1766 char buf[32]; 1767 uint32_t localization; 1768 hammer_pseudofs_inmem_t pfsm; 1769 int error; 1770 1771 ip = VTOI(ap->a_vp); 1772 hmp = ip->hmp; 1773 1774 lwkt_gettoken(&hmp->fs_token); 1775 1776 /* 1777 * Shortcut if the symlink data was stuffed into ino_data. 1778 * 1779 * Also expand special "@@PFS%05d" softlinks (expansion only 1780 * occurs for non-historical (current) accesses made from the 1781 * primary filesystem). 1782 * 1783 * Note that userspace hammer command does not allow users to 1784 * create a @@PFS softlink under an existing other PFS (id!=0) 1785 * so the ip localization here for @@PFS softlink is always 0. 1786 */ 1787 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1788 char *ptr; 1789 int bytes; 1790 1791 ptr = ip->ino_data.ext.symlink; 1792 bytes = (int)ip->ino_data.size; 1793 if (bytes == 10 && 1794 ip->obj_asof == HAMMER_MAX_TID && 1795 ip->obj_localization == HAMMER_DEF_LOCALIZATION && 1796 strncmp(ptr, "@@PFS", 5) == 0) { 1797 hammer_simple_transaction(&trans, hmp); 1798 bcopy(ptr + 5, buf, 5); 1799 buf[5] = 0; 1800 localization = pfs_to_lo(strtoul(buf, NULL, 10)); 1801 pfsm = hammer_load_pseudofs(&trans, localization, 1802 &error); 1803 if (error == 0) { 1804 if (hammer_is_pfs_slave(&pfsm->pfsd)) { 1805 /* vap->va_size == 26 */ 1806 ksnprintf(buf, sizeof(buf), 1807 "@@0x%016jx:%05d", 1808 (intmax_t)pfsm->pfsd.sync_end_tid, 1809 lo_to_pfs(localization)); 1810 } else { 1811 /* vap->va_size == 10 */ 1812 ksnprintf(buf, sizeof(buf), 1813 "@@-1:%05d", 1814 lo_to_pfs(localization)); 1815 } 1816 ptr = buf; 1817 bytes = strlen(buf); 1818 } 1819 if (pfsm) 1820 hammer_rel_pseudofs(hmp, pfsm); 1821 hammer_done_transaction(&trans); 1822 } 1823 error = uiomove(ptr, bytes, ap->a_uio); 1824 lwkt_reltoken(&hmp->fs_token); 1825 return(error); 1826 } 1827 1828 /* 1829 * Long version 1830 */ 1831 hammer_simple_transaction(&trans, hmp); 1832 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1833 1834 /* 1835 * Key range (begin and end inclusive) to scan. Directory keys 1836 * directly translate to a 64 bit 'seek' position. 1837 */ 1838 cursor.key_beg.localization = ip->obj_localization | 1839 HAMMER_LOCALIZE_MISC; 1840 cursor.key_beg.obj_id = ip->obj_id; 1841 cursor.key_beg.create_tid = 0; 1842 cursor.key_beg.delete_tid = 0; 1843 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1844 cursor.key_beg.obj_type = 0; 1845 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1846 cursor.asof = ip->obj_asof; 1847 cursor.flags |= HAMMER_CURSOR_ASOF; 1848 1849 error = hammer_ip_lookup(&cursor); 1850 if (error == 0) { 1851 error = hammer_ip_resolve_data(&cursor); 1852 if (error == 0) { 1853 KKASSERT(cursor.leaf->data_len >= 1854 HAMMER_SYMLINK_NAME_OFF); 1855 error = uiomove(cursor.data->symlink.name, 1856 cursor.leaf->data_len - 1857 HAMMER_SYMLINK_NAME_OFF, 1858 ap->a_uio); 1859 } 1860 } 1861 hammer_done_cursor(&cursor); 1862 hammer_done_transaction(&trans); 1863 lwkt_reltoken(&hmp->fs_token); 1864 return(error); 1865 } 1866 1867 /* 1868 * hammer_vop_nremove { nch, dvp, cred } 1869 */ 1870 static 1871 int 1872 hammer_vop_nremove(struct vop_nremove_args *ap) 1873 { 1874 struct hammer_transaction trans; 1875 hammer_inode_t dip; 1876 hammer_mount_t hmp; 1877 int error; 1878 1879 dip = VTOI(ap->a_dvp); 1880 hmp = dip->hmp; 1881 1882 if (hammer_nohistory(dip) == 0 && 1883 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1884 return (error); 1885 } 1886 1887 lwkt_gettoken(&hmp->fs_token); 1888 hammer_start_transaction(&trans, hmp); 1889 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1890 hammer_done_transaction(&trans); 1891 if (error == 0) 1892 hammer_knote(ap->a_dvp, NOTE_WRITE); 1893 lwkt_reltoken(&hmp->fs_token); 1894 return (error); 1895 } 1896 1897 /* 1898 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1899 */ 1900 static 1901 int 1902 hammer_vop_nrename(struct vop_nrename_args *ap) 1903 { 1904 struct hammer_transaction trans; 1905 struct namecache *fncp; 1906 struct namecache *tncp; 1907 hammer_inode_t fdip; 1908 hammer_inode_t tdip; 1909 hammer_inode_t ip; 1910 hammer_mount_t hmp; 1911 struct hammer_cursor cursor; 1912 int64_t namekey; 1913 uint32_t max_iterations; 1914 int nlen, error; 1915 1916 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1917 return(EXDEV); 1918 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1919 return(EXDEV); 1920 1921 fdip = VTOI(ap->a_fdvp); 1922 tdip = VTOI(ap->a_tdvp); 1923 fncp = ap->a_fnch->ncp; 1924 tncp = ap->a_tnch->ncp; 1925 ip = VTOI(fncp->nc_vp); 1926 KKASSERT(ip != NULL); 1927 1928 hmp = ip->hmp; 1929 1930 if (fdip->obj_localization != tdip->obj_localization) 1931 return(EXDEV); 1932 if (fdip->obj_localization != ip->obj_localization) 1933 return(EXDEV); 1934 1935 if (fdip->flags & HAMMER_INODE_RO) 1936 return (EROFS); 1937 if (tdip->flags & HAMMER_INODE_RO) 1938 return (EROFS); 1939 if (ip->flags & HAMMER_INODE_RO) 1940 return (EROFS); 1941 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1942 return (error); 1943 1944 lwkt_gettoken(&hmp->fs_token); 1945 hammer_start_transaction(&trans, hmp); 1946 1947 /* 1948 * Remove tncp from the target directory and then link ip as 1949 * tncp. XXX pass trans to dounlink 1950 * 1951 * Force the inode sync-time to match the transaction so it is 1952 * in-sync with the creation of the target directory entry. 1953 */ 1954 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1955 ap->a_cred, 0, -1); 1956 if (error == 0 || error == ENOENT) { 1957 error = hammer_ip_add_direntry(&trans, tdip, 1958 tncp->nc_name, tncp->nc_nlen, 1959 ip); 1960 if (error == 0) { 1961 ip->ino_data.parent_obj_id = tdip->obj_id; 1962 ip->ino_data.ctime = trans.time; 1963 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 1964 } 1965 } 1966 if (error) 1967 goto failed; /* XXX */ 1968 1969 /* 1970 * Locate the record in the originating directory and remove it. 1971 * 1972 * Calculate the namekey and setup the key range for the scan. This 1973 * works kinda like a chained hash table where the lower 32 bits 1974 * of the namekey synthesize the chain. 1975 * 1976 * The key range is inclusive of both key_beg and key_end. 1977 */ 1978 namekey = hammer_direntry_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 1979 &max_iterations); 1980 retry: 1981 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 1982 cursor.key_beg.localization = fdip->obj_localization | 1983 hammer_dir_localization(fdip); 1984 cursor.key_beg.obj_id = fdip->obj_id; 1985 cursor.key_beg.key = namekey; 1986 cursor.key_beg.create_tid = 0; 1987 cursor.key_beg.delete_tid = 0; 1988 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1989 cursor.key_beg.obj_type = 0; 1990 1991 cursor.key_end = cursor.key_beg; 1992 cursor.key_end.key += max_iterations; 1993 cursor.asof = fdip->obj_asof; 1994 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1995 1996 /* 1997 * Scan all matching records (the chain), locate the one matching 1998 * the requested path component. 1999 * 2000 * The hammer_ip_*() functions merge in-memory records with on-disk 2001 * records for the purposes of the search. 2002 */ 2003 error = hammer_ip_first(&cursor); 2004 while (error == 0) { 2005 if (hammer_ip_resolve_data(&cursor) != 0) 2006 break; 2007 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2008 KKASSERT(nlen > 0); 2009 if (fncp->nc_nlen == nlen && 2010 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2011 break; 2012 } 2013 error = hammer_ip_next(&cursor); 2014 } 2015 2016 /* 2017 * If all is ok we have to get the inode so we can adjust nlinks. 2018 * 2019 * WARNING: hammer_ip_del_direntry() may have to terminate the 2020 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2021 * twice. 2022 */ 2023 if (error == 0) 2024 error = hammer_ip_del_direntry(&trans, &cursor, fdip, ip); 2025 2026 /* 2027 * XXX A deadlock here will break rename's atomicy for the purposes 2028 * of crash recovery. 2029 */ 2030 if (error == EDEADLK) { 2031 hammer_done_cursor(&cursor); 2032 goto retry; 2033 } 2034 2035 /* 2036 * Cleanup and tell the kernel that the rename succeeded. 2037 * 2038 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2039 * without formally acquiring the vp since the vp might 2040 * have zero refs on it, or in the middle of a reclaim, 2041 * etc. 2042 */ 2043 hammer_done_cursor(&cursor); 2044 if (error == 0) { 2045 cache_rename(ap->a_fnch, ap->a_tnch); 2046 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2047 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2048 while (ip->vp) { 2049 struct vnode *vp; 2050 2051 error = hammer_get_vnode(ip, &vp); 2052 if (error == 0 && vp) { 2053 vn_unlock(vp); 2054 hammer_knote(ip->vp, NOTE_RENAME); 2055 vrele(vp); 2056 break; 2057 } 2058 hdkprintf("ip/vp race2 avoided\n"); 2059 } 2060 } 2061 2062 failed: 2063 hammer_done_transaction(&trans); 2064 lwkt_reltoken(&hmp->fs_token); 2065 return (error); 2066 } 2067 2068 /* 2069 * hammer_vop_nrmdir { nch, dvp, cred } 2070 */ 2071 static 2072 int 2073 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2074 { 2075 struct hammer_transaction trans; 2076 hammer_inode_t dip; 2077 hammer_mount_t hmp; 2078 int error; 2079 2080 dip = VTOI(ap->a_dvp); 2081 hmp = dip->hmp; 2082 2083 if (hammer_nohistory(dip) == 0 && 2084 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2085 return (error); 2086 } 2087 2088 lwkt_gettoken(&hmp->fs_token); 2089 hammer_start_transaction(&trans, hmp); 2090 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2091 hammer_done_transaction(&trans); 2092 if (error == 0) 2093 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2094 lwkt_reltoken(&hmp->fs_token); 2095 return (error); 2096 } 2097 2098 /* 2099 * hammer_vop_markatime { vp, cred } 2100 */ 2101 static 2102 int 2103 hammer_vop_markatime(struct vop_markatime_args *ap) 2104 { 2105 struct hammer_transaction trans; 2106 hammer_inode_t ip; 2107 hammer_mount_t hmp; 2108 2109 ip = VTOI(ap->a_vp); 2110 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2111 return (EROFS); 2112 if (ip->flags & HAMMER_INODE_RO) 2113 return (EROFS); 2114 hmp = ip->hmp; 2115 if (hmp->mp->mnt_flag & MNT_NOATIME) 2116 return (0); 2117 lwkt_gettoken(&hmp->fs_token); 2118 hammer_start_transaction(&trans, hmp); 2119 2120 ip->ino_data.atime = trans.time; 2121 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2122 hammer_done_transaction(&trans); 2123 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2124 lwkt_reltoken(&hmp->fs_token); 2125 return (0); 2126 } 2127 2128 /* 2129 * hammer_vop_setattr { vp, vap, cred } 2130 */ 2131 static 2132 int 2133 hammer_vop_setattr(struct vop_setattr_args *ap) 2134 { 2135 struct hammer_transaction trans; 2136 hammer_inode_t ip; 2137 struct vattr *vap; 2138 hammer_mount_t hmp; 2139 int modflags; 2140 int error; 2141 int truncating; 2142 int blksize; 2143 int kflags; 2144 #if 0 2145 int64_t aligned_size; 2146 #endif 2147 uint32_t flags; 2148 2149 vap = ap->a_vap; 2150 ip = ap->a_vp->v_data; 2151 modflags = 0; 2152 kflags = 0; 2153 hmp = ip->hmp; 2154 2155 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2156 return(EROFS); 2157 if (ip->flags & HAMMER_INODE_RO) 2158 return (EROFS); 2159 if (hammer_nohistory(ip) == 0 && 2160 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2161 return (error); 2162 } 2163 2164 lwkt_gettoken(&hmp->fs_token); 2165 hammer_start_transaction(&trans, hmp); 2166 error = 0; 2167 2168 if (vap->va_flags != VNOVAL) { 2169 flags = ip->ino_data.uflags; 2170 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2171 hammer_to_unix_xid(&ip->ino_data.uid), 2172 ap->a_cred); 2173 if (error == 0) { 2174 if (ip->ino_data.uflags != flags) { 2175 ip->ino_data.uflags = flags; 2176 ip->ino_data.ctime = trans.time; 2177 modflags |= HAMMER_INODE_DDIRTY; 2178 kflags |= NOTE_ATTRIB; 2179 } 2180 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2181 error = 0; 2182 goto done; 2183 } 2184 } 2185 goto done; 2186 } 2187 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2188 error = EPERM; 2189 goto done; 2190 } 2191 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2192 mode_t cur_mode = ip->ino_data.mode; 2193 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2194 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2195 uuid_t uuid_uid; 2196 uuid_t uuid_gid; 2197 2198 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2199 ap->a_cred, 2200 &cur_uid, &cur_gid, &cur_mode); 2201 if (error == 0) { 2202 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2203 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2204 if (bcmp(&uuid_uid, &ip->ino_data.uid, 2205 sizeof(uuid_uid)) || 2206 bcmp(&uuid_gid, &ip->ino_data.gid, 2207 sizeof(uuid_gid)) || 2208 ip->ino_data.mode != cur_mode) { 2209 ip->ino_data.uid = uuid_uid; 2210 ip->ino_data.gid = uuid_gid; 2211 ip->ino_data.mode = cur_mode; 2212 ip->ino_data.ctime = trans.time; 2213 modflags |= HAMMER_INODE_DDIRTY; 2214 } 2215 kflags |= NOTE_ATTRIB; 2216 } 2217 } 2218 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2219 switch(ap->a_vp->v_type) { 2220 case VREG: 2221 if (vap->va_size == ip->ino_data.size) 2222 break; 2223 2224 /* 2225 * Log the operation if in fast-fsync mode or if 2226 * there are unterminated redo write records present. 2227 * 2228 * The second check is needed so the recovery code 2229 * properly truncates write redos even if nominal 2230 * REDO operations is turned off due to excessive 2231 * writes, because the related records might be 2232 * destroyed and never lay down a TERM_WRITE. 2233 */ 2234 if ((ip->flags & HAMMER_INODE_REDO) || 2235 (ip->flags & HAMMER_INODE_RDIRTY)) { 2236 error = hammer_generate_redo(&trans, ip, 2237 vap->va_size, 2238 HAMMER_REDO_TRUNC, 2239 NULL, 0); 2240 } 2241 blksize = hammer_blocksize(vap->va_size); 2242 2243 /* 2244 * XXX break atomicy, we can deadlock the backend 2245 * if we do not release the lock. Probably not a 2246 * big deal here. 2247 */ 2248 if (vap->va_size < ip->ino_data.size) { 2249 nvtruncbuf(ap->a_vp, vap->va_size, 2250 blksize, 2251 hammer_blockoff(vap->va_size), 2252 0); 2253 truncating = 1; 2254 kflags |= NOTE_WRITE; 2255 } else { 2256 nvextendbuf(ap->a_vp, 2257 ip->ino_data.size, 2258 vap->va_size, 2259 hammer_blocksize(ip->ino_data.size), 2260 hammer_blocksize(vap->va_size), 2261 hammer_blockoff(ip->ino_data.size), 2262 hammer_blockoff(vap->va_size), 2263 0); 2264 truncating = 0; 2265 kflags |= NOTE_WRITE | NOTE_EXTEND; 2266 } 2267 ip->ino_data.size = vap->va_size; 2268 ip->ino_data.mtime = trans.time; 2269 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2270 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2271 2272 /* 2273 * On-media truncation is cached in the inode until 2274 * the inode is synchronized. We must immediately 2275 * handle any frontend records. 2276 */ 2277 if (truncating) { 2278 hammer_ip_frontend_trunc(ip, vap->va_size); 2279 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2280 ip->flags |= HAMMER_INODE_TRUNCATED; 2281 ip->trunc_off = vap->va_size; 2282 hammer_inode_dirty(ip); 2283 } else if (ip->trunc_off > vap->va_size) { 2284 ip->trunc_off = vap->va_size; 2285 } 2286 } 2287 2288 #if 0 2289 /* 2290 * When truncating, nvtruncbuf() may have cleaned out 2291 * a portion of the last block on-disk in the buffer 2292 * cache. We must clean out any frontend records 2293 * for blocks beyond the new last block. 2294 */ 2295 aligned_size = (vap->va_size + (blksize - 1)) & 2296 ~(int64_t)(blksize - 1); 2297 if (truncating && vap->va_size < aligned_size) { 2298 aligned_size -= blksize; 2299 hammer_ip_frontend_trunc(ip, aligned_size); 2300 } 2301 #endif 2302 break; 2303 case VDATABASE: 2304 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2305 ip->flags |= HAMMER_INODE_TRUNCATED; 2306 ip->trunc_off = vap->va_size; 2307 hammer_inode_dirty(ip); 2308 } else if (ip->trunc_off > vap->va_size) { 2309 ip->trunc_off = vap->va_size; 2310 } 2311 hammer_ip_frontend_trunc(ip, vap->va_size); 2312 ip->ino_data.size = vap->va_size; 2313 ip->ino_data.mtime = trans.time; 2314 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2315 kflags |= NOTE_ATTRIB; 2316 break; 2317 default: 2318 error = EINVAL; 2319 goto done; 2320 } 2321 break; 2322 } 2323 if (vap->va_atime.tv_sec != VNOVAL) { 2324 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2325 modflags |= HAMMER_INODE_ATIME; 2326 kflags |= NOTE_ATTRIB; 2327 } 2328 if (vap->va_mtime.tv_sec != VNOVAL) { 2329 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2330 modflags |= HAMMER_INODE_MTIME; 2331 kflags |= NOTE_ATTRIB; 2332 } 2333 if (vap->va_mode != (mode_t)VNOVAL) { 2334 mode_t cur_mode = ip->ino_data.mode; 2335 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2336 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2337 2338 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2339 cur_uid, cur_gid, &cur_mode); 2340 if (error == 0 && ip->ino_data.mode != cur_mode) { 2341 ip->ino_data.mode = cur_mode; 2342 ip->ino_data.ctime = trans.time; 2343 modflags |= HAMMER_INODE_DDIRTY; 2344 kflags |= NOTE_ATTRIB; 2345 } 2346 } 2347 done: 2348 if (error == 0) 2349 hammer_modify_inode(&trans, ip, modflags); 2350 hammer_done_transaction(&trans); 2351 hammer_knote(ap->a_vp, kflags); 2352 lwkt_reltoken(&hmp->fs_token); 2353 return (error); 2354 } 2355 2356 /* 2357 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 2358 */ 2359 static 2360 int 2361 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2362 { 2363 struct hammer_transaction trans; 2364 hammer_inode_t dip; 2365 hammer_inode_t nip; 2366 hammer_record_t record; 2367 struct nchandle *nch; 2368 hammer_mount_t hmp; 2369 int error; 2370 int bytes; 2371 2372 ap->a_vap->va_type = VLNK; 2373 2374 nch = ap->a_nch; 2375 dip = VTOI(ap->a_dvp); 2376 hmp = dip->hmp; 2377 2378 if (dip->flags & HAMMER_INODE_RO) 2379 return (EROFS); 2380 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2381 return (error); 2382 2383 /* 2384 * Create a transaction to cover the operations we perform. 2385 */ 2386 lwkt_gettoken(&hmp->fs_token); 2387 hammer_start_transaction(&trans, hmp); 2388 2389 /* 2390 * Create a new filesystem object of the requested type. The 2391 * returned inode will be referenced but not locked. 2392 */ 2393 2394 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2395 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2396 NULL, &nip); 2397 if (error) { 2398 hammer_done_transaction(&trans); 2399 *ap->a_vpp = NULL; 2400 lwkt_reltoken(&hmp->fs_token); 2401 return (error); 2402 } 2403 2404 /* 2405 * Add a record representing the symlink. symlink stores the link 2406 * as pure data, not a string, and is no \0 terminated. 2407 */ 2408 if (error == 0) { 2409 bytes = strlen(ap->a_target); 2410 2411 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2412 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2413 } else { 2414 record = hammer_alloc_mem_record(nip, bytes); 2415 record->type = HAMMER_MEM_RECORD_GENERAL; 2416 2417 record->leaf.base.localization = nip->obj_localization | 2418 HAMMER_LOCALIZE_MISC; 2419 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2420 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2421 record->leaf.data_len = bytes; 2422 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2423 bcopy(ap->a_target, record->data->symlink.name, bytes); 2424 error = hammer_ip_add_record(&trans, record); 2425 } 2426 2427 /* 2428 * Set the file size to the length of the link. 2429 */ 2430 if (error == 0) { 2431 nip->ino_data.size = bytes; 2432 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2433 } 2434 } 2435 if (error == 0) 2436 error = hammer_ip_add_direntry(&trans, dip, nch->ncp->nc_name, 2437 nch->ncp->nc_nlen, nip); 2438 2439 /* 2440 * Finish up. 2441 */ 2442 if (error) { 2443 hammer_rel_inode(nip, 0); 2444 *ap->a_vpp = NULL; 2445 } else { 2446 error = hammer_get_vnode(nip, ap->a_vpp); 2447 hammer_rel_inode(nip, 0); 2448 if (error == 0) { 2449 cache_setunresolved(ap->a_nch); 2450 cache_setvp(ap->a_nch, *ap->a_vpp); 2451 hammer_knote(ap->a_dvp, NOTE_WRITE); 2452 } 2453 } 2454 hammer_done_transaction(&trans); 2455 lwkt_reltoken(&hmp->fs_token); 2456 return (error); 2457 } 2458 2459 /* 2460 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2461 */ 2462 static 2463 int 2464 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2465 { 2466 struct hammer_transaction trans; 2467 hammer_inode_t dip; 2468 hammer_mount_t hmp; 2469 int error; 2470 2471 dip = VTOI(ap->a_dvp); 2472 hmp = dip->hmp; 2473 2474 if (hammer_nohistory(dip) == 0 && 2475 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2476 return (error); 2477 } 2478 2479 lwkt_gettoken(&hmp->fs_token); 2480 hammer_start_transaction(&trans, hmp); 2481 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2482 ap->a_cred, ap->a_flags, -1); 2483 hammer_done_transaction(&trans); 2484 lwkt_reltoken(&hmp->fs_token); 2485 2486 return (error); 2487 } 2488 2489 /* 2490 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2491 */ 2492 static 2493 int 2494 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2495 { 2496 hammer_inode_t ip = ap->a_vp->v_data; 2497 hammer_mount_t hmp = ip->hmp; 2498 int error; 2499 2500 lwkt_gettoken(&hmp->fs_token); 2501 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2502 ap->a_fflag, ap->a_cred); 2503 lwkt_reltoken(&hmp->fs_token); 2504 return (error); 2505 } 2506 2507 static 2508 int 2509 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2510 { 2511 static const struct mountctl_opt extraopt[] = { 2512 { HMNT_NOHISTORY, "nohistory" }, 2513 { HMNT_MASTERID, "master" }, 2514 { HMNT_NOMIRROR, "nomirror" }, 2515 { 0, NULL} 2516 2517 }; 2518 hammer_mount_t hmp; 2519 struct mount *mp; 2520 int usedbytes; 2521 int error; 2522 2523 error = 0; 2524 usedbytes = 0; 2525 mp = ap->a_head.a_ops->head.vv_mount; 2526 KKASSERT(mp->mnt_data != NULL); 2527 hmp = (hammer_mount_t)mp->mnt_data; 2528 2529 lwkt_gettoken(&hmp->fs_token); 2530 2531 switch(ap->a_op) { 2532 case MOUNTCTL_SET_EXPORT: 2533 if (ap->a_ctllen != sizeof(struct export_args)) 2534 error = EINVAL; 2535 else 2536 error = hammer_vfs_export(mp, ap->a_op, 2537 (const struct export_args *)ap->a_ctl); 2538 break; 2539 case MOUNTCTL_MOUNTFLAGS: 2540 /* 2541 * Call standard mountctl VOP function 2542 * so we get user mount flags. 2543 */ 2544 error = vop_stdmountctl(ap); 2545 if (error) 2546 break; 2547 2548 usedbytes = *ap->a_res; 2549 2550 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2551 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2552 ap->a_buf, 2553 ap->a_buflen - usedbytes, 2554 &error); 2555 } 2556 2557 *ap->a_res += usedbytes; 2558 break; 2559 default: 2560 error = vop_stdmountctl(ap); 2561 break; 2562 } 2563 lwkt_reltoken(&hmp->fs_token); 2564 return(error); 2565 } 2566 2567 /* 2568 * hammer_vop_strategy { vp, bio } 2569 * 2570 * Strategy call, used for regular file read & write only. Note that the 2571 * bp may represent a cluster. 2572 * 2573 * To simplify operation and allow better optimizations in the future, 2574 * this code does not make any assumptions with regards to buffer alignment 2575 * or size. 2576 */ 2577 static 2578 int 2579 hammer_vop_strategy(struct vop_strategy_args *ap) 2580 { 2581 struct buf *bp; 2582 int error; 2583 2584 bp = ap->a_bio->bio_buf; 2585 2586 switch(bp->b_cmd) { 2587 case BUF_CMD_READ: 2588 error = hammer_vop_strategy_read(ap); 2589 break; 2590 case BUF_CMD_WRITE: 2591 error = hammer_vop_strategy_write(ap); 2592 break; 2593 default: 2594 bp->b_error = error = EINVAL; 2595 bp->b_flags |= B_ERROR; 2596 biodone(ap->a_bio); 2597 break; 2598 } 2599 2600 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2601 2602 return (error); 2603 } 2604 2605 /* 2606 * Read from a regular file. Iterate the related records and fill in the 2607 * BIO/BUF. Gaps are zero-filled. 2608 * 2609 * The support code in hammer_object.c should be used to deal with mixed 2610 * in-memory and on-disk records. 2611 * 2612 * NOTE: Can be called from the cluster code with an oversized buf. 2613 * 2614 * XXX atime update 2615 */ 2616 static 2617 int 2618 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2619 { 2620 struct hammer_transaction trans; 2621 hammer_inode_t ip; 2622 hammer_inode_t dip; 2623 hammer_mount_t hmp; 2624 struct hammer_cursor cursor; 2625 hammer_base_elm_t base; 2626 hammer_off_t disk_offset; 2627 struct bio *bio; 2628 struct bio *nbio; 2629 struct buf *bp; 2630 int64_t rec_offset; 2631 int64_t ran_end; 2632 int64_t tmp64; 2633 int error; 2634 int boff; 2635 int roff; 2636 int n; 2637 int isdedupable; 2638 2639 bio = ap->a_bio; 2640 bp = bio->bio_buf; 2641 ip = ap->a_vp->v_data; 2642 hmp = ip->hmp; 2643 2644 /* 2645 * The zone-2 disk offset may have been set by the cluster code via 2646 * a BMAP operation, or else should be NOOFFSET. 2647 * 2648 * Checking the high bits for a match against zone-2 should suffice. 2649 * 2650 * In cases where a lot of data duplication is present it may be 2651 * more beneficial to drop through and doubule-buffer through the 2652 * device. 2653 */ 2654 nbio = push_bio(bio); 2655 if (hammer_is_zone_large_data(nbio->bio_offset)) { 2656 if (hammer_double_buffer == 0) { 2657 lwkt_gettoken(&hmp->fs_token); 2658 error = hammer_io_direct_read(hmp, nbio, NULL); 2659 lwkt_reltoken(&hmp->fs_token); 2660 return (error); 2661 } 2662 2663 /* 2664 * Try to shortcut requests for double_buffer mode too. 2665 * Since this mode runs through the device buffer cache 2666 * only compatible buffer sizes (meaning those generated 2667 * by normal filesystem buffers) are legal. 2668 */ 2669 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) { 2670 lwkt_gettoken(&hmp->fs_token); 2671 error = hammer_io_indirect_read(hmp, nbio, NULL); 2672 lwkt_reltoken(&hmp->fs_token); 2673 return (error); 2674 } 2675 } 2676 2677 /* 2678 * Well, that sucked. Do it the hard way. If all the stars are 2679 * aligned we may still be able to issue a direct-read. 2680 */ 2681 lwkt_gettoken(&hmp->fs_token); 2682 hammer_simple_transaction(&trans, hmp); 2683 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2684 2685 /* 2686 * Key range (begin and end inclusive) to scan. Note that the key's 2687 * stored in the actual records represent BASE+LEN, not BASE. The 2688 * first record containing bio_offset will have a key > bio_offset. 2689 */ 2690 cursor.key_beg.localization = ip->obj_localization | 2691 HAMMER_LOCALIZE_MISC; 2692 cursor.key_beg.obj_id = ip->obj_id; 2693 cursor.key_beg.create_tid = 0; 2694 cursor.key_beg.delete_tid = 0; 2695 cursor.key_beg.obj_type = 0; 2696 cursor.key_beg.key = bio->bio_offset + 1; 2697 cursor.asof = ip->obj_asof; 2698 cursor.flags |= HAMMER_CURSOR_ASOF; 2699 2700 cursor.key_end = cursor.key_beg; 2701 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2702 #if 0 2703 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2704 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2705 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2706 cursor.key_end.key = HAMMER_MAX_KEY; 2707 } else 2708 #endif 2709 { 2710 ran_end = bio->bio_offset + bp->b_bufsize; 2711 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2712 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2713 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2714 if (tmp64 < ran_end) 2715 cursor.key_end.key = HAMMER_MAX_KEY; 2716 else 2717 cursor.key_end.key = ran_end + MAXPHYS + 1; 2718 } 2719 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2720 2721 /* 2722 * Set NOSWAPCACHE for cursor data extraction if double buffering 2723 * is disabled or (if the file is not marked cacheable via chflags 2724 * and vm.swapcache_use_chflags is enabled). 2725 */ 2726 if (hammer_double_buffer == 0 || 2727 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 && 2728 vm_swapcache_use_chflags)) { 2729 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE; 2730 } 2731 2732 error = hammer_ip_first(&cursor); 2733 boff = 0; 2734 2735 while (error == 0) { 2736 /* 2737 * Get the base file offset of the record. The key for 2738 * data records is (base + bytes) rather then (base). 2739 */ 2740 base = &cursor.leaf->base; 2741 rec_offset = base->key - cursor.leaf->data_len; 2742 2743 /* 2744 * Calculate the gap, if any, and zero-fill it. 2745 * 2746 * n is the offset of the start of the record verses our 2747 * current seek offset in the bio. 2748 */ 2749 n = (int)(rec_offset - (bio->bio_offset + boff)); 2750 if (n > 0) { 2751 if (n > bp->b_bufsize - boff) 2752 n = bp->b_bufsize - boff; 2753 bzero((char *)bp->b_data + boff, n); 2754 boff += n; 2755 n = 0; 2756 } 2757 2758 /* 2759 * Calculate the data offset in the record and the number 2760 * of bytes we can copy. 2761 * 2762 * There are two degenerate cases. First, boff may already 2763 * be at bp->b_bufsize. Secondly, the data offset within 2764 * the record may exceed the record's size. 2765 */ 2766 roff = -n; 2767 rec_offset += roff; 2768 n = cursor.leaf->data_len - roff; 2769 if (n <= 0) { 2770 hdkprintf("bad n=%d roff=%d\n", n, roff); 2771 n = 0; 2772 } else if (n > bp->b_bufsize - boff) { 2773 n = bp->b_bufsize - boff; 2774 } 2775 2776 /* 2777 * Deal with cached truncations. This cool bit of code 2778 * allows truncate()/ftruncate() to avoid having to sync 2779 * the file. 2780 * 2781 * If the frontend is truncated then all backend records are 2782 * subject to the frontend's truncation. 2783 * 2784 * If the backend is truncated then backend records on-disk 2785 * (but not in-memory) are subject to the backend's 2786 * truncation. In-memory records owned by the backend 2787 * represent data written after the truncation point on the 2788 * backend and must not be truncated. 2789 * 2790 * Truncate operations deal with frontend buffer cache 2791 * buffers and frontend-owned in-memory records synchronously. 2792 */ 2793 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2794 if (hammer_cursor_ondisk(&cursor)/* || 2795 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2796 if (ip->trunc_off <= rec_offset) 2797 n = 0; 2798 else if (ip->trunc_off < rec_offset + n) 2799 n = (int)(ip->trunc_off - rec_offset); 2800 } 2801 } 2802 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2803 if (hammer_cursor_ondisk(&cursor)) { 2804 if (ip->sync_trunc_off <= rec_offset) 2805 n = 0; 2806 else if (ip->sync_trunc_off < rec_offset + n) 2807 n = (int)(ip->sync_trunc_off - rec_offset); 2808 } 2809 } 2810 2811 /* 2812 * Try to issue a direct read into our bio if possible, 2813 * otherwise resolve the element data into a hammer_buffer 2814 * and copy. 2815 * 2816 * The buffer on-disk should be zerod past any real 2817 * truncation point, but may not be for any synthesized 2818 * truncation point from above. 2819 * 2820 * NOTE: disk_offset is only valid if the cursor data is 2821 * on-disk. 2822 */ 2823 disk_offset = cursor.leaf->data_offset + roff; 2824 isdedupable = (boff == 0 && n == bp->b_bufsize && 2825 hammer_cursor_ondisk(&cursor) && 2826 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2827 2828 if (isdedupable && hammer_double_buffer == 0) { 2829 /* 2830 * Direct read case 2831 */ 2832 KKASSERT(hammer_is_zone_large_data(disk_offset)); 2833 nbio->bio_offset = disk_offset; 2834 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2835 if (hammer_live_dedup && error == 0) 2836 hammer_dedup_cache_add(ip, cursor.leaf); 2837 goto done; 2838 } else if (isdedupable) { 2839 /* 2840 * Async I/O case for reading from backing store 2841 * and copying the data to the filesystem buffer. 2842 * live-dedup has to verify the data anyway if it 2843 * gets a hit later so we can just add the entry 2844 * now. 2845 */ 2846 KKASSERT(hammer_is_zone_large_data(disk_offset)); 2847 nbio->bio_offset = disk_offset; 2848 if (hammer_live_dedup) 2849 hammer_dedup_cache_add(ip, cursor.leaf); 2850 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf); 2851 goto done; 2852 } else if (n) { 2853 error = hammer_ip_resolve_data(&cursor); 2854 if (error == 0) { 2855 if (hammer_live_dedup && isdedupable) 2856 hammer_dedup_cache_add(ip, cursor.leaf); 2857 bcopy((char *)cursor.data + roff, 2858 (char *)bp->b_data + boff, n); 2859 } 2860 } 2861 if (error) 2862 break; 2863 2864 /* 2865 * We have to be sure that the only elements added to the 2866 * dedup cache are those which are already on-media. 2867 */ 2868 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) 2869 hammer_dedup_cache_add(ip, cursor.leaf); 2870 2871 /* 2872 * Iterate until we have filled the request. 2873 */ 2874 boff += n; 2875 if (boff == bp->b_bufsize) 2876 break; 2877 error = hammer_ip_next(&cursor); 2878 } 2879 2880 /* 2881 * There may have been a gap after the last record 2882 */ 2883 if (error == ENOENT) 2884 error = 0; 2885 if (error == 0 && boff != bp->b_bufsize) { 2886 KKASSERT(boff < bp->b_bufsize); 2887 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2888 /* boff = bp->b_bufsize; */ 2889 } 2890 2891 /* 2892 * Disallow swapcache operation on the vnode buffer if double 2893 * buffering is enabled, the swapcache will get the data via 2894 * the block device buffer. 2895 */ 2896 if (hammer_double_buffer) 2897 bp->b_flags |= B_NOTMETA; 2898 2899 /* 2900 * Cleanup 2901 */ 2902 bp->b_resid = 0; 2903 bp->b_error = error; 2904 if (error) 2905 bp->b_flags |= B_ERROR; 2906 biodone(ap->a_bio); 2907 2908 done: 2909 /* 2910 * Cache the b-tree node for the last data read in cache[1]. 2911 * 2912 * If we hit the file EOF then also cache the node in the 2913 * governing directory's cache[3], it will be used to initialize 2914 * the new inode's cache[1] for any inodes looked up via the directory. 2915 * 2916 * This doesn't reduce disk accesses since the B-Tree chain is 2917 * likely cached, but it does reduce cpu overhead when looking 2918 * up file offsets for cpdup/tar/cpio style iterations. 2919 */ 2920 if (cursor.node) 2921 hammer_cache_node(&ip->cache[1], cursor.node); 2922 if (ran_end >= ip->ino_data.size) { 2923 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 2924 ip->obj_asof, ip->obj_localization); 2925 if (dip) { 2926 hammer_cache_node(&dip->cache[3], cursor.node); 2927 hammer_rel_inode(dip, 0); 2928 } 2929 } 2930 hammer_done_cursor(&cursor); 2931 hammer_done_transaction(&trans); 2932 lwkt_reltoken(&hmp->fs_token); 2933 return(error); 2934 } 2935 2936 /* 2937 * BMAP operation - used to support cluster_read() only. 2938 * 2939 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 2940 * 2941 * This routine may return EOPNOTSUPP if the opration is not supported for 2942 * the specified offset. The contents of the pointer arguments do not 2943 * need to be initialized in that case. 2944 * 2945 * If a disk address is available and properly aligned return 0 with 2946 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 2947 * to the run-length relative to that offset. Callers may assume that 2948 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 2949 * large, so return EOPNOTSUPP if it is not sufficiently large. 2950 */ 2951 static 2952 int 2953 hammer_vop_bmap(struct vop_bmap_args *ap) 2954 { 2955 struct hammer_transaction trans; 2956 hammer_inode_t ip; 2957 hammer_mount_t hmp; 2958 struct hammer_cursor cursor; 2959 hammer_base_elm_t base; 2960 int64_t rec_offset; 2961 int64_t ran_end; 2962 int64_t tmp64; 2963 int64_t base_offset; 2964 int64_t base_disk_offset; 2965 int64_t last_offset; 2966 hammer_off_t last_disk_offset; 2967 hammer_off_t disk_offset; 2968 int rec_len; 2969 int error; 2970 int blksize; 2971 2972 ip = ap->a_vp->v_data; 2973 hmp = ip->hmp; 2974 2975 /* 2976 * We can only BMAP regular files. We can't BMAP database files, 2977 * directories, etc. 2978 */ 2979 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 2980 return(EOPNOTSUPP); 2981 2982 /* 2983 * bmap is typically called with runp/runb both NULL when used 2984 * for writing. We do not support BMAP for writing atm. 2985 */ 2986 if (ap->a_cmd != BUF_CMD_READ) 2987 return(EOPNOTSUPP); 2988 2989 /* 2990 * Scan the B-Tree to acquire blockmap addresses, then translate 2991 * to raw addresses. 2992 */ 2993 lwkt_gettoken(&hmp->fs_token); 2994 hammer_simple_transaction(&trans, hmp); 2995 2996 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2997 2998 /* 2999 * Key range (begin and end inclusive) to scan. Note that the key's 3000 * stored in the actual records represent BASE+LEN, not BASE. The 3001 * first record containing bio_offset will have a key > bio_offset. 3002 */ 3003 cursor.key_beg.localization = ip->obj_localization | 3004 HAMMER_LOCALIZE_MISC; 3005 cursor.key_beg.obj_id = ip->obj_id; 3006 cursor.key_beg.create_tid = 0; 3007 cursor.key_beg.delete_tid = 0; 3008 cursor.key_beg.obj_type = 0; 3009 if (ap->a_runb) 3010 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 3011 else 3012 cursor.key_beg.key = ap->a_loffset + 1; 3013 if (cursor.key_beg.key < 0) 3014 cursor.key_beg.key = 0; 3015 cursor.asof = ip->obj_asof; 3016 cursor.flags |= HAMMER_CURSOR_ASOF; 3017 3018 cursor.key_end = cursor.key_beg; 3019 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 3020 3021 ran_end = ap->a_loffset + MAXPHYS; 3022 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 3023 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 3024 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 3025 if (tmp64 < ran_end) 3026 cursor.key_end.key = HAMMER_MAX_KEY; 3027 else 3028 cursor.key_end.key = ran_end + MAXPHYS + 1; 3029 3030 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 3031 3032 error = hammer_ip_first(&cursor); 3033 base_offset = last_offset = 0; 3034 base_disk_offset = last_disk_offset = 0; 3035 3036 while (error == 0) { 3037 /* 3038 * Get the base file offset of the record. The key for 3039 * data records is (base + bytes) rather then (base). 3040 * 3041 * NOTE: rec_offset + rec_len may exceed the end-of-file. 3042 * The extra bytes should be zero on-disk and the BMAP op 3043 * should still be ok. 3044 */ 3045 base = &cursor.leaf->base; 3046 rec_offset = base->key - cursor.leaf->data_len; 3047 rec_len = cursor.leaf->data_len; 3048 3049 /* 3050 * Incorporate any cached truncation. 3051 * 3052 * NOTE: Modifications to rec_len based on synthesized 3053 * truncation points remove the guarantee that any extended 3054 * data on disk is zero (since the truncations may not have 3055 * taken place on-media yet). 3056 */ 3057 if (ip->flags & HAMMER_INODE_TRUNCATED) { 3058 if (hammer_cursor_ondisk(&cursor) || 3059 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 3060 if (ip->trunc_off <= rec_offset) 3061 rec_len = 0; 3062 else if (ip->trunc_off < rec_offset + rec_len) 3063 rec_len = (int)(ip->trunc_off - rec_offset); 3064 } 3065 } 3066 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 3067 if (hammer_cursor_ondisk(&cursor)) { 3068 if (ip->sync_trunc_off <= rec_offset) 3069 rec_len = 0; 3070 else if (ip->sync_trunc_off < rec_offset + rec_len) 3071 rec_len = (int)(ip->sync_trunc_off - rec_offset); 3072 } 3073 } 3074 3075 /* 3076 * Accumulate information. If we have hit a discontiguous 3077 * block reset base_offset unless we are already beyond the 3078 * requested offset. If we are, that's it, we stop. 3079 */ 3080 if (error) 3081 break; 3082 if (hammer_cursor_ondisk(&cursor)) { 3083 disk_offset = cursor.leaf->data_offset; 3084 if (rec_offset != last_offset || 3085 disk_offset != last_disk_offset) { 3086 if (rec_offset > ap->a_loffset) 3087 break; 3088 base_offset = rec_offset; 3089 base_disk_offset = disk_offset; 3090 } 3091 last_offset = rec_offset + rec_len; 3092 last_disk_offset = disk_offset + rec_len; 3093 3094 if (hammer_live_dedup) 3095 hammer_dedup_cache_add(ip, cursor.leaf); 3096 } 3097 3098 error = hammer_ip_next(&cursor); 3099 } 3100 3101 if (cursor.node) 3102 hammer_cache_node(&ip->cache[1], cursor.node); 3103 3104 hammer_done_cursor(&cursor); 3105 hammer_done_transaction(&trans); 3106 lwkt_reltoken(&hmp->fs_token); 3107 3108 /* 3109 * If we couldn't find any records or the records we did find were 3110 * all behind the requested offset, return failure. A forward 3111 * truncation can leave a hole w/ no on-disk records. 3112 */ 3113 if (last_offset == 0 || last_offset < ap->a_loffset) 3114 return (EOPNOTSUPP); 3115 3116 /* 3117 * Figure out the block size at the requested offset and adjust 3118 * our limits so the cluster_read() does not create inappropriately 3119 * sized buffer cache buffers. 3120 */ 3121 blksize = hammer_blocksize(ap->a_loffset); 3122 if (hammer_blocksize(base_offset) != blksize) { 3123 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 3124 } 3125 if (last_offset != ap->a_loffset && 3126 hammer_blocksize(last_offset - 1) != blksize) { 3127 last_offset = hammer_blockdemarc(ap->a_loffset, 3128 last_offset - 1); 3129 } 3130 3131 /* 3132 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 3133 * from occuring. 3134 */ 3135 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 3136 3137 if (!hammer_is_zone_large_data(disk_offset)) { 3138 /* 3139 * Only large-data zones can be direct-IOd 3140 */ 3141 error = EOPNOTSUPP; 3142 } else if ((disk_offset & HAMMER_BUFMASK) || 3143 (last_offset - ap->a_loffset) < blksize) { 3144 /* 3145 * doffsetp is not aligned or the forward run size does 3146 * not cover a whole buffer, disallow the direct I/O. 3147 */ 3148 error = EOPNOTSUPP; 3149 } else { 3150 /* 3151 * We're good. 3152 */ 3153 *ap->a_doffsetp = disk_offset; 3154 if (ap->a_runb) { 3155 *ap->a_runb = ap->a_loffset - base_offset; 3156 KKASSERT(*ap->a_runb >= 0); 3157 } 3158 if (ap->a_runp) { 3159 *ap->a_runp = last_offset - ap->a_loffset; 3160 KKASSERT(*ap->a_runp >= 0); 3161 } 3162 error = 0; 3163 } 3164 return(error); 3165 } 3166 3167 /* 3168 * Write to a regular file. Because this is a strategy call the OS is 3169 * trying to actually get data onto the media. 3170 */ 3171 static 3172 int 3173 hammer_vop_strategy_write(struct vop_strategy_args *ap) 3174 { 3175 hammer_record_t record; 3176 hammer_mount_t hmp; 3177 hammer_inode_t ip; 3178 struct bio *bio; 3179 struct buf *bp; 3180 int blksize __debugvar; 3181 int bytes; 3182 int error; 3183 3184 bio = ap->a_bio; 3185 bp = bio->bio_buf; 3186 ip = ap->a_vp->v_data; 3187 hmp = ip->hmp; 3188 3189 blksize = hammer_blocksize(bio->bio_offset); 3190 KKASSERT(bp->b_bufsize == blksize); 3191 3192 if (ip->flags & HAMMER_INODE_RO) { 3193 bp->b_error = EROFS; 3194 bp->b_flags |= B_ERROR; 3195 biodone(ap->a_bio); 3196 return(EROFS); 3197 } 3198 3199 lwkt_gettoken(&hmp->fs_token); 3200 3201 /* 3202 * Disallow swapcache operation on the vnode buffer if double 3203 * buffering is enabled, the swapcache will get the data via 3204 * the block device buffer. 3205 */ 3206 if (hammer_double_buffer) 3207 bp->b_flags |= B_NOTMETA; 3208 3209 /* 3210 * Interlock with inode destruction (no in-kernel or directory 3211 * topology visibility). If we queue new IO while trying to 3212 * destroy the inode we can deadlock the vtrunc call in 3213 * hammer_inode_unloadable_check(). 3214 * 3215 * Besides, there's no point flushing a bp associated with an 3216 * inode that is being destroyed on-media and has no kernel 3217 * references. 3218 */ 3219 if ((ip->flags | ip->sync_flags) & 3220 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 3221 bp->b_resid = 0; 3222 biodone(ap->a_bio); 3223 lwkt_reltoken(&hmp->fs_token); 3224 return(0); 3225 } 3226 3227 /* 3228 * Reserve space and issue a direct-write from the front-end. 3229 * NOTE: The direct_io code will hammer_bread/bcopy smaller 3230 * allocations. 3231 * 3232 * An in-memory record will be installed to reference the storage 3233 * until the flusher can get to it. 3234 * 3235 * Since we own the high level bio the front-end will not try to 3236 * do a direct-read until the write completes. 3237 * 3238 * NOTE: The only time we do not reserve a full-sized buffers 3239 * worth of data is if the file is small. We do not try to 3240 * allocate a fragment (from the small-data zone) at the end of 3241 * an otherwise large file as this can lead to wildly separated 3242 * data. 3243 */ 3244 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 3245 KKASSERT(bio->bio_offset < ip->ino_data.size); 3246 if (bio->bio_offset || ip->ino_data.size > HAMMER_HBUFSIZE) 3247 bytes = bp->b_bufsize; 3248 else 3249 bytes = HAMMER_DATA_DOALIGN_WITH(int, ip->ino_data.size); 3250 3251 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 3252 bytes, &error); 3253 3254 /* 3255 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated 3256 * in hammer_vop_write(). We must flag the record so the proper 3257 * REDO_TERM_WRITE entry is generated during the flush. 3258 */ 3259 if (record) { 3260 if (bp->b_flags & B_VFSFLAG1) { 3261 record->flags |= HAMMER_RECF_REDO; 3262 bp->b_flags &= ~B_VFSFLAG1; 3263 } 3264 if (record->flags & HAMMER_RECF_DEDUPED) { 3265 bp->b_resid = 0; 3266 hammer_ip_replace_bulk(hmp, record); 3267 biodone(ap->a_bio); 3268 } else { 3269 hammer_io_direct_write(hmp, bio, record); 3270 } 3271 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 3272 hammer_flush_inode(ip, 0); 3273 } else { 3274 bp->b_bio2.bio_offset = NOOFFSET; 3275 bp->b_error = error; 3276 bp->b_flags |= B_ERROR; 3277 biodone(ap->a_bio); 3278 } 3279 lwkt_reltoken(&hmp->fs_token); 3280 return(error); 3281 } 3282 3283 /* 3284 * dounlink - disconnect a directory entry 3285 * 3286 * XXX whiteout support not really in yet 3287 */ 3288 static int 3289 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 3290 struct vnode *dvp, struct ucred *cred, 3291 int flags, int isdir) 3292 { 3293 struct namecache *ncp; 3294 hammer_inode_t dip; 3295 hammer_inode_t ip; 3296 hammer_mount_t hmp; 3297 struct hammer_cursor cursor; 3298 int64_t namekey; 3299 uint32_t max_iterations; 3300 int nlen, error; 3301 3302 /* 3303 * Calculate the namekey and setup the key range for the scan. This 3304 * works kinda like a chained hash table where the lower 32 bits 3305 * of the namekey synthesize the chain. 3306 * 3307 * The key range is inclusive of both key_beg and key_end. 3308 */ 3309 dip = VTOI(dvp); 3310 ncp = nch->ncp; 3311 hmp = dip->hmp; 3312 3313 if (dip->flags & HAMMER_INODE_RO) 3314 return (EROFS); 3315 3316 namekey = hammer_direntry_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3317 &max_iterations); 3318 retry: 3319 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3320 cursor.key_beg.localization = dip->obj_localization | 3321 hammer_dir_localization(dip); 3322 cursor.key_beg.obj_id = dip->obj_id; 3323 cursor.key_beg.key = namekey; 3324 cursor.key_beg.create_tid = 0; 3325 cursor.key_beg.delete_tid = 0; 3326 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3327 cursor.key_beg.obj_type = 0; 3328 3329 cursor.key_end = cursor.key_beg; 3330 cursor.key_end.key += max_iterations; 3331 cursor.asof = dip->obj_asof; 3332 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3333 3334 /* 3335 * Scan all matching records (the chain), locate the one matching 3336 * the requested path component. info->last_error contains the 3337 * error code on search termination and could be 0, ENOENT, or 3338 * something else. 3339 * 3340 * The hammer_ip_*() functions merge in-memory records with on-disk 3341 * records for the purposes of the search. 3342 */ 3343 error = hammer_ip_first(&cursor); 3344 3345 while (error == 0) { 3346 error = hammer_ip_resolve_data(&cursor); 3347 if (error) 3348 break; 3349 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3350 KKASSERT(nlen > 0); 3351 if (ncp->nc_nlen == nlen && 3352 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3353 break; 3354 } 3355 error = hammer_ip_next(&cursor); 3356 } 3357 3358 /* 3359 * If all is ok we have to get the inode so we can adjust nlinks. 3360 * To avoid a deadlock with the flusher we must release the inode 3361 * lock on the directory when acquiring the inode for the entry. 3362 * 3363 * If the target is a directory, it must be empty. 3364 */ 3365 if (error == 0) { 3366 hammer_unlock(&cursor.ip->lock); 3367 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3368 hmp->asof, 3369 cursor.data->entry.localization, 3370 0, &error); 3371 hammer_lock_sh(&cursor.ip->lock); 3372 if (error == ENOENT) { 3373 hkprintf("WARNING: Removing dirent w/missing inode " 3374 "\"%s\"\n" 3375 "\tobj_id = %016jx\n", 3376 ncp->nc_name, 3377 (intmax_t)cursor.data->entry.obj_id); 3378 error = 0; 3379 } 3380 3381 /* 3382 * If isdir >= 0 we validate that the entry is or is not a 3383 * directory. If isdir < 0 we don't care. 3384 */ 3385 if (error == 0 && isdir >= 0 && ip) { 3386 if (isdir && 3387 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3388 error = ENOTDIR; 3389 } else if (isdir == 0 && 3390 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3391 error = EISDIR; 3392 } 3393 } 3394 3395 /* 3396 * If we are trying to remove a directory the directory must 3397 * be empty. 3398 * 3399 * The check directory code can loop and deadlock/retry. Our 3400 * own cursor's node locks must be released to avoid a 3-way 3401 * deadlock with the flusher if the check directory code 3402 * blocks. 3403 * 3404 * If any changes whatsoever have been made to the cursor 3405 * set EDEADLK and retry. 3406 * 3407 * WARNING: See warnings in hammer_unlock_cursor() 3408 * function. 3409 */ 3410 if (error == 0 && ip && ip->ino_data.obj_type == 3411 HAMMER_OBJTYPE_DIRECTORY) { 3412 hammer_unlock_cursor(&cursor); 3413 error = hammer_ip_check_directory_empty(trans, ip); 3414 hammer_lock_cursor(&cursor); 3415 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3416 hkprintf("Warning: avoided deadlock " 3417 "on rmdir '%s'\n", 3418 ncp->nc_name); 3419 error = EDEADLK; 3420 } 3421 } 3422 3423 /* 3424 * Delete the directory entry. 3425 * 3426 * WARNING: hammer_ip_del_direntry() may have to terminate 3427 * the cursor to avoid a deadlock. It is ok to call 3428 * hammer_done_cursor() twice. 3429 */ 3430 if (error == 0) { 3431 error = hammer_ip_del_direntry(trans, &cursor, 3432 dip, ip); 3433 } 3434 hammer_done_cursor(&cursor); 3435 if (error == 0) { 3436 /* 3437 * Tell the namecache that we are now unlinked. 3438 */ 3439 cache_unlink(nch); 3440 3441 /* 3442 * NOTE: ip->vp, if non-NULL, cannot be directly 3443 * referenced without formally acquiring the 3444 * vp since the vp might have zero refs on it, 3445 * or in the middle of a reclaim, etc. 3446 * 3447 * NOTE: The cache_setunresolved() can rip the vp 3448 * out from under us since the vp may not have 3449 * any refs, in which case ip->vp will be NULL 3450 * from the outset. 3451 */ 3452 while (ip && ip->vp) { 3453 struct vnode *vp; 3454 3455 error = hammer_get_vnode(ip, &vp); 3456 if (error == 0 && vp) { 3457 vn_unlock(vp); 3458 hammer_knote(ip->vp, NOTE_DELETE); 3459 #if 0 3460 /* 3461 * Don't do this, it can deadlock 3462 * on concurrent rm's of hardlinks. 3463 * Shouldn't be needed any more. 3464 */ 3465 cache_inval_vp(ip->vp, CINV_DESTROY); 3466 #endif 3467 vrele(vp); 3468 break; 3469 } 3470 hdkprintf("ip/vp race1 avoided\n"); 3471 } 3472 } 3473 if (ip) 3474 hammer_rel_inode(ip, 0); 3475 } else { 3476 hammer_done_cursor(&cursor); 3477 } 3478 if (error == EDEADLK) 3479 goto retry; 3480 3481 return (error); 3482 } 3483 3484 /************************************************************************ 3485 * FIFO AND SPECFS OPS * 3486 ************************************************************************ 3487 * 3488 */ 3489 static int 3490 hammer_vop_fifoclose (struct vop_close_args *ap) 3491 { 3492 /* XXX update itimes */ 3493 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3494 } 3495 3496 static int 3497 hammer_vop_fiforead (struct vop_read_args *ap) 3498 { 3499 int error; 3500 3501 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3502 /* XXX update access time */ 3503 return (error); 3504 } 3505 3506 static int 3507 hammer_vop_fifowrite (struct vop_write_args *ap) 3508 { 3509 int error; 3510 3511 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3512 /* XXX update access time */ 3513 return (error); 3514 } 3515 3516 static 3517 int 3518 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3519 { 3520 int error; 3521 3522 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3523 if (error) 3524 error = hammer_vop_kqfilter(ap); 3525 return(error); 3526 } 3527 3528 /************************************************************************ 3529 * KQFILTER OPS * 3530 ************************************************************************ 3531 * 3532 */ 3533 static void filt_hammerdetach(struct knote *kn); 3534 static int filt_hammerread(struct knote *kn, long hint); 3535 static int filt_hammerwrite(struct knote *kn, long hint); 3536 static int filt_hammervnode(struct knote *kn, long hint); 3537 3538 static struct filterops hammerread_filtops = 3539 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3540 NULL, filt_hammerdetach, filt_hammerread }; 3541 static struct filterops hammerwrite_filtops = 3542 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3543 NULL, filt_hammerdetach, filt_hammerwrite }; 3544 static struct filterops hammervnode_filtops = 3545 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3546 NULL, filt_hammerdetach, filt_hammervnode }; 3547 3548 static 3549 int 3550 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3551 { 3552 struct vnode *vp = ap->a_vp; 3553 struct knote *kn = ap->a_kn; 3554 3555 switch (kn->kn_filter) { 3556 case EVFILT_READ: 3557 kn->kn_fop = &hammerread_filtops; 3558 break; 3559 case EVFILT_WRITE: 3560 kn->kn_fop = &hammerwrite_filtops; 3561 break; 3562 case EVFILT_VNODE: 3563 kn->kn_fop = &hammervnode_filtops; 3564 break; 3565 default: 3566 return (EOPNOTSUPP); 3567 } 3568 3569 kn->kn_hook = (caddr_t)vp; 3570 3571 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3572 3573 return(0); 3574 } 3575 3576 static void 3577 filt_hammerdetach(struct knote *kn) 3578 { 3579 struct vnode *vp = (void *)kn->kn_hook; 3580 3581 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3582 } 3583 3584 static int 3585 filt_hammerread(struct knote *kn, long hint) 3586 { 3587 struct vnode *vp = (void *)kn->kn_hook; 3588 hammer_inode_t ip = VTOI(vp); 3589 hammer_mount_t hmp = ip->hmp; 3590 off_t off; 3591 3592 if (hint == NOTE_REVOKE) { 3593 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3594 return(1); 3595 } 3596 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3597 off = ip->ino_data.size - kn->kn_fp->f_offset; 3598 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3599 lwkt_reltoken(&hmp->fs_token); 3600 if (kn->kn_sfflags & NOTE_OLDAPI) 3601 return(1); 3602 return (kn->kn_data != 0); 3603 } 3604 3605 static int 3606 filt_hammerwrite(struct knote *kn, long hint) 3607 { 3608 if (hint == NOTE_REVOKE) 3609 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3610 kn->kn_data = 0; 3611 return (1); 3612 } 3613 3614 static int 3615 filt_hammervnode(struct knote *kn, long hint) 3616 { 3617 if (kn->kn_sfflags & hint) 3618 kn->kn_fflags |= hint; 3619 if (hint == NOTE_REVOKE) { 3620 kn->kn_flags |= (EV_EOF | EV_NODATA); 3621 return (1); 3622 } 3623 return (kn->kn_fflags != 0); 3624 } 3625 3626