1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 */ 34 35 #include <sys/mountctl.h> 36 #include <sys/namecache.h> 37 #include <sys/buf2.h> 38 #include <vfs/fifofs/fifo.h> 39 40 #include "hammer.h" 41 42 /* 43 * USERFS VNOPS 44 */ 45 static int hammer_vop_fsync(struct vop_fsync_args *); 46 static int hammer_vop_read(struct vop_read_args *); 47 static int hammer_vop_write(struct vop_write_args *); 48 static int hammer_vop_access(struct vop_access_args *); 49 static int hammer_vop_advlock(struct vop_advlock_args *); 50 static int hammer_vop_close(struct vop_close_args *); 51 static int hammer_vop_ncreate(struct vop_ncreate_args *); 52 static int hammer_vop_getattr(struct vop_getattr_args *); 53 static int hammer_vop_nresolve(struct vop_nresolve_args *); 54 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 55 static int hammer_vop_nlink(struct vop_nlink_args *); 56 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 57 static int hammer_vop_nmknod(struct vop_nmknod_args *); 58 static int hammer_vop_open(struct vop_open_args *); 59 static int hammer_vop_print(struct vop_print_args *); 60 static int hammer_vop_readdir(struct vop_readdir_args *); 61 static int hammer_vop_readlink(struct vop_readlink_args *); 62 static int hammer_vop_nremove(struct vop_nremove_args *); 63 static int hammer_vop_nrename(struct vop_nrename_args *); 64 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 65 static int hammer_vop_markatime(struct vop_markatime_args *); 66 static int hammer_vop_setattr(struct vop_setattr_args *); 67 static int hammer_vop_strategy(struct vop_strategy_args *); 68 static int hammer_vop_bmap(struct vop_bmap_args *ap); 69 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 70 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 71 static int hammer_vop_ioctl(struct vop_ioctl_args *); 72 static int hammer_vop_mountctl(struct vop_mountctl_args *); 73 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 74 75 static int hammer_vop_fifoclose (struct vop_close_args 
*); 76 static int hammer_vop_fiforead (struct vop_read_args *); 77 static int hammer_vop_fifowrite (struct vop_write_args *); 78 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 79 80 struct vop_ops hammer_vnode_vops = { 81 .vop_default = vop_defaultop, 82 .vop_fsync = hammer_vop_fsync, 83 .vop_getpages = vop_stdgetpages, 84 .vop_putpages = vop_stdputpages, 85 .vop_read = hammer_vop_read, 86 .vop_write = hammer_vop_write, 87 .vop_access = hammer_vop_access, 88 .vop_advlock = hammer_vop_advlock, 89 .vop_close = hammer_vop_close, 90 .vop_ncreate = hammer_vop_ncreate, 91 .vop_getattr = hammer_vop_getattr, 92 .vop_inactive = hammer_vop_inactive, 93 .vop_reclaim = hammer_vop_reclaim, 94 .vop_nresolve = hammer_vop_nresolve, 95 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 96 .vop_nlink = hammer_vop_nlink, 97 .vop_nmkdir = hammer_vop_nmkdir, 98 .vop_nmknod = hammer_vop_nmknod, 99 .vop_open = hammer_vop_open, 100 .vop_pathconf = vop_stdpathconf, 101 .vop_print = hammer_vop_print, 102 .vop_readdir = hammer_vop_readdir, 103 .vop_readlink = hammer_vop_readlink, 104 .vop_nremove = hammer_vop_nremove, 105 .vop_nrename = hammer_vop_nrename, 106 .vop_nrmdir = hammer_vop_nrmdir, 107 .vop_markatime = hammer_vop_markatime, 108 .vop_setattr = hammer_vop_setattr, 109 .vop_bmap = hammer_vop_bmap, 110 .vop_strategy = hammer_vop_strategy, 111 .vop_nsymlink = hammer_vop_nsymlink, 112 .vop_nwhiteout = hammer_vop_nwhiteout, 113 .vop_ioctl = hammer_vop_ioctl, 114 .vop_mountctl = hammer_vop_mountctl, 115 .vop_kqfilter = hammer_vop_kqfilter 116 }; 117 118 struct vop_ops hammer_spec_vops = { 119 .vop_default = vop_defaultop, 120 .vop_fsync = hammer_vop_fsync, 121 .vop_read = vop_stdnoread, 122 .vop_write = vop_stdnowrite, 123 .vop_access = hammer_vop_access, 124 .vop_close = hammer_vop_close, 125 .vop_markatime = hammer_vop_markatime, 126 .vop_getattr = hammer_vop_getattr, 127 .vop_inactive = hammer_vop_inactive, 128 .vop_reclaim = hammer_vop_reclaim, 129 .vop_setattr = 
hammer_vop_setattr 130 }; 131 132 struct vop_ops hammer_fifo_vops = { 133 .vop_default = fifo_vnoperate, 134 .vop_fsync = hammer_vop_fsync, 135 .vop_read = hammer_vop_fiforead, 136 .vop_write = hammer_vop_fifowrite, 137 .vop_access = hammer_vop_access, 138 .vop_close = hammer_vop_fifoclose, 139 .vop_markatime = hammer_vop_markatime, 140 .vop_getattr = hammer_vop_getattr, 141 .vop_inactive = hammer_vop_inactive, 142 .vop_reclaim = hammer_vop_reclaim, 143 .vop_setattr = hammer_vop_setattr, 144 .vop_kqfilter = hammer_vop_fifokqfilter 145 }; 146 147 static __inline 148 void 149 hammer_knote(struct vnode *vp, int flags) 150 { 151 if (flags) 152 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 153 } 154 155 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 156 struct vnode *dvp, struct ucred *cred, 157 int flags, int isdir); 158 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 159 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 160 161 /* 162 * hammer_vop_fsync { vp, waitfor } 163 * 164 * fsync() an inode to disk and wait for it to be completely committed 165 * such that the information would not be undone if a crash occured after 166 * return. 167 * 168 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 169 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 170 * operation. 171 * 172 * Ultimately the combination of a REDO log and use of fast storage 173 * to front-end cluster caches will make fsync fast, but it aint 174 * here yet. And, in anycase, we need real transactional 175 * all-or-nothing features which are not restricted to a single file. 
176 */ 177 static 178 int 179 hammer_vop_fsync(struct vop_fsync_args *ap) 180 { 181 hammer_inode_t ip = VTOI(ap->a_vp); 182 hammer_mount_t hmp = ip->hmp; 183 int waitfor = ap->a_waitfor; 184 int mode; 185 186 lwkt_gettoken(&hmp->fs_token); 187 188 /* 189 * Fsync rule relaxation (default is either full synchronous flush 190 * or REDO semantics with synchronous flush). 191 */ 192 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 193 switch(hammer_fsync_mode) { 194 case 0: 195 mode0: 196 /* no REDO, full synchronous flush */ 197 goto skip; 198 case 1: 199 mode1: 200 /* no REDO, full asynchronous flush */ 201 if (waitfor == MNT_WAIT) 202 waitfor = MNT_NOWAIT; 203 goto skip; 204 case 2: 205 /* REDO semantics, synchronous flush */ 206 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 207 goto mode0; 208 mode = HAMMER_FLUSH_UNDOS_AUTO; 209 break; 210 case 3: 211 /* REDO semantics, relaxed asynchronous flush */ 212 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 213 goto mode1; 214 mode = HAMMER_FLUSH_UNDOS_RELAXED; 215 if (waitfor == MNT_WAIT) 216 waitfor = MNT_NOWAIT; 217 break; 218 case 4: 219 /* ignore the fsync() system call */ 220 lwkt_reltoken(&hmp->fs_token); 221 return(0); 222 default: 223 /* we have to do something */ 224 mode = HAMMER_FLUSH_UNDOS_RELAXED; 225 if (waitfor == MNT_WAIT) 226 waitfor = MNT_NOWAIT; 227 break; 228 } 229 230 /* 231 * Fast fsync only needs to flush the UNDO/REDO fifo if 232 * HAMMER_INODE_REDO is non-zero and the only modifications 233 * made to the file are write or write-extends. 
234 */ 235 if ((ip->flags & HAMMER_INODE_REDO) && 236 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) { 237 ++hammer_count_fsyncs; 238 hammer_flusher_flush_undos(hmp, mode); 239 ip->redo_count = 0; 240 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 241 vclrisdirty(ip->vp); 242 lwkt_reltoken(&hmp->fs_token); 243 return(0); 244 } 245 246 /* 247 * REDO is enabled by fsync(), the idea being we really only 248 * want to lay down REDO records when programs are using 249 * fsync() heavily. The first fsync() on the file starts 250 * the gravy train going and later fsync()s keep it hot by 251 * resetting the redo_count. 252 * 253 * We weren't running REDOs before now so we have to fall 254 * through and do a full fsync of what we have. 255 */ 256 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 257 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 258 ip->flags |= HAMMER_INODE_REDO; 259 ip->redo_count = 0; 260 } 261 } 262 skip: 263 264 /* 265 * Do a full flush sequence. 266 * 267 * Attempt to release the vnode while waiting for the inode to 268 * finish flushing. This can really mess up inactive->reclaim 269 * sequences so only do it if the vnode is active. 270 * 271 * WARNING! The VX lock functions must be used. vn_lock() will 272 * fail when this is part of a VOP_RECLAIM sequence. 
273 */ 274 ++hammer_count_fsyncs; 275 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 276 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 277 if (waitfor == MNT_WAIT) { 278 int dorelock; 279 280 if ((ap->a_vp->v_flag & VRECLAIMED) == 0) { 281 vx_unlock(ap->a_vp); 282 dorelock = 1; 283 } else { 284 dorelock = 0; 285 } 286 hammer_wait_inode(ip); 287 if (dorelock) 288 vx_lock(ap->a_vp); 289 } 290 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 291 vclrisdirty(ip->vp); 292 lwkt_reltoken(&hmp->fs_token); 293 return (ip->error); 294 } 295 296 /* 297 * hammer_vop_read { vp, uio, ioflag, cred } 298 * 299 * MPSAFE (for the cache safe does not require fs_token) 300 */ 301 static 302 int 303 hammer_vop_read(struct vop_read_args *ap) 304 { 305 struct hammer_transaction trans; 306 hammer_inode_t ip; 307 hammer_mount_t hmp; 308 off_t offset; 309 struct buf *bp; 310 struct uio *uio; 311 int error; 312 int n; 313 int seqcount; 314 int ioseqcount; 315 int blksize; 316 int bigread; 317 int got_trans; 318 size_t resid; 319 320 if (ap->a_vp->v_type != VREG) 321 return (EINVAL); 322 ip = VTOI(ap->a_vp); 323 hmp = ip->hmp; 324 error = 0; 325 got_trans = 0; 326 uio = ap->a_uio; 327 328 /* 329 * Attempt to shortcut directly to the VM object using lwbufs. 330 * This is much faster than instantiating buffer cache buffers. 331 */ 332 resid = uio->uio_resid; 333 error = vop_helper_read_shortcut(ap); 334 hammer_stats_file_read += resid - uio->uio_resid; 335 if (error) 336 return (error); 337 if (uio->uio_resid == 0) 338 goto finished; 339 340 /* 341 * Allow the UIO's size to override the sequential heuristic. 342 */ 343 blksize = hammer_blocksize(uio->uio_offset); 344 seqcount = (uio->uio_resid + (MAXBSIZE - 1)) / MAXBSIZE; 345 ioseqcount = (ap->a_ioflag >> 16); 346 if (seqcount < ioseqcount) 347 seqcount = ioseqcount; 348 349 /* 350 * If reading or writing a huge amount of data we have to break 351 * atomicy and allow the operation to be interrupted by a signal 352 * or it can DOS the machine. 
353 */ 354 bigread = (uio->uio_resid > 100 * 1024 * 1024); 355 356 /* 357 * Access the data typically in HAMMER_BUFSIZE blocks via the 358 * buffer cache, but HAMMER may use a variable block size based 359 * on the offset. 360 * 361 * XXX Temporary hack, delay the start transaction while we remain 362 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 363 * locked-shared. 364 */ 365 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 366 int64_t base_offset; 367 int64_t file_limit; 368 369 blksize = hammer_blocksize(uio->uio_offset); 370 offset = (int)uio->uio_offset & (blksize - 1); 371 base_offset = uio->uio_offset - offset; 372 373 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 374 break; 375 376 /* 377 * MPSAFE 378 */ 379 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 380 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 381 bp->b_flags &= ~B_AGE; 382 error = 0; 383 goto skip; 384 } 385 if (ap->a_ioflag & IO_NRDELAY) { 386 bqrelse(bp); 387 return (EWOULDBLOCK); 388 } 389 390 /* 391 * MPUNSAFE 392 */ 393 if (got_trans == 0) { 394 hammer_start_transaction(&trans, ip->hmp); 395 got_trans = 1; 396 } 397 398 /* 399 * NOTE: A valid bp has already been acquired, but was not 400 * B_CACHE. 401 */ 402 if (hammer_cluster_enable) { 403 /* 404 * Use file_limit to prevent cluster_read() from 405 * creating buffers of the wrong block size past 406 * the demarc. 
407 */ 408 file_limit = ip->ino_data.size; 409 if (base_offset < HAMMER_XDEMARC && 410 file_limit > HAMMER_XDEMARC) { 411 file_limit = HAMMER_XDEMARC; 412 } 413 error = cluster_readx(ap->a_vp, 414 file_limit, base_offset, 415 blksize, B_NOTMETA, 416 uio->uio_resid, 417 seqcount * MAXBSIZE, 418 &bp); 419 } else { 420 error = breadnx(ap->a_vp, base_offset, 421 blksize, B_NOTMETA, 422 NULL, NULL, 0, &bp); 423 } 424 if (error) { 425 brelse(bp); 426 break; 427 } 428 skip: 429 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IOISSUED)) { 430 hdkprintf("zone2_offset %016jx read file %016jx@%016jx\n", 431 (intmax_t)bp->b_bio2.bio_offset, 432 (intmax_t)ip->obj_id, 433 (intmax_t)bp->b_loffset); 434 } 435 bp->b_flags &= ~B_IOISSUED; 436 if (blksize == HAMMER_XBUFSIZE) 437 bp->b_flags |= B_CLUSTEROK; 438 439 n = blksize - offset; 440 if (n > uio->uio_resid) 441 n = uio->uio_resid; 442 if (n > ip->ino_data.size - uio->uio_offset) 443 n = (int)(ip->ino_data.size - uio->uio_offset); 444 445 /* 446 * Set B_AGE, data has a lower priority than meta-data. 447 * 448 * Use a hold/unlock/drop sequence to run the uiomove 449 * with the buffer unlocked, avoiding deadlocks against 450 * read()s on mmap()'d spaces. 451 */ 452 bp->b_flags |= B_AGE; 453 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio); 454 bqrelse(bp); 455 456 if (error) 457 break; 458 hammer_stats_file_read += n; 459 } 460 461 finished: 462 463 /* 464 * Try to update the atime with just the inode lock for maximum 465 * concurrency. If we can't shortcut it we have to get the full 466 * blown transaction. 
467 */ 468 if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) { 469 hammer_start_transaction(&trans, ip->hmp); 470 got_trans = 1; 471 } 472 473 if (got_trans) { 474 if ((ip->flags & HAMMER_INODE_RO) == 0 && 475 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 476 lwkt_gettoken(&hmp->fs_token); 477 ip->ino_data.atime = trans.time; 478 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 479 hammer_done_transaction(&trans); 480 lwkt_reltoken(&hmp->fs_token); 481 } else { 482 hammer_done_transaction(&trans); 483 } 484 } 485 return (error); 486 } 487 488 /* 489 * hammer_vop_write { vp, uio, ioflag, cred } 490 */ 491 static 492 int 493 hammer_vop_write(struct vop_write_args *ap) 494 { 495 struct hammer_transaction trans; 496 hammer_inode_t ip; 497 hammer_mount_t hmp; 498 thread_t td; 499 struct uio *uio; 500 int offset; 501 off_t base_offset; 502 int64_t cluster_eof; 503 struct buf *bp; 504 int kflags; 505 int error; 506 int n; 507 int flags; 508 int seqcount; 509 int bigwrite; 510 511 if (ap->a_vp->v_type != VREG) 512 return (EINVAL); 513 ip = VTOI(ap->a_vp); 514 hmp = ip->hmp; 515 error = 0; 516 kflags = 0; 517 seqcount = ap->a_ioflag >> 16; 518 519 if (ip->flags & HAMMER_INODE_RO) 520 return (EROFS); 521 522 /* 523 * Create a transaction to cover the operations we perform. 524 */ 525 hammer_start_transaction(&trans, hmp); 526 uio = ap->a_uio; 527 528 /* 529 * Check append mode 530 */ 531 if (ap->a_ioflag & IO_APPEND) 532 uio->uio_offset = ip->ino_data.size; 533 534 /* 535 * Check for illegal write offsets. Valid range is 0...2^63-1. 536 * 537 * NOTE: the base_off assignment is required to work around what 538 * I consider to be a GCC-4 optimization bug. 
539 */ 540 if (uio->uio_offset < 0) { 541 hammer_done_transaction(&trans); 542 return (EFBIG); 543 } 544 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 545 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 546 hammer_done_transaction(&trans); 547 return (EFBIG); 548 } 549 550 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 551 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 552 hammer_done_transaction(&trans); 553 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 554 return (EFBIG); 555 } 556 557 /* 558 * If reading or writing a huge amount of data we have to break 559 * atomicy and allow the operation to be interrupted by a signal 560 * or it can DOS the machine. 561 * 562 * Preset redo_count so we stop generating REDOs earlier if the 563 * limit is exceeded. 564 * 565 * redo_count is heuristical, SMP races are ok 566 */ 567 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 568 if ((ip->flags & HAMMER_INODE_REDO) && 569 ip->redo_count < hammer_limit_redo) { 570 ip->redo_count += uio->uio_resid; 571 } 572 573 /* 574 * Access the data typically in HAMMER_BUFSIZE blocks via the 575 * buffer cache, but HAMMER may use a variable block size based 576 * on the offset. 577 */ 578 while (uio->uio_resid > 0) { 579 int fixsize = 0; 580 int blksize; 581 int blkmask; 582 int trivial; 583 int endofblk; 584 off_t nsize; 585 586 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 587 break; 588 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 589 break; 590 591 blksize = hammer_blocksize(uio->uio_offset); 592 593 /* 594 * Control the number of pending records associated with 595 * this inode. If too many have accumulated start a 596 * flush. Try to maintain a pipeline with the flusher. 597 * 598 * NOTE: It is possible for other sources to grow the 599 * records but not necessarily issue another flush, 600 * so use a timeout and ensure that a re-flush occurs. 
601 */ 602 if (ip->rsv_recs >= hammer_limit_inode_recs) { 603 lwkt_gettoken(&hmp->fs_token); 604 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 605 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 606 ip->flags |= HAMMER_INODE_RECSW; 607 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 608 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 609 } 610 lwkt_reltoken(&hmp->fs_token); 611 } 612 613 /* 614 * Do not allow HAMMER to blow out the buffer cache. Very 615 * large UIOs can lockout other processes due to bwillwrite() 616 * mechanics. 617 * 618 * The hammer inode is not locked during these operations. 619 * The vnode is locked which can interfere with the pageout 620 * daemon for non-UIO_NOCOPY writes but should not interfere 621 * with the buffer cache. Even so, we cannot afford to 622 * allow the pageout daemon to build up too many dirty buffer 623 * cache buffers. 624 * 625 * Only call this if we aren't being recursively called from 626 * a virtual disk device (vn), else we may deadlock. 627 */ 628 if ((ap->a_ioflag & IO_RECURSE) == 0) 629 bwillwrite(blksize); 630 631 /* 632 * Calculate the blocksize at the current offset and figure 633 * out how much we can actually write. 634 */ 635 blkmask = blksize - 1; 636 offset = (int)uio->uio_offset & blkmask; 637 base_offset = uio->uio_offset & ~(int64_t)blkmask; 638 n = blksize - offset; 639 if (n > uio->uio_resid) { 640 n = uio->uio_resid; 641 endofblk = 0; 642 } else { 643 endofblk = 1; 644 } 645 nsize = uio->uio_offset + n; 646 if (nsize > ip->ino_data.size) { 647 if (uio->uio_offset > ip->ino_data.size) 648 trivial = 0; 649 else 650 trivial = 1; 651 nvextendbuf(ap->a_vp, 652 ip->ino_data.size, 653 nsize, 654 hammer_blocksize(ip->ino_data.size), 655 hammer_blocksize(nsize), 656 hammer_blockoff(ip->ino_data.size), 657 hammer_blockoff(nsize), 658 trivial); 659 fixsize = 1; 660 kflags |= NOTE_EXTEND; 661 } 662 663 if (uio->uio_segflg == UIO_NOCOPY) { 664 /* 665 * Issuing a write with the same data backing the 666 * buffer. 
Instantiate the buffer to collect the 667 * backing vm pages, then read-in any missing bits. 668 * 669 * This case is used by vop_stdputpages(). 670 */ 671 bp = getblk(ap->a_vp, base_offset, 672 blksize, GETBLK_BHEAVY, 0); 673 if ((bp->b_flags & B_CACHE) == 0) { 674 bqrelse(bp); 675 error = bread(ap->a_vp, base_offset, 676 blksize, &bp); 677 } 678 } else if (offset == 0 && uio->uio_resid >= blksize) { 679 /* 680 * Even though we are entirely overwriting the buffer 681 * we may still have to zero it out to avoid a 682 * mmap/write visibility issue. 683 */ 684 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 685 if ((bp->b_flags & B_CACHE) == 0) 686 vfs_bio_clrbuf(bp); 687 } else if (base_offset >= ip->ino_data.size) { 688 /* 689 * If the base offset of the buffer is beyond the 690 * file EOF, we don't have to issue a read. 691 */ 692 bp = getblk(ap->a_vp, base_offset, 693 blksize, GETBLK_BHEAVY, 0); 694 vfs_bio_clrbuf(bp); 695 } else { 696 /* 697 * Partial overwrite, read in any missing bits then 698 * replace the portion being written. 699 */ 700 error = bread(ap->a_vp, base_offset, blksize, &bp); 701 if (error == 0) 702 bheavy(bp); 703 } 704 if (error == 0) 705 error = uiomovebp(bp, bp->b_data + offset, n, uio); 706 707 lwkt_gettoken(&hmp->fs_token); 708 709 /* 710 * Generate REDO records if enabled and redo_count will not 711 * exceeded the limit. 712 * 713 * If redo_count exceeds the limit we stop generating records 714 * and clear HAMMER_INODE_REDO. This will cause the next 715 * fsync() to do a full meta-data sync instead of just an 716 * UNDO/REDO fifo update. 717 * 718 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 719 * will still be tracked. The tracks will be terminated 720 * when the related meta-data (including possible data 721 * modifications which are not tracked via REDO) is 722 * flushed. 
723 */ 724 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 725 if (ip->redo_count < hammer_limit_redo) { 726 bp->b_flags |= B_VFSFLAG1; 727 error = hammer_generate_redo(&trans, ip, 728 base_offset + offset, 729 HAMMER_REDO_WRITE, 730 bp->b_data + offset, 731 (size_t)n); 732 } else { 733 ip->flags &= ~HAMMER_INODE_REDO; 734 } 735 } 736 737 /* 738 * If we screwed up we have to undo any VM size changes we 739 * made. 740 */ 741 if (error) { 742 brelse(bp); 743 if (fixsize) { 744 nvtruncbuf(ap->a_vp, ip->ino_data.size, 745 hammer_blocksize(ip->ino_data.size), 746 hammer_blockoff(ip->ino_data.size), 747 0); 748 } 749 lwkt_reltoken(&hmp->fs_token); 750 break; 751 } 752 kflags |= NOTE_WRITE; 753 hammer_stats_file_write += n; 754 if (blksize == HAMMER_XBUFSIZE) 755 bp->b_flags |= B_CLUSTEROK; 756 if (ip->ino_data.size < uio->uio_offset) { 757 ip->ino_data.size = uio->uio_offset; 758 flags = HAMMER_INODE_SDIRTY; 759 } else { 760 flags = 0; 761 } 762 ip->ino_data.mtime = trans.time; 763 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 764 hammer_modify_inode(&trans, ip, flags); 765 766 /* 767 * Once we dirty the buffer any cached zone-X offset 768 * becomes invalid. HAMMER NOTE: no-history mode cannot 769 * allow overwriting over the same data sector unless 770 * we provide UNDOs for the old data, which we don't. 771 */ 772 bp->b_bio2.bio_offset = NOOFFSET; 773 774 lwkt_reltoken(&hmp->fs_token); 775 776 /* 777 * Final buffer disposition. 778 * 779 * Because meta-data updates are deferred, HAMMER is 780 * especially sensitive to excessive bdwrite()s because 781 * the I/O stream is not broken up by disk reads. So the 782 * buffer cache simply cannot keep up. 783 * 784 * WARNING! blksize is variable. cluster_write() is 785 * expected to not blow up if it encounters 786 * buffers that do not match the passed blksize. 787 * 788 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 789 * The ip->rsv_recs check should burst-flush the data. 
790 * If we queue it immediately the buf could be left 791 * locked on the device queue for a very long time. 792 * 793 * However, failing to flush a dirty buffer out when 794 * issued from the pageout daemon can result in a low 795 * memory deadlock against bio_page_alloc(), so we 796 * have to bawrite() on IO_ASYNC as well. 797 * 798 * NOTE! To avoid degenerate stalls due to mismatched block 799 * sizes we only honor IO_DIRECT on the write which 800 * abuts the end of the buffer. However, we must 801 * honor IO_SYNC in case someone is silly enough to 802 * configure a HAMMER file as swap, or when HAMMER 803 * is serving NFS (for commits). Ick ick. 804 */ 805 bp->b_flags |= B_AGE; 806 if (blksize == HAMMER_XBUFSIZE) 807 bp->b_flags |= B_CLUSTEROK; 808 809 if (ap->a_ioflag & IO_SYNC) { 810 bwrite(bp); 811 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 812 bawrite(bp); 813 } else if (ap->a_ioflag & IO_ASYNC) { 814 bawrite(bp); 815 } else if (hammer_cluster_enable && 816 !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 817 if (base_offset < HAMMER_XDEMARC) 818 cluster_eof = hammer_blockdemarc(base_offset, 819 ip->ino_data.size); 820 else 821 cluster_eof = ip->ino_data.size; 822 cluster_write(bp, cluster_eof, blksize, seqcount); 823 } else { 824 bdwrite(bp); 825 } 826 } 827 hammer_done_transaction(&trans); 828 hammer_knote(ap->a_vp, kflags); 829 830 return (error); 831 } 832 833 /* 834 * hammer_vop_access { vp, mode, cred } 835 * 836 * MPSAFE - does not require fs_token 837 */ 838 static 839 int 840 hammer_vop_access(struct vop_access_args *ap) 841 { 842 hammer_inode_t ip = VTOI(ap->a_vp); 843 uid_t uid; 844 gid_t gid; 845 int error; 846 847 uid = hammer_to_unix_xid(&ip->ino_data.uid); 848 gid = hammer_to_unix_xid(&ip->ino_data.gid); 849 850 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 851 ip->ino_data.uflags); 852 return (error); 853 } 854 855 /* 856 * hammer_vop_advlock { vp, id, op, fl, flags } 857 * 858 * MPSAFE - does not require fs_token 
859 */ 860 static 861 int 862 hammer_vop_advlock(struct vop_advlock_args *ap) 863 { 864 hammer_inode_t ip = VTOI(ap->a_vp); 865 866 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 867 } 868 869 /* 870 * hammer_vop_close { vp, fflag } 871 * 872 * We can only sync-on-close for normal closes. XXX disabled for now. 873 */ 874 static 875 int 876 hammer_vop_close(struct vop_close_args *ap) 877 { 878 #if 0 879 struct vnode *vp = ap->a_vp; 880 hammer_inode_t ip = VTOI(vp); 881 int waitfor; 882 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 883 if (vn_islocked(vp) == LK_EXCLUSIVE && 884 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 885 if (ip->flags & HAMMER_INODE_CLOSESYNC) 886 waitfor = MNT_WAIT; 887 else 888 waitfor = MNT_NOWAIT; 889 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 890 HAMMER_INODE_CLOSEASYNC); 891 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 892 } 893 } 894 #endif 895 return (vop_stdclose(ap)); 896 } 897 898 /* 899 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 900 * 901 * The operating system has already ensured that the directory entry 902 * does not exist and done all appropriate namespace locking. 903 */ 904 static 905 int 906 hammer_vop_ncreate(struct vop_ncreate_args *ap) 907 { 908 struct hammer_transaction trans; 909 hammer_inode_t dip; 910 hammer_inode_t nip; 911 struct nchandle *nch; 912 hammer_mount_t hmp; 913 int error; 914 915 nch = ap->a_nch; 916 dip = VTOI(ap->a_dvp); 917 hmp = dip->hmp; 918 919 if (dip->flags & HAMMER_INODE_RO) 920 return (EROFS); 921 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 922 return (error); 923 924 /* 925 * Create a transaction to cover the operations we perform. 926 */ 927 lwkt_gettoken(&hmp->fs_token); 928 hammer_start_transaction(&trans, hmp); 929 930 /* 931 * Create a new filesystem object of the requested type. The 932 * returned inode will be referenced and shared-locked to prevent 933 * it from being moved to the flusher. 
934 */ 935 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 936 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 937 NULL, &nip); 938 if (error) { 939 hkprintf("hammer_create_inode error %d\n", error); 940 hammer_done_transaction(&trans); 941 *ap->a_vpp = NULL; 942 lwkt_reltoken(&hmp->fs_token); 943 return (error); 944 } 945 946 /* 947 * Add the new filesystem object to the directory. This will also 948 * bump the inode's link count. 949 */ 950 error = hammer_ip_add_direntry(&trans, dip, 951 nch->ncp->nc_name, nch->ncp->nc_nlen, 952 nip); 953 if (error) 954 hkprintf("hammer_ip_add_direntry error %d\n", error); 955 956 /* 957 * Finish up. 958 */ 959 if (error) { 960 hammer_rel_inode(nip, 0); 961 hammer_done_transaction(&trans); 962 *ap->a_vpp = NULL; 963 } else { 964 error = hammer_get_vnode(nip, ap->a_vpp); 965 hammer_done_transaction(&trans); 966 hammer_rel_inode(nip, 0); 967 if (error == 0) { 968 cache_setunresolved(ap->a_nch); 969 cache_setvp(ap->a_nch, *ap->a_vpp); 970 } 971 hammer_knote(ap->a_dvp, NOTE_WRITE); 972 } 973 lwkt_reltoken(&hmp->fs_token); 974 return (error); 975 } 976 977 /* 978 * hammer_vop_getattr { vp, vap } 979 * 980 * Retrieve an inode's attribute information. When accessing inodes 981 * historically we fake the atime field to ensure consistent results. 982 * The atime field is stored in the B-Tree element and allowed to be 983 * updated without cycling the element. 984 * 985 * MPSAFE - does not require fs_token 986 */ 987 static 988 int 989 hammer_vop_getattr(struct vop_getattr_args *ap) 990 { 991 hammer_inode_t ip = VTOI(ap->a_vp); 992 struct vattr *vap = ap->a_vap; 993 994 /* 995 * We want the fsid to be different when accessing a filesystem 996 * with different as-of's so programs like diff don't think 997 * the files are the same. 998 * 999 * We also want the fsid to be the same when comparing snapshots, 1000 * or when comparing mirrors (which might be backed by different 1001 * physical devices). 
HAMMER fsids are based on the PFS's 1002 * shared_uuid field. 1003 * 1004 * XXX there is a chance of collision here. The va_fsid reported 1005 * by stat is different from the more involved fsid used in the 1006 * mount structure. 1007 */ 1008 hammer_lock_sh(&ip->lock); 1009 vap->va_fsid = ip->pfsm->fsid_udev ^ (uint32_t)ip->obj_asof ^ 1010 (uint32_t)(ip->obj_asof >> 32); 1011 1012 vap->va_fileid = ip->ino_leaf.base.obj_id; 1013 vap->va_mode = ip->ino_data.mode; 1014 vap->va_nlink = ip->ino_data.nlinks; 1015 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1016 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1017 vap->va_rmajor = 0; 1018 vap->va_rminor = 0; 1019 vap->va_size = ip->ino_data.size; 1020 1021 /* 1022 * Special case for @@PFS softlinks. The actual size of the 1023 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1024 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1025 * 1026 * Note that userspace hammer command does not allow users to 1027 * create a @@PFS softlink under an existing other PFS (id!=0) 1028 * so the ip localization here for @@PFS softlink is always 0. 1029 */ 1030 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1031 ip->ino_data.size == 10 && 1032 ip->obj_asof == HAMMER_MAX_TID && 1033 ip->obj_localization == HAMMER_DEF_LOCALIZATION && 1034 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1035 if (hammer_is_pfs_slave(&ip->pfsm->pfsd)) 1036 vap->va_size = 26; 1037 else 1038 vap->va_size = 10; 1039 } 1040 1041 /* 1042 * We must provide a consistent atime and mtime for snapshots 1043 * so people can do a 'tar cf - ... | md5' on them and get 1044 * consistent results. 
1045 */ 1046 if (ip->flags & HAMMER_INODE_RO) { 1047 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1048 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1049 } else { 1050 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1051 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1052 } 1053 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1054 vap->va_flags = ip->ino_data.uflags; 1055 vap->va_gen = 1; /* hammer inums are unique for all time */ 1056 vap->va_blocksize = HAMMER_BUFSIZE; 1057 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1058 vap->va_bytes = HAMMER_XBUFSIZE64_DOALIGN(ip->ino_data.size); 1059 } else if (ip->ino_data.size > HAMMER_HBUFSIZE) { 1060 vap->va_bytes = HAMMER_BUFSIZE64_DOALIGN(ip->ino_data.size); 1061 } else { 1062 vap->va_bytes = HAMMER_DATA_DOALIGN(ip->ino_data.size); 1063 } 1064 1065 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1066 vap->va_filerev = 0; /* XXX */ 1067 vap->va_uid_uuid = ip->ino_data.uid; 1068 vap->va_gid_uuid = ip->ino_data.gid; 1069 vap->va_fsid_uuid = ip->hmp->fsid; 1070 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1071 VA_FSID_UUID_VALID; 1072 1073 switch (ip->ino_data.obj_type) { 1074 case HAMMER_OBJTYPE_CDEV: 1075 case HAMMER_OBJTYPE_BDEV: 1076 vap->va_rmajor = ip->ino_data.rmajor; 1077 vap->va_rminor = ip->ino_data.rminor; 1078 break; 1079 default: 1080 break; 1081 } 1082 hammer_unlock(&ip->lock); 1083 return(0); 1084 } 1085 1086 /* 1087 * hammer_vop_nresolve { nch, dvp, cred } 1088 * 1089 * Locate the requested directory entry. 
1090 */ 1091 static 1092 int 1093 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1094 { 1095 struct hammer_transaction trans; 1096 struct namecache *ncp; 1097 hammer_mount_t hmp; 1098 hammer_inode_t dip; 1099 hammer_inode_t ip; 1100 hammer_tid_t asof; 1101 struct hammer_cursor cursor; 1102 struct vnode *vp; 1103 int64_t namekey; 1104 int error; 1105 int i; 1106 int nlen; 1107 int flags; 1108 int ispfs; 1109 int64_t obj_id; 1110 uint32_t localization; 1111 uint32_t max_iterations; 1112 1113 /* 1114 * Misc initialization, plus handle as-of name extensions. Look for 1115 * the '@@' extension. Note that as-of files and directories cannot 1116 * be modified. 1117 */ 1118 dip = VTOI(ap->a_dvp); 1119 ncp = ap->a_nch->ncp; 1120 asof = dip->obj_asof; 1121 localization = dip->obj_localization; /* for code consistency */ 1122 nlen = ncp->nc_nlen; 1123 flags = dip->flags & HAMMER_INODE_RO; 1124 ispfs = 0; 1125 hmp = dip->hmp; 1126 1127 lwkt_gettoken(&hmp->fs_token); 1128 hammer_simple_transaction(&trans, hmp); 1129 1130 for (i = 0; i < nlen; ++i) { 1131 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1132 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1133 &ispfs, &asof, &localization); 1134 if (error != 0) { 1135 i = nlen; 1136 break; 1137 } 1138 if (asof != HAMMER_MAX_TID) 1139 flags |= HAMMER_INODE_RO; 1140 break; 1141 } 1142 } 1143 nlen = i; 1144 1145 /* 1146 * If this is a PFS we dive into the PFS root inode 1147 */ 1148 if (ispfs && nlen == 0) { 1149 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1150 asof, localization, 1151 flags, &error); 1152 if (error == 0) { 1153 error = hammer_get_vnode(ip, &vp); 1154 hammer_rel_inode(ip, 0); 1155 } else { 1156 vp = NULL; 1157 } 1158 if (error == 0) { 1159 vn_unlock(vp); 1160 cache_setvp(ap->a_nch, vp); 1161 vrele(vp); 1162 } 1163 goto done; 1164 } 1165 1166 /* 1167 * If there is no path component the time extension is relative to dip. 1168 * e.g. "fubar/@@<snapshot>" 1169 * 1170 * "." 
is handled by the kernel, but ".@@<snapshot>" is not. 1171 * e.g. "fubar/.@@<snapshot>" 1172 * 1173 * ".." is handled by the kernel. We do not currently handle 1174 * "..@<snapshot>". 1175 */ 1176 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1177 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1178 asof, dip->obj_localization, 1179 flags, &error); 1180 if (error == 0) { 1181 error = hammer_get_vnode(ip, &vp); 1182 hammer_rel_inode(ip, 0); 1183 } else { 1184 vp = NULL; 1185 } 1186 if (error == 0) { 1187 vn_unlock(vp); 1188 cache_setvp(ap->a_nch, vp); 1189 vrele(vp); 1190 } 1191 goto done; 1192 } 1193 1194 /* 1195 * Calculate the namekey and setup the key range for the scan. This 1196 * works kinda like a chained hash table where the lower 32 bits 1197 * of the namekey synthesize the chain. 1198 * 1199 * The key range is inclusive of both key_beg and key_end. 1200 */ 1201 namekey = hammer_direntry_namekey(dip, ncp->nc_name, nlen, 1202 &max_iterations); 1203 1204 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1205 cursor.key_beg.localization = dip->obj_localization | 1206 hammer_dir_localization(dip); 1207 cursor.key_beg.obj_id = dip->obj_id; 1208 cursor.key_beg.key = namekey; 1209 cursor.key_beg.create_tid = 0; 1210 cursor.key_beg.delete_tid = 0; 1211 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1212 cursor.key_beg.obj_type = 0; 1213 1214 cursor.key_end = cursor.key_beg; 1215 cursor.key_end.key += max_iterations; 1216 cursor.asof = asof; 1217 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1218 1219 /* 1220 * Scan all matching records (the chain), locate the one matching 1221 * the requested path component. 1222 * 1223 * The hammer_ip_*() functions merge in-memory records with on-disk 1224 * records for the purposes of the search. 
1225 */ 1226 obj_id = 0; 1227 localization = HAMMER_DEF_LOCALIZATION; 1228 1229 if (error == 0) { 1230 error = hammer_ip_first(&cursor); 1231 while (error == 0) { 1232 error = hammer_ip_resolve_data(&cursor); 1233 if (error) 1234 break; 1235 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1236 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1237 obj_id = cursor.data->entry.obj_id; 1238 localization = cursor.data->entry.localization; 1239 break; 1240 } 1241 error = hammer_ip_next(&cursor); 1242 } 1243 } 1244 hammer_done_cursor(&cursor); 1245 1246 /* 1247 * Lookup the obj_id. This should always succeed. If it does not 1248 * the filesystem may be damaged and we return a dummy inode. 1249 */ 1250 if (error == 0) { 1251 ip = hammer_get_inode(&trans, dip, obj_id, 1252 asof, localization, 1253 flags, &error); 1254 if (error == ENOENT) { 1255 hkprintf("WARNING: Missing inode for dirent \"%s\"\n" 1256 "\tobj_id = %016jx, asof=%016jx, lo=%08x\n", 1257 ncp->nc_name, 1258 (intmax_t)obj_id, (intmax_t)asof, 1259 localization); 1260 error = 0; 1261 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1262 asof, localization, 1263 flags, &error); 1264 } 1265 if (error == 0) { 1266 error = hammer_get_vnode(ip, &vp); 1267 hammer_rel_inode(ip, 0); 1268 } else { 1269 vp = NULL; 1270 } 1271 if (error == 0) { 1272 vn_unlock(vp); 1273 cache_setvp(ap->a_nch, vp); 1274 vrele(vp); 1275 } 1276 } else if (error == ENOENT) { 1277 cache_setvp(ap->a_nch, NULL); 1278 } 1279 done: 1280 hammer_done_transaction(&trans); 1281 lwkt_reltoken(&hmp->fs_token); 1282 return (error); 1283 } 1284 1285 /* 1286 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1287 * 1288 * Locate the parent directory of a directory vnode. 1289 * 1290 * dvp is referenced but not locked. *vpp must be returned referenced and 1291 * locked. A parent_obj_id of 0 indicates that we are at the root. 1292 * 1293 * NOTE: as-of sequences are not linked into the directory structure. 
If 1294 * we are at the root with a different asof then the mount point, reload 1295 * the same directory with the mount point's asof. I'm not sure what this 1296 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1297 * get confused, but it hasn't been tested. 1298 */ 1299 static 1300 int 1301 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1302 { 1303 struct hammer_transaction trans; 1304 hammer_inode_t dip; 1305 hammer_inode_t ip; 1306 hammer_mount_t hmp; 1307 int64_t parent_obj_id; 1308 uint32_t parent_obj_localization; 1309 hammer_tid_t asof; 1310 int error; 1311 1312 dip = VTOI(ap->a_dvp); 1313 asof = dip->obj_asof; 1314 hmp = dip->hmp; 1315 1316 /* 1317 * Whos are parent? This could be the root of a pseudo-filesystem 1318 * whos parent is in another localization domain. 1319 */ 1320 lwkt_gettoken(&hmp->fs_token); 1321 parent_obj_id = dip->ino_data.parent_obj_id; 1322 if (dip->obj_id == HAMMER_OBJID_ROOT) 1323 parent_obj_localization = HAMMER_DEF_LOCALIZATION; 1324 else 1325 parent_obj_localization = dip->obj_localization; 1326 1327 /* 1328 * It's probably a PFS root when dip->ino_data.parent_obj_id is 0. 
1329 */ 1330 if (parent_obj_id == 0) { 1331 if (dip->obj_id == HAMMER_OBJID_ROOT && 1332 asof != hmp->asof) { 1333 parent_obj_id = dip->obj_id; 1334 asof = hmp->asof; 1335 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1336 ksnprintf(*ap->a_fakename, 19, "0x%016jx", 1337 (intmax_t)dip->obj_asof); 1338 } else { 1339 *ap->a_vpp = NULL; 1340 lwkt_reltoken(&hmp->fs_token); 1341 return ENOENT; 1342 } 1343 } 1344 1345 hammer_simple_transaction(&trans, hmp); 1346 1347 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1348 asof, parent_obj_localization, 1349 dip->flags, &error); 1350 if (ip) { 1351 error = hammer_get_vnode(ip, ap->a_vpp); 1352 hammer_rel_inode(ip, 0); 1353 } else { 1354 *ap->a_vpp = NULL; 1355 } 1356 hammer_done_transaction(&trans); 1357 lwkt_reltoken(&hmp->fs_token); 1358 return (error); 1359 } 1360 1361 /* 1362 * hammer_vop_nlink { nch, dvp, vp, cred } 1363 */ 1364 static 1365 int 1366 hammer_vop_nlink(struct vop_nlink_args *ap) 1367 { 1368 struct hammer_transaction trans; 1369 hammer_inode_t dip; 1370 hammer_inode_t ip; 1371 struct nchandle *nch; 1372 hammer_mount_t hmp; 1373 int error; 1374 1375 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1376 return(EXDEV); 1377 1378 nch = ap->a_nch; 1379 dip = VTOI(ap->a_dvp); 1380 ip = VTOI(ap->a_vp); 1381 hmp = dip->hmp; 1382 1383 if (dip->obj_localization != ip->obj_localization) 1384 return(EXDEV); 1385 1386 if (dip->flags & HAMMER_INODE_RO) 1387 return (EROFS); 1388 if (ip->flags & HAMMER_INODE_RO) 1389 return (EROFS); 1390 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1391 return (error); 1392 1393 /* 1394 * Create a transaction to cover the operations we perform. 1395 */ 1396 lwkt_gettoken(&hmp->fs_token); 1397 hammer_start_transaction(&trans, hmp); 1398 1399 /* 1400 * Add the filesystem object to the directory. Note that neither 1401 * dip nor ip are referenced or locked, but their vnodes are 1402 * referenced. This function will bump the inode's link count. 
1403 */ 1404 error = hammer_ip_add_direntry(&trans, dip, 1405 nch->ncp->nc_name, nch->ncp->nc_nlen, 1406 ip); 1407 1408 /* 1409 * Finish up. 1410 */ 1411 if (error == 0) { 1412 cache_setunresolved(nch); 1413 cache_setvp(nch, ap->a_vp); 1414 } 1415 hammer_done_transaction(&trans); 1416 hammer_knote(ap->a_vp, NOTE_LINK); 1417 hammer_knote(ap->a_dvp, NOTE_WRITE); 1418 lwkt_reltoken(&hmp->fs_token); 1419 return (error); 1420 } 1421 1422 /* 1423 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1424 * 1425 * The operating system has already ensured that the directory entry 1426 * does not exist and done all appropriate namespace locking. 1427 */ 1428 static 1429 int 1430 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1431 { 1432 struct hammer_transaction trans; 1433 hammer_inode_t dip; 1434 hammer_inode_t nip; 1435 struct nchandle *nch; 1436 hammer_mount_t hmp; 1437 int error; 1438 1439 nch = ap->a_nch; 1440 dip = VTOI(ap->a_dvp); 1441 hmp = dip->hmp; 1442 1443 if (dip->flags & HAMMER_INODE_RO) 1444 return (EROFS); 1445 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1446 return (error); 1447 1448 /* 1449 * Create a transaction to cover the operations we perform. 1450 */ 1451 lwkt_gettoken(&hmp->fs_token); 1452 hammer_start_transaction(&trans, hmp); 1453 1454 /* 1455 * Create a new filesystem object of the requested type. The 1456 * returned inode will be referenced but not locked. 1457 */ 1458 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1459 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1460 NULL, &nip); 1461 if (error) { 1462 hammer_done_transaction(&trans); 1463 *ap->a_vpp = NULL; 1464 lwkt_reltoken(&hmp->fs_token); 1465 return (error); 1466 } 1467 /* 1468 * Add the new filesystem object to the directory. This will also 1469 * bump the inode's link count. 
1470 */ 1471 error = hammer_ip_add_direntry(&trans, dip, 1472 nch->ncp->nc_name, nch->ncp->nc_nlen, 1473 nip); 1474 if (error) 1475 hkprintf("hammer_mkdir (add) error %d\n", error); 1476 1477 /* 1478 * Finish up. 1479 */ 1480 if (error) { 1481 hammer_rel_inode(nip, 0); 1482 *ap->a_vpp = NULL; 1483 } else { 1484 error = hammer_get_vnode(nip, ap->a_vpp); 1485 hammer_rel_inode(nip, 0); 1486 if (error == 0) { 1487 cache_setunresolved(ap->a_nch); 1488 cache_setvp(ap->a_nch, *ap->a_vpp); 1489 } 1490 } 1491 hammer_done_transaction(&trans); 1492 if (error == 0) 1493 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1494 lwkt_reltoken(&hmp->fs_token); 1495 return (error); 1496 } 1497 1498 /* 1499 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1500 * 1501 * The operating system has already ensured that the directory entry 1502 * does not exist and done all appropriate namespace locking. 1503 */ 1504 static 1505 int 1506 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1507 { 1508 struct hammer_transaction trans; 1509 hammer_inode_t dip; 1510 hammer_inode_t nip; 1511 struct nchandle *nch; 1512 hammer_mount_t hmp; 1513 int error; 1514 1515 nch = ap->a_nch; 1516 dip = VTOI(ap->a_dvp); 1517 hmp = dip->hmp; 1518 1519 if (dip->flags & HAMMER_INODE_RO) 1520 return (EROFS); 1521 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1522 return (error); 1523 1524 /* 1525 * Create a transaction to cover the operations we perform. 1526 */ 1527 lwkt_gettoken(&hmp->fs_token); 1528 hammer_start_transaction(&trans, hmp); 1529 1530 /* 1531 * Create a new filesystem object of the requested type. The 1532 * returned inode will be referenced but not locked. 1533 * 1534 * If mknod specifies a directory a pseudo-fs is created. 
1535 */ 1536 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1537 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1538 NULL, &nip); 1539 if (error) { 1540 hammer_done_transaction(&trans); 1541 *ap->a_vpp = NULL; 1542 lwkt_reltoken(&hmp->fs_token); 1543 return (error); 1544 } 1545 1546 /* 1547 * Add the new filesystem object to the directory. This will also 1548 * bump the inode's link count. 1549 */ 1550 error = hammer_ip_add_direntry(&trans, dip, 1551 nch->ncp->nc_name, nch->ncp->nc_nlen, 1552 nip); 1553 1554 /* 1555 * Finish up. 1556 */ 1557 if (error) { 1558 hammer_rel_inode(nip, 0); 1559 *ap->a_vpp = NULL; 1560 } else { 1561 error = hammer_get_vnode(nip, ap->a_vpp); 1562 hammer_rel_inode(nip, 0); 1563 if (error == 0) { 1564 cache_setunresolved(ap->a_nch); 1565 cache_setvp(ap->a_nch, *ap->a_vpp); 1566 } 1567 } 1568 hammer_done_transaction(&trans); 1569 if (error == 0) 1570 hammer_knote(ap->a_dvp, NOTE_WRITE); 1571 lwkt_reltoken(&hmp->fs_token); 1572 return (error); 1573 } 1574 1575 /* 1576 * hammer_vop_open { vp, mode, cred, fp } 1577 * 1578 * MPSAFE (does not require fs_token) 1579 */ 1580 static 1581 int 1582 hammer_vop_open(struct vop_open_args *ap) 1583 { 1584 hammer_inode_t ip; 1585 1586 ip = VTOI(ap->a_vp); 1587 1588 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1589 return (EROFS); 1590 return(vop_stdopen(ap)); 1591 } 1592 1593 /* 1594 * hammer_vop_print { vp } 1595 */ 1596 static 1597 int 1598 hammer_vop_print(struct vop_print_args *ap) 1599 { 1600 return EOPNOTSUPP; 1601 } 1602 1603 /* 1604 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1605 */ 1606 static 1607 int 1608 hammer_vop_readdir(struct vop_readdir_args *ap) 1609 { 1610 struct hammer_transaction trans; 1611 struct hammer_cursor cursor; 1612 hammer_inode_t ip; 1613 hammer_mount_t hmp; 1614 struct uio *uio; 1615 hammer_base_elm_t base; 1616 int error; 1617 int cookie_index; 1618 int ncookies; 1619 off_t *cookies; 1620 off_t saveoff; 1621 int r; 
1622 int dtype; 1623 1624 ip = VTOI(ap->a_vp); 1625 uio = ap->a_uio; 1626 saveoff = uio->uio_offset; 1627 hmp = ip->hmp; 1628 1629 if (ap->a_ncookies) { 1630 ncookies = uio->uio_resid / 16 + 1; 1631 if (ncookies > 1024) 1632 ncookies = 1024; 1633 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1634 cookie_index = 0; 1635 } else { 1636 ncookies = -1; 1637 cookies = NULL; 1638 cookie_index = 0; 1639 } 1640 1641 lwkt_gettoken(&hmp->fs_token); 1642 hammer_simple_transaction(&trans, hmp); 1643 1644 /* 1645 * Handle artificial entries 1646 * 1647 * It should be noted that the minimum value for a directory 1648 * hash key on-media is 0x0000000100000000, so we can use anything 1649 * less then that to represent our 'special' key space. 1650 */ 1651 error = 0; 1652 if (saveoff == 0) { 1653 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1654 if (r) 1655 goto done; 1656 if (cookies) 1657 cookies[cookie_index] = saveoff; 1658 ++saveoff; 1659 ++cookie_index; 1660 if (cookie_index == ncookies) 1661 goto done; 1662 } 1663 if (saveoff == 1) { 1664 if (ip->ino_data.parent_obj_id) { 1665 r = vop_write_dirent(&error, uio, 1666 ip->ino_data.parent_obj_id, 1667 DT_DIR, 2, ".."); 1668 } else { 1669 r = vop_write_dirent(&error, uio, 1670 ip->obj_id, DT_DIR, 2, ".."); 1671 } 1672 if (r) 1673 goto done; 1674 if (cookies) 1675 cookies[cookie_index] = saveoff; 1676 ++saveoff; 1677 ++cookie_index; 1678 if (cookie_index == ncookies) 1679 goto done; 1680 } 1681 1682 /* 1683 * Key range (begin and end inclusive) to scan. Directory keys 1684 * directly translate to a 64 bit 'seek' position. 
1685 */ 1686 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1687 cursor.key_beg.localization = ip->obj_localization | 1688 hammer_dir_localization(ip); 1689 cursor.key_beg.obj_id = ip->obj_id; 1690 cursor.key_beg.create_tid = 0; 1691 cursor.key_beg.delete_tid = 0; 1692 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1693 cursor.key_beg.obj_type = 0; 1694 cursor.key_beg.key = saveoff; 1695 1696 cursor.key_end = cursor.key_beg; 1697 cursor.key_end.key = HAMMER_MAX_KEY; 1698 cursor.asof = ip->obj_asof; 1699 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1700 1701 error = hammer_ip_first(&cursor); 1702 1703 while (error == 0) { 1704 error = hammer_ip_resolve_data(&cursor); 1705 if (error) 1706 break; 1707 base = &cursor.leaf->base; 1708 saveoff = base->key; 1709 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1710 1711 if (base->obj_id != ip->obj_id) 1712 hpanic("bad record at %p", cursor.node); 1713 1714 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1715 r = vop_write_dirent( 1716 &error, uio, cursor.data->entry.obj_id, 1717 dtype, 1718 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1719 (void *)cursor.data->entry.name); 1720 if (r) 1721 break; 1722 ++saveoff; 1723 if (cookies) 1724 cookies[cookie_index] = base->key; 1725 ++cookie_index; 1726 if (cookie_index == ncookies) 1727 break; 1728 error = hammer_ip_next(&cursor); 1729 } 1730 hammer_done_cursor(&cursor); 1731 1732 done: 1733 hammer_done_transaction(&trans); 1734 1735 if (ap->a_eofflag) 1736 *ap->a_eofflag = (error == ENOENT); 1737 uio->uio_offset = saveoff; 1738 if (error && cookie_index == 0) { 1739 if (error == ENOENT) 1740 error = 0; 1741 if (cookies) { 1742 kfree(cookies, M_TEMP); 1743 *ap->a_ncookies = 0; 1744 *ap->a_cookies = NULL; 1745 } 1746 } else { 1747 if (error == ENOENT) 1748 error = 0; 1749 if (cookies) { 1750 *ap->a_ncookies = cookie_index; 1751 *ap->a_cookies = cookies; 1752 } 1753 } 1754 lwkt_reltoken(&hmp->fs_token); 1755 return(error); 1756 } 
1757 1758 /* 1759 * hammer_vop_readlink { vp, uio, cred } 1760 */ 1761 static 1762 int 1763 hammer_vop_readlink(struct vop_readlink_args *ap) 1764 { 1765 struct hammer_transaction trans; 1766 struct hammer_cursor cursor; 1767 hammer_inode_t ip; 1768 hammer_mount_t hmp; 1769 char buf[32]; 1770 uint32_t localization; 1771 hammer_pseudofs_inmem_t pfsm; 1772 int error; 1773 1774 ip = VTOI(ap->a_vp); 1775 hmp = ip->hmp; 1776 1777 lwkt_gettoken(&hmp->fs_token); 1778 1779 /* 1780 * Shortcut if the symlink data was stuffed into ino_data. 1781 * 1782 * Also expand special "@@PFS%05d" softlinks (expansion only 1783 * occurs for non-historical (current) accesses made from the 1784 * primary filesystem). 1785 * 1786 * Note that userspace hammer command does not allow users to 1787 * create a @@PFS softlink under an existing other PFS (id!=0) 1788 * so the ip localization here for @@PFS softlink is always 0. 1789 */ 1790 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1791 char *ptr; 1792 int bytes; 1793 1794 ptr = ip->ino_data.ext.symlink; 1795 bytes = (int)ip->ino_data.size; 1796 if (bytes == 10 && 1797 ip->obj_asof == HAMMER_MAX_TID && 1798 ip->obj_localization == HAMMER_DEF_LOCALIZATION && 1799 strncmp(ptr, "@@PFS", 5) == 0) { 1800 hammer_simple_transaction(&trans, hmp); 1801 bcopy(ptr + 5, buf, 5); 1802 buf[5] = 0; 1803 localization = pfs_to_lo(strtoul(buf, NULL, 10)); 1804 pfsm = hammer_load_pseudofs(&trans, localization, 1805 &error); 1806 if (error == 0) { 1807 if (hammer_is_pfs_slave(&pfsm->pfsd)) { 1808 /* vap->va_size == 26 */ 1809 ksnprintf(buf, sizeof(buf), 1810 "@@0x%016jx:%05d", 1811 (intmax_t)pfsm->pfsd.sync_end_tid, 1812 lo_to_pfs(localization)); 1813 } else { 1814 /* vap->va_size == 10 */ 1815 ksnprintf(buf, sizeof(buf), 1816 "@@-1:%05d", 1817 lo_to_pfs(localization)); 1818 } 1819 ptr = buf; 1820 bytes = strlen(buf); 1821 } 1822 if (pfsm) 1823 hammer_rel_pseudofs(hmp, pfsm); 1824 hammer_done_transaction(&trans); 1825 } 1826 error = uiomove(ptr, bytes, 
ap->a_uio); 1827 lwkt_reltoken(&hmp->fs_token); 1828 return(error); 1829 } 1830 1831 /* 1832 * Long version 1833 */ 1834 hammer_simple_transaction(&trans, hmp); 1835 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1836 1837 /* 1838 * Key range (begin and end inclusive) to scan. Directory keys 1839 * directly translate to a 64 bit 'seek' position. 1840 */ 1841 cursor.key_beg.localization = ip->obj_localization | 1842 HAMMER_LOCALIZE_MISC; 1843 cursor.key_beg.obj_id = ip->obj_id; 1844 cursor.key_beg.create_tid = 0; 1845 cursor.key_beg.delete_tid = 0; 1846 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1847 cursor.key_beg.obj_type = 0; 1848 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1849 cursor.asof = ip->obj_asof; 1850 cursor.flags |= HAMMER_CURSOR_ASOF; 1851 1852 error = hammer_ip_lookup(&cursor); 1853 if (error == 0) { 1854 error = hammer_ip_resolve_data(&cursor); 1855 if (error == 0) { 1856 KKASSERT(cursor.leaf->data_len >= 1857 HAMMER_SYMLINK_NAME_OFF); 1858 error = uiomove(cursor.data->symlink.name, 1859 cursor.leaf->data_len - 1860 HAMMER_SYMLINK_NAME_OFF, 1861 ap->a_uio); 1862 } 1863 } 1864 hammer_done_cursor(&cursor); 1865 hammer_done_transaction(&trans); 1866 lwkt_reltoken(&hmp->fs_token); 1867 return(error); 1868 } 1869 1870 /* 1871 * hammer_vop_nremove { nch, dvp, cred } 1872 */ 1873 static 1874 int 1875 hammer_vop_nremove(struct vop_nremove_args *ap) 1876 { 1877 struct hammer_transaction trans; 1878 hammer_inode_t dip; 1879 hammer_mount_t hmp; 1880 int error; 1881 1882 dip = VTOI(ap->a_dvp); 1883 hmp = dip->hmp; 1884 1885 if (hammer_nohistory(dip) == 0 && 1886 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1887 return (error); 1888 } 1889 1890 lwkt_gettoken(&hmp->fs_token); 1891 hammer_start_transaction(&trans, hmp); 1892 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1893 hammer_done_transaction(&trans); 1894 if (error == 0) 1895 hammer_knote(ap->a_dvp, NOTE_WRITE); 1896 lwkt_reltoken(&hmp->fs_token); 
1897 return (error); 1898 } 1899 1900 /* 1901 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1902 */ 1903 static 1904 int 1905 hammer_vop_nrename(struct vop_nrename_args *ap) 1906 { 1907 struct hammer_transaction trans; 1908 struct namecache *fncp; 1909 struct namecache *tncp; 1910 hammer_inode_t fdip; 1911 hammer_inode_t tdip; 1912 hammer_inode_t ip; 1913 hammer_mount_t hmp; 1914 struct hammer_cursor cursor; 1915 int64_t namekey; 1916 uint32_t max_iterations; 1917 int nlen, error; 1918 1919 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1920 return(EXDEV); 1921 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1922 return(EXDEV); 1923 1924 fdip = VTOI(ap->a_fdvp); 1925 tdip = VTOI(ap->a_tdvp); 1926 fncp = ap->a_fnch->ncp; 1927 tncp = ap->a_tnch->ncp; 1928 ip = VTOI(fncp->nc_vp); 1929 KKASSERT(ip != NULL); 1930 1931 hmp = ip->hmp; 1932 1933 if (fdip->obj_localization != tdip->obj_localization) 1934 return(EXDEV); 1935 if (fdip->obj_localization != ip->obj_localization) 1936 return(EXDEV); 1937 1938 if (fdip->flags & HAMMER_INODE_RO) 1939 return (EROFS); 1940 if (tdip->flags & HAMMER_INODE_RO) 1941 return (EROFS); 1942 if (ip->flags & HAMMER_INODE_RO) 1943 return (EROFS); 1944 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1945 return (error); 1946 1947 lwkt_gettoken(&hmp->fs_token); 1948 hammer_start_transaction(&trans, hmp); 1949 1950 /* 1951 * Remove tncp from the target directory and then link ip as 1952 * tncp. XXX pass trans to dounlink 1953 * 1954 * Force the inode sync-time to match the transaction so it is 1955 * in-sync with the creation of the target directory entry. 
1956 */ 1957 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1958 ap->a_cred, 0, -1); 1959 if (error == 0 || error == ENOENT) { 1960 error = hammer_ip_add_direntry(&trans, tdip, 1961 tncp->nc_name, tncp->nc_nlen, 1962 ip); 1963 if (error == 0) { 1964 ip->ino_data.parent_obj_id = tdip->obj_id; 1965 ip->ino_data.ctime = trans.time; 1966 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 1967 } 1968 } 1969 if (error) 1970 goto failed; /* XXX */ 1971 1972 /* 1973 * Locate the record in the originating directory and remove it. 1974 * 1975 * Calculate the namekey and setup the key range for the scan. This 1976 * works kinda like a chained hash table where the lower 32 bits 1977 * of the namekey synthesize the chain. 1978 * 1979 * The key range is inclusive of both key_beg and key_end. 1980 */ 1981 namekey = hammer_direntry_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 1982 &max_iterations); 1983 retry: 1984 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 1985 cursor.key_beg.localization = fdip->obj_localization | 1986 hammer_dir_localization(fdip); 1987 cursor.key_beg.obj_id = fdip->obj_id; 1988 cursor.key_beg.key = namekey; 1989 cursor.key_beg.create_tid = 0; 1990 cursor.key_beg.delete_tid = 0; 1991 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1992 cursor.key_beg.obj_type = 0; 1993 1994 cursor.key_end = cursor.key_beg; 1995 cursor.key_end.key += max_iterations; 1996 cursor.asof = fdip->obj_asof; 1997 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1998 1999 /* 2000 * Scan all matching records (the chain), locate the one matching 2001 * the requested path component. 2002 * 2003 * The hammer_ip_*() functions merge in-memory records with on-disk 2004 * records for the purposes of the search. 
2005 */ 2006 error = hammer_ip_first(&cursor); 2007 while (error == 0) { 2008 if (hammer_ip_resolve_data(&cursor) != 0) 2009 break; 2010 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2011 KKASSERT(nlen > 0); 2012 if (fncp->nc_nlen == nlen && 2013 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2014 break; 2015 } 2016 error = hammer_ip_next(&cursor); 2017 } 2018 2019 /* 2020 * If all is ok we have to get the inode so we can adjust nlinks. 2021 * 2022 * WARNING: hammer_ip_del_direntry() may have to terminate the 2023 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2024 * twice. 2025 */ 2026 if (error == 0) 2027 error = hammer_ip_del_direntry(&trans, &cursor, fdip, ip); 2028 2029 /* 2030 * XXX A deadlock here will break rename's atomicy for the purposes 2031 * of crash recovery. 2032 */ 2033 if (error == EDEADLK) { 2034 hammer_done_cursor(&cursor); 2035 goto retry; 2036 } 2037 2038 /* 2039 * Cleanup and tell the kernel that the rename succeeded. 2040 * 2041 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2042 * without formally acquiring the vp since the vp might 2043 * have zero refs on it, or in the middle of a reclaim, 2044 * etc. 
2045 */ 2046 hammer_done_cursor(&cursor); 2047 if (error == 0) { 2048 cache_rename(ap->a_fnch, ap->a_tnch); 2049 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2050 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2051 while (ip->vp) { 2052 struct vnode *vp; 2053 2054 error = hammer_get_vnode(ip, &vp); 2055 if (error == 0 && vp) { 2056 vn_unlock(vp); 2057 hammer_knote(ip->vp, NOTE_RENAME); 2058 vrele(vp); 2059 break; 2060 } 2061 hdkprintf("ip/vp race2 avoided\n"); 2062 } 2063 } 2064 2065 failed: 2066 hammer_done_transaction(&trans); 2067 lwkt_reltoken(&hmp->fs_token); 2068 return (error); 2069 } 2070 2071 /* 2072 * hammer_vop_nrmdir { nch, dvp, cred } 2073 */ 2074 static 2075 int 2076 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2077 { 2078 struct hammer_transaction trans; 2079 hammer_inode_t dip; 2080 hammer_mount_t hmp; 2081 int error; 2082 2083 dip = VTOI(ap->a_dvp); 2084 hmp = dip->hmp; 2085 2086 if (hammer_nohistory(dip) == 0 && 2087 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2088 return (error); 2089 } 2090 2091 lwkt_gettoken(&hmp->fs_token); 2092 hammer_start_transaction(&trans, hmp); 2093 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2094 hammer_done_transaction(&trans); 2095 if (error == 0) 2096 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2097 lwkt_reltoken(&hmp->fs_token); 2098 return (error); 2099 } 2100 2101 /* 2102 * hammer_vop_markatime { vp, cred } 2103 */ 2104 static 2105 int 2106 hammer_vop_markatime(struct vop_markatime_args *ap) 2107 { 2108 struct hammer_transaction trans; 2109 hammer_inode_t ip; 2110 hammer_mount_t hmp; 2111 2112 ip = VTOI(ap->a_vp); 2113 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2114 return (EROFS); 2115 if (ip->flags & HAMMER_INODE_RO) 2116 return (EROFS); 2117 hmp = ip->hmp; 2118 if (hmp->mp->mnt_flag & MNT_NOATIME) 2119 return (0); 2120 lwkt_gettoken(&hmp->fs_token); 2121 hammer_start_transaction(&trans, hmp); 2122 2123 ip->ino_data.atime = trans.time; 2124 hammer_modify_inode(&trans, ip, 
HAMMER_INODE_ATIME); 2125 hammer_done_transaction(&trans); 2126 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2127 lwkt_reltoken(&hmp->fs_token); 2128 return (0); 2129 } 2130 2131 /* 2132 * hammer_vop_setattr { vp, vap, cred } 2133 */ 2134 static 2135 int 2136 hammer_vop_setattr(struct vop_setattr_args *ap) 2137 { 2138 struct hammer_transaction trans; 2139 hammer_inode_t ip; 2140 struct vattr *vap; 2141 hammer_mount_t hmp; 2142 int modflags; 2143 int error; 2144 int truncating; 2145 int blksize; 2146 int kflags; 2147 #if 0 2148 int64_t aligned_size; 2149 #endif 2150 uint32_t flags; 2151 2152 vap = ap->a_vap; 2153 ip = ap->a_vp->v_data; 2154 modflags = 0; 2155 kflags = 0; 2156 hmp = ip->hmp; 2157 2158 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2159 return(EROFS); 2160 if (ip->flags & HAMMER_INODE_RO) 2161 return (EROFS); 2162 if (hammer_nohistory(ip) == 0 && 2163 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2164 return (error); 2165 } 2166 2167 lwkt_gettoken(&hmp->fs_token); 2168 hammer_start_transaction(&trans, hmp); 2169 error = 0; 2170 2171 if (vap->va_flags != VNOVAL) { 2172 flags = ip->ino_data.uflags; 2173 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2174 hammer_to_unix_xid(&ip->ino_data.uid), 2175 ap->a_cred); 2176 if (error == 0) { 2177 if (ip->ino_data.uflags != flags) { 2178 ip->ino_data.uflags = flags; 2179 ip->ino_data.ctime = trans.time; 2180 modflags |= HAMMER_INODE_DDIRTY; 2181 kflags |= NOTE_ATTRIB; 2182 } 2183 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2184 error = 0; 2185 goto done; 2186 } 2187 } 2188 goto done; 2189 } 2190 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2191 error = EPERM; 2192 goto done; 2193 } 2194 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2195 mode_t cur_mode = ip->ino_data.mode; 2196 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2197 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2198 hammer_uuid_t uuid_uid; 2199 hammer_uuid_t uuid_gid; 2200 2201 error 
= vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2202 ap->a_cred, 2203 &cur_uid, &cur_gid, &cur_mode); 2204 if (error == 0) { 2205 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2206 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2207 if (bcmp(&uuid_uid, &ip->ino_data.uid, 2208 sizeof(uuid_uid)) || 2209 bcmp(&uuid_gid, &ip->ino_data.gid, 2210 sizeof(uuid_gid)) || 2211 ip->ino_data.mode != cur_mode) { 2212 ip->ino_data.uid = uuid_uid; 2213 ip->ino_data.gid = uuid_gid; 2214 ip->ino_data.mode = cur_mode; 2215 ip->ino_data.ctime = trans.time; 2216 modflags |= HAMMER_INODE_DDIRTY; 2217 } 2218 kflags |= NOTE_ATTRIB; 2219 } 2220 } 2221 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2222 switch(ap->a_vp->v_type) { 2223 case VREG: 2224 if (vap->va_size == ip->ino_data.size) 2225 break; 2226 2227 /* 2228 * Log the operation if in fast-fsync mode or if 2229 * there are unterminated redo write records present. 2230 * 2231 * The second check is needed so the recovery code 2232 * properly truncates write redos even if nominal 2233 * REDO operations is turned off due to excessive 2234 * writes, because the related records might be 2235 * destroyed and never lay down a TERM_WRITE. 2236 */ 2237 if ((ip->flags & HAMMER_INODE_REDO) || 2238 (ip->flags & HAMMER_INODE_RDIRTY)) { 2239 error = hammer_generate_redo(&trans, ip, 2240 vap->va_size, 2241 HAMMER_REDO_TRUNC, 2242 NULL, 0); 2243 } 2244 blksize = hammer_blocksize(vap->va_size); 2245 2246 /* 2247 * XXX break atomicy, we can deadlock the backend 2248 * if we do not release the lock. Probably not a 2249 * big deal here. 
2250 */ 2251 if (vap->va_size < ip->ino_data.size) { 2252 nvtruncbuf(ap->a_vp, vap->va_size, 2253 blksize, 2254 hammer_blockoff(vap->va_size), 2255 0); 2256 truncating = 1; 2257 kflags |= NOTE_WRITE; 2258 } else { 2259 nvextendbuf(ap->a_vp, 2260 ip->ino_data.size, 2261 vap->va_size, 2262 hammer_blocksize(ip->ino_data.size), 2263 hammer_blocksize(vap->va_size), 2264 hammer_blockoff(ip->ino_data.size), 2265 hammer_blockoff(vap->va_size), 2266 0); 2267 truncating = 0; 2268 kflags |= NOTE_WRITE | NOTE_EXTEND; 2269 } 2270 ip->ino_data.size = vap->va_size; 2271 ip->ino_data.mtime = trans.time; 2272 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2273 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2274 2275 /* 2276 * On-media truncation is cached in the inode until 2277 * the inode is synchronized. We must immediately 2278 * handle any frontend records. 2279 */ 2280 if (truncating) { 2281 hammer_ip_frontend_trunc(ip, vap->va_size); 2282 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2283 ip->flags |= HAMMER_INODE_TRUNCATED; 2284 ip->trunc_off = vap->va_size; 2285 hammer_inode_dirty(ip); 2286 } else if (ip->trunc_off > vap->va_size) { 2287 ip->trunc_off = vap->va_size; 2288 } 2289 } 2290 2291 #if 0 2292 /* 2293 * When truncating, nvtruncbuf() may have cleaned out 2294 * a portion of the last block on-disk in the buffer 2295 * cache. We must clean out any frontend records 2296 * for blocks beyond the new last block. 
2297 */ 2298 aligned_size = (vap->va_size + (blksize - 1)) & 2299 ~(int64_t)(blksize - 1); 2300 if (truncating && vap->va_size < aligned_size) { 2301 aligned_size -= blksize; 2302 hammer_ip_frontend_trunc(ip, aligned_size); 2303 } 2304 #endif 2305 break; 2306 case VDATABASE: 2307 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2308 ip->flags |= HAMMER_INODE_TRUNCATED; 2309 ip->trunc_off = vap->va_size; 2310 hammer_inode_dirty(ip); 2311 } else if (ip->trunc_off > vap->va_size) { 2312 ip->trunc_off = vap->va_size; 2313 } 2314 hammer_ip_frontend_trunc(ip, vap->va_size); 2315 ip->ino_data.size = vap->va_size; 2316 ip->ino_data.mtime = trans.time; 2317 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2318 kflags |= NOTE_ATTRIB; 2319 break; 2320 default: 2321 error = EINVAL; 2322 goto done; 2323 } 2324 break; 2325 } 2326 if (vap->va_atime.tv_sec != VNOVAL) { 2327 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2328 modflags |= HAMMER_INODE_ATIME; 2329 kflags |= NOTE_ATTRIB; 2330 } 2331 if (vap->va_mtime.tv_sec != VNOVAL) { 2332 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2333 modflags |= HAMMER_INODE_MTIME; 2334 kflags |= NOTE_ATTRIB; 2335 } 2336 if (vap->va_mode != (mode_t)VNOVAL) { 2337 mode_t cur_mode = ip->ino_data.mode; 2338 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2339 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2340 2341 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2342 cur_uid, cur_gid, &cur_mode); 2343 if (error == 0 && ip->ino_data.mode != cur_mode) { 2344 ip->ino_data.mode = cur_mode; 2345 ip->ino_data.ctime = trans.time; 2346 modflags |= HAMMER_INODE_DDIRTY; 2347 kflags |= NOTE_ATTRIB; 2348 } 2349 } 2350 done: 2351 if (error == 0) 2352 hammer_modify_inode(&trans, ip, modflags); 2353 hammer_done_transaction(&trans); 2354 hammer_knote(ap->a_vp, kflags); 2355 lwkt_reltoken(&hmp->fs_token); 2356 return (error); 2357 } 2358 2359 /* 2360 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, 
target } 2361 */ 2362 static 2363 int 2364 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2365 { 2366 struct hammer_transaction trans; 2367 hammer_inode_t dip; 2368 hammer_inode_t nip; 2369 hammer_record_t record; 2370 struct nchandle *nch; 2371 hammer_mount_t hmp; 2372 int error; 2373 int bytes; 2374 2375 ap->a_vap->va_type = VLNK; 2376 2377 nch = ap->a_nch; 2378 dip = VTOI(ap->a_dvp); 2379 hmp = dip->hmp; 2380 2381 if (dip->flags & HAMMER_INODE_RO) 2382 return (EROFS); 2383 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2384 return (error); 2385 2386 /* 2387 * Create a transaction to cover the operations we perform. 2388 */ 2389 lwkt_gettoken(&hmp->fs_token); 2390 hammer_start_transaction(&trans, hmp); 2391 2392 /* 2393 * Create a new filesystem object of the requested type. The 2394 * returned inode will be referenced but not locked. 2395 */ 2396 2397 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2398 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2399 NULL, &nip); 2400 if (error) { 2401 hammer_done_transaction(&trans); 2402 *ap->a_vpp = NULL; 2403 lwkt_reltoken(&hmp->fs_token); 2404 return (error); 2405 } 2406 2407 /* 2408 * Add a record representing the symlink. symlink stores the link 2409 * as pure data, not a string, and is no \0 terminated. 
2410 */ 2411 if (error == 0) { 2412 bytes = strlen(ap->a_target); 2413 2414 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2415 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2416 } else { 2417 record = hammer_alloc_mem_record(nip, bytes); 2418 record->type = HAMMER_MEM_RECORD_GENERAL; 2419 2420 record->leaf.base.localization = nip->obj_localization | 2421 HAMMER_LOCALIZE_MISC; 2422 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2423 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2424 record->leaf.data_len = bytes; 2425 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2426 bcopy(ap->a_target, record->data->symlink.name, bytes); 2427 error = hammer_ip_add_record(&trans, record); 2428 } 2429 2430 /* 2431 * Set the file size to the length of the link. 2432 */ 2433 if (error == 0) { 2434 nip->ino_data.size = bytes; 2435 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2436 } 2437 } 2438 if (error == 0) 2439 error = hammer_ip_add_direntry(&trans, dip, nch->ncp->nc_name, 2440 nch->ncp->nc_nlen, nip); 2441 2442 /* 2443 * Finish up. 
2444 */ 2445 if (error) { 2446 hammer_rel_inode(nip, 0); 2447 *ap->a_vpp = NULL; 2448 } else { 2449 error = hammer_get_vnode(nip, ap->a_vpp); 2450 hammer_rel_inode(nip, 0); 2451 if (error == 0) { 2452 cache_setunresolved(ap->a_nch); 2453 cache_setvp(ap->a_nch, *ap->a_vpp); 2454 hammer_knote(ap->a_dvp, NOTE_WRITE); 2455 } 2456 } 2457 hammer_done_transaction(&trans); 2458 lwkt_reltoken(&hmp->fs_token); 2459 return (error); 2460 } 2461 2462 /* 2463 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2464 */ 2465 static 2466 int 2467 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2468 { 2469 struct hammer_transaction trans; 2470 hammer_inode_t dip; 2471 hammer_mount_t hmp; 2472 int error; 2473 2474 dip = VTOI(ap->a_dvp); 2475 hmp = dip->hmp; 2476 2477 if (hammer_nohistory(dip) == 0 && 2478 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2479 return (error); 2480 } 2481 2482 lwkt_gettoken(&hmp->fs_token); 2483 hammer_start_transaction(&trans, hmp); 2484 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2485 ap->a_cred, ap->a_flags, -1); 2486 hammer_done_transaction(&trans); 2487 lwkt_reltoken(&hmp->fs_token); 2488 2489 return (error); 2490 } 2491 2492 /* 2493 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2494 */ 2495 static 2496 int 2497 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2498 { 2499 hammer_inode_t ip = ap->a_vp->v_data; 2500 hammer_mount_t hmp = ip->hmp; 2501 int error; 2502 2503 lwkt_gettoken(&hmp->fs_token); 2504 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2505 ap->a_fflag, ap->a_cred); 2506 lwkt_reltoken(&hmp->fs_token); 2507 return (error); 2508 } 2509 2510 static 2511 int 2512 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2513 { 2514 static const struct mountctl_opt extraopt[] = { 2515 { HMNT_NOHISTORY, "nohistory" }, 2516 { HMNT_MASTERID, "master" }, 2517 { HMNT_NOMIRROR, "nomirror" }, 2518 { 0, NULL} 2519 2520 }; 2521 hammer_mount_t hmp; 2522 struct mount *mp; 2523 int usedbytes; 2524 int error; 2525 2526 error = 
0; 2527 usedbytes = 0; 2528 mp = ap->a_head.a_ops->head.vv_mount; 2529 KKASSERT(mp->mnt_data != NULL); 2530 hmp = (hammer_mount_t)mp->mnt_data; 2531 2532 lwkt_gettoken(&hmp->fs_token); 2533 2534 switch(ap->a_op) { 2535 case MOUNTCTL_SET_EXPORT: 2536 if (ap->a_ctllen != sizeof(struct export_args)) 2537 error = EINVAL; 2538 else 2539 error = hammer_vfs_export(mp, ap->a_op, 2540 (const struct export_args *)ap->a_ctl); 2541 break; 2542 case MOUNTCTL_MOUNTFLAGS: 2543 /* 2544 * Call standard mountctl VOP function 2545 * so we get user mount flags. 2546 */ 2547 error = vop_stdmountctl(ap); 2548 if (error) 2549 break; 2550 2551 usedbytes = *ap->a_res; 2552 2553 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2554 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2555 ap->a_buf, 2556 ap->a_buflen - usedbytes, 2557 &error); 2558 } 2559 2560 *ap->a_res += usedbytes; 2561 break; 2562 default: 2563 error = vop_stdmountctl(ap); 2564 break; 2565 } 2566 lwkt_reltoken(&hmp->fs_token); 2567 return(error); 2568 } 2569 2570 /* 2571 * hammer_vop_strategy { vp, bio } 2572 * 2573 * Strategy call, used for regular file read & write only. Note that the 2574 * bp may represent a cluster. 2575 * 2576 * To simplify operation and allow better optimizations in the future, 2577 * this code does not make any assumptions with regards to buffer alignment 2578 * or size. 2579 */ 2580 static 2581 int 2582 hammer_vop_strategy(struct vop_strategy_args *ap) 2583 { 2584 struct buf *bp; 2585 int error; 2586 2587 bp = ap->a_bio->bio_buf; 2588 2589 switch(bp->b_cmd) { 2590 case BUF_CMD_READ: 2591 error = hammer_vop_strategy_read(ap); 2592 break; 2593 case BUF_CMD_WRITE: 2594 error = hammer_vop_strategy_write(ap); 2595 break; 2596 default: 2597 bp->b_error = error = EINVAL; 2598 bp->b_flags |= B_ERROR; 2599 biodone(ap->a_bio); 2600 break; 2601 } 2602 2603 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2604 2605 return (error); 2606 } 2607 2608 /* 2609 * Read from a regular file. 
Iterate the related records and fill in the 2610 * BIO/BUF. Gaps are zero-filled. 2611 * 2612 * The support code in hammer_object.c should be used to deal with mixed 2613 * in-memory and on-disk records. 2614 * 2615 * NOTE: Can be called from the cluster code with an oversized buf. 2616 * 2617 * XXX atime update 2618 */ 2619 static 2620 int 2621 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2622 { 2623 struct hammer_transaction trans; 2624 hammer_inode_t ip; 2625 hammer_inode_t dip; 2626 hammer_mount_t hmp; 2627 struct hammer_cursor cursor; 2628 hammer_base_elm_t base; 2629 hammer_off_t disk_offset; 2630 struct bio *bio; 2631 struct bio *nbio; 2632 struct buf *bp; 2633 int64_t rec_offset; 2634 int64_t ran_end; 2635 int64_t tmp64; 2636 int error; 2637 int boff; 2638 int roff; 2639 int n; 2640 int isdedupable; 2641 2642 bio = ap->a_bio; 2643 bp = bio->bio_buf; 2644 ip = ap->a_vp->v_data; 2645 hmp = ip->hmp; 2646 2647 /* 2648 * The zone-2 disk offset may have been set by the cluster code via 2649 * a BMAP operation, or else should be NOOFFSET. 2650 * 2651 * Checking the high bits for a match against zone-2 should suffice. 2652 * 2653 * In cases where a lot of data duplication is present it may be 2654 * more beneficial to drop through and doubule-buffer through the 2655 * device. 2656 */ 2657 nbio = push_bio(bio); 2658 if (hammer_is_zone_large_data(nbio->bio_offset)) { 2659 if (hammer_double_buffer == 0) { 2660 lwkt_gettoken(&hmp->fs_token); 2661 error = hammer_io_direct_read(hmp, nbio, NULL); 2662 lwkt_reltoken(&hmp->fs_token); 2663 return (error); 2664 } 2665 2666 /* 2667 * Try to shortcut requests for double_buffer mode too. 2668 * Since this mode runs through the device buffer cache 2669 * only compatible buffer sizes (meaning those generated 2670 * by normal filesystem buffers) are legal. 
2671 */ 2672 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) { 2673 lwkt_gettoken(&hmp->fs_token); 2674 error = hammer_io_indirect_read(hmp, nbio, NULL); 2675 lwkt_reltoken(&hmp->fs_token); 2676 return (error); 2677 } 2678 } 2679 2680 /* 2681 * Well, that sucked. Do it the hard way. If all the stars are 2682 * aligned we may still be able to issue a direct-read. 2683 */ 2684 lwkt_gettoken(&hmp->fs_token); 2685 hammer_simple_transaction(&trans, hmp); 2686 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2687 2688 /* 2689 * Key range (begin and end inclusive) to scan. Note that the key's 2690 * stored in the actual records represent BASE+LEN, not BASE. The 2691 * first record containing bio_offset will have a key > bio_offset. 2692 */ 2693 cursor.key_beg.localization = ip->obj_localization | 2694 HAMMER_LOCALIZE_MISC; 2695 cursor.key_beg.obj_id = ip->obj_id; 2696 cursor.key_beg.create_tid = 0; 2697 cursor.key_beg.delete_tid = 0; 2698 cursor.key_beg.obj_type = 0; 2699 cursor.key_beg.key = bio->bio_offset + 1; 2700 cursor.asof = ip->obj_asof; 2701 cursor.flags |= HAMMER_CURSOR_ASOF; 2702 2703 cursor.key_end = cursor.key_beg; 2704 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2705 #if 0 2706 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2707 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2708 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2709 cursor.key_end.key = HAMMER_MAX_KEY; 2710 } else 2711 #endif 2712 { 2713 ran_end = bio->bio_offset + bp->b_bufsize; 2714 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2715 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2716 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2717 if (tmp64 < ran_end) 2718 cursor.key_end.key = HAMMER_MAX_KEY; 2719 else 2720 cursor.key_end.key = ran_end + MAXPHYS + 1; 2721 } 2722 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2723 2724 /* 2725 * Set NOSWAPCACHE for cursor data extraction if double buffering 2726 * is disabled or (if the file is not marked 
cacheable via chflags 2727 * and vm.swapcache_use_chflags is enabled). 2728 */ 2729 if (hammer_double_buffer == 0 || 2730 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 && 2731 vm_swapcache_use_chflags)) { 2732 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE; 2733 } 2734 2735 error = hammer_ip_first(&cursor); 2736 boff = 0; 2737 2738 while (error == 0) { 2739 /* 2740 * Get the base file offset of the record. The key for 2741 * data records is (base + bytes) rather then (base). 2742 */ 2743 base = &cursor.leaf->base; 2744 rec_offset = base->key - cursor.leaf->data_len; 2745 2746 /* 2747 * Calculate the gap, if any, and zero-fill it. 2748 * 2749 * n is the offset of the start of the record verses our 2750 * current seek offset in the bio. 2751 */ 2752 n = (int)(rec_offset - (bio->bio_offset + boff)); 2753 if (n > 0) { 2754 if (n > bp->b_bufsize - boff) 2755 n = bp->b_bufsize - boff; 2756 bzero((char *)bp->b_data + boff, n); 2757 boff += n; 2758 n = 0; 2759 } 2760 2761 /* 2762 * Calculate the data offset in the record and the number 2763 * of bytes we can copy. 2764 * 2765 * There are two degenerate cases. First, boff may already 2766 * be at bp->b_bufsize. Secondly, the data offset within 2767 * the record may exceed the record's size. 2768 */ 2769 roff = -n; 2770 rec_offset += roff; 2771 n = cursor.leaf->data_len - roff; 2772 if (n <= 0) { 2773 hdkprintf("bad n=%d roff=%d\n", n, roff); 2774 n = 0; 2775 } else if (n > bp->b_bufsize - boff) { 2776 n = bp->b_bufsize - boff; 2777 } 2778 2779 /* 2780 * Deal with cached truncations. This cool bit of code 2781 * allows truncate()/ftruncate() to avoid having to sync 2782 * the file. 2783 * 2784 * If the frontend is truncated then all backend records are 2785 * subject to the frontend's truncation. 2786 * 2787 * If the backend is truncated then backend records on-disk 2788 * (but not in-memory) are subject to the backend's 2789 * truncation. 
In-memory records owned by the backend 2790 * represent data written after the truncation point on the 2791 * backend and must not be truncated. 2792 * 2793 * Truncate operations deal with frontend buffer cache 2794 * buffers and frontend-owned in-memory records synchronously. 2795 */ 2796 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2797 if (hammer_cursor_ondisk(&cursor)/* || 2798 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2799 if (ip->trunc_off <= rec_offset) 2800 n = 0; 2801 else if (ip->trunc_off < rec_offset + n) 2802 n = (int)(ip->trunc_off - rec_offset); 2803 } 2804 } 2805 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2806 if (hammer_cursor_ondisk(&cursor)) { 2807 if (ip->sync_trunc_off <= rec_offset) 2808 n = 0; 2809 else if (ip->sync_trunc_off < rec_offset + n) 2810 n = (int)(ip->sync_trunc_off - rec_offset); 2811 } 2812 } 2813 2814 /* 2815 * Try to issue a direct read into our bio if possible, 2816 * otherwise resolve the element data into a hammer_buffer 2817 * and copy. 2818 * 2819 * The buffer on-disk should be zerod past any real 2820 * truncation point, but may not be for any synthesized 2821 * truncation point from above. 2822 * 2823 * NOTE: disk_offset is only valid if the cursor data is 2824 * on-disk. 2825 */ 2826 disk_offset = cursor.leaf->data_offset + roff; 2827 isdedupable = (boff == 0 && n == bp->b_bufsize && 2828 hammer_cursor_ondisk(&cursor) && 2829 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2830 2831 if (isdedupable && hammer_double_buffer == 0) { 2832 /* 2833 * Direct read case 2834 */ 2835 KKASSERT(hammer_is_zone_large_data(disk_offset)); 2836 nbio->bio_offset = disk_offset; 2837 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2838 if (hammer_live_dedup && error == 0) 2839 hammer_dedup_cache_add(ip, cursor.leaf); 2840 goto done; 2841 } else if (isdedupable) { 2842 /* 2843 * Async I/O case for reading from backing store 2844 * and copying the data to the filesystem buffer. 
2845 * live-dedup has to verify the data anyway if it 2846 * gets a hit later so we can just add the entry 2847 * now. 2848 */ 2849 KKASSERT(hammer_is_zone_large_data(disk_offset)); 2850 nbio->bio_offset = disk_offset; 2851 if (hammer_live_dedup) 2852 hammer_dedup_cache_add(ip, cursor.leaf); 2853 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf); 2854 goto done; 2855 } else if (n) { 2856 error = hammer_ip_resolve_data(&cursor); 2857 if (error == 0) { 2858 if (hammer_live_dedup && isdedupable) 2859 hammer_dedup_cache_add(ip, cursor.leaf); 2860 bcopy((char *)cursor.data + roff, 2861 (char *)bp->b_data + boff, n); 2862 } 2863 } 2864 if (error) 2865 break; 2866 2867 /* 2868 * We have to be sure that the only elements added to the 2869 * dedup cache are those which are already on-media. 2870 */ 2871 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) 2872 hammer_dedup_cache_add(ip, cursor.leaf); 2873 2874 /* 2875 * Iterate until we have filled the request. 2876 */ 2877 boff += n; 2878 if (boff == bp->b_bufsize) 2879 break; 2880 error = hammer_ip_next(&cursor); 2881 } 2882 2883 /* 2884 * There may have been a gap after the last record 2885 */ 2886 if (error == ENOENT) 2887 error = 0; 2888 if (error == 0 && boff != bp->b_bufsize) { 2889 KKASSERT(boff < bp->b_bufsize); 2890 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2891 /* boff = bp->b_bufsize; */ 2892 } 2893 2894 /* 2895 * Disallow swapcache operation on the vnode buffer if double 2896 * buffering is enabled, the swapcache will get the data via 2897 * the block device buffer. 2898 */ 2899 if (hammer_double_buffer) 2900 bp->b_flags |= B_NOTMETA; 2901 2902 /* 2903 * Cleanup 2904 */ 2905 bp->b_resid = 0; 2906 bp->b_error = error; 2907 if (error) 2908 bp->b_flags |= B_ERROR; 2909 biodone(ap->a_bio); 2910 2911 done: 2912 /* 2913 * Cache the b-tree node for the last data read in cache[1]. 
2914 * 2915 * If we hit the file EOF then also cache the node in the 2916 * governing directory's cache[3], it will be used to initialize 2917 * the new inode's cache[1] for any inodes looked up via the directory. 2918 * 2919 * This doesn't reduce disk accesses since the B-Tree chain is 2920 * likely cached, but it does reduce cpu overhead when looking 2921 * up file offsets for cpdup/tar/cpio style iterations. 2922 */ 2923 if (cursor.node) 2924 hammer_cache_node(&ip->cache[1], cursor.node); 2925 if (ran_end >= ip->ino_data.size) { 2926 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 2927 ip->obj_asof, ip->obj_localization); 2928 if (dip) { 2929 hammer_cache_node(&dip->cache[3], cursor.node); 2930 hammer_rel_inode(dip, 0); 2931 } 2932 } 2933 hammer_done_cursor(&cursor); 2934 hammer_done_transaction(&trans); 2935 lwkt_reltoken(&hmp->fs_token); 2936 return(error); 2937 } 2938 2939 /* 2940 * BMAP operation - used to support cluster_read() only. 2941 * 2942 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 2943 * 2944 * This routine may return EOPNOTSUPP if the opration is not supported for 2945 * the specified offset. The contents of the pointer arguments do not 2946 * need to be initialized in that case. 2947 * 2948 * If a disk address is available and properly aligned return 0 with 2949 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 2950 * to the run-length relative to that offset. Callers may assume that 2951 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 2952 * large, so return EOPNOTSUPP if it is not sufficiently large. 
2953 */ 2954 static 2955 int 2956 hammer_vop_bmap(struct vop_bmap_args *ap) 2957 { 2958 struct hammer_transaction trans; 2959 hammer_inode_t ip; 2960 hammer_mount_t hmp; 2961 struct hammer_cursor cursor; 2962 hammer_base_elm_t base; 2963 int64_t rec_offset; 2964 int64_t ran_end; 2965 int64_t tmp64; 2966 int64_t base_offset; 2967 int64_t base_disk_offset; 2968 int64_t last_offset; 2969 hammer_off_t last_disk_offset; 2970 hammer_off_t disk_offset; 2971 int rec_len; 2972 int error; 2973 int blksize; 2974 2975 ip = ap->a_vp->v_data; 2976 hmp = ip->hmp; 2977 2978 /* 2979 * We can only BMAP regular files. We can't BMAP database files, 2980 * directories, etc. 2981 */ 2982 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 2983 return(EOPNOTSUPP); 2984 2985 /* 2986 * bmap is typically called with runp/runb both NULL when used 2987 * for writing. We do not support BMAP for writing atm. 2988 */ 2989 if (ap->a_cmd != BUF_CMD_READ) 2990 return(EOPNOTSUPP); 2991 2992 /* 2993 * Scan the B-Tree to acquire blockmap addresses, then translate 2994 * to raw addresses. 2995 */ 2996 lwkt_gettoken(&hmp->fs_token); 2997 hammer_simple_transaction(&trans, hmp); 2998 2999 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 3000 3001 /* 3002 * Key range (begin and end inclusive) to scan. Note that the key's 3003 * stored in the actual records represent BASE+LEN, not BASE. The 3004 * first record containing bio_offset will have a key > bio_offset. 
3005 */ 3006 cursor.key_beg.localization = ip->obj_localization | 3007 HAMMER_LOCALIZE_MISC; 3008 cursor.key_beg.obj_id = ip->obj_id; 3009 cursor.key_beg.create_tid = 0; 3010 cursor.key_beg.delete_tid = 0; 3011 cursor.key_beg.obj_type = 0; 3012 if (ap->a_runb) 3013 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 3014 else 3015 cursor.key_beg.key = ap->a_loffset + 1; 3016 if (cursor.key_beg.key < 0) 3017 cursor.key_beg.key = 0; 3018 cursor.asof = ip->obj_asof; 3019 cursor.flags |= HAMMER_CURSOR_ASOF; 3020 3021 cursor.key_end = cursor.key_beg; 3022 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 3023 3024 ran_end = ap->a_loffset + MAXPHYS; 3025 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 3026 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 3027 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 3028 if (tmp64 < ran_end) 3029 cursor.key_end.key = HAMMER_MAX_KEY; 3030 else 3031 cursor.key_end.key = ran_end + MAXPHYS + 1; 3032 3033 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 3034 3035 error = hammer_ip_first(&cursor); 3036 base_offset = last_offset = 0; 3037 base_disk_offset = last_disk_offset = 0; 3038 3039 while (error == 0) { 3040 /* 3041 * Get the base file offset of the record. The key for 3042 * data records is (base + bytes) rather then (base). 3043 * 3044 * NOTE: rec_offset + rec_len may exceed the end-of-file. 3045 * The extra bytes should be zero on-disk and the BMAP op 3046 * should still be ok. 3047 */ 3048 base = &cursor.leaf->base; 3049 rec_offset = base->key - cursor.leaf->data_len; 3050 rec_len = cursor.leaf->data_len; 3051 3052 /* 3053 * Incorporate any cached truncation. 3054 * 3055 * NOTE: Modifications to rec_len based on synthesized 3056 * truncation points remove the guarantee that any extended 3057 * data on disk is zero (since the truncations may not have 3058 * taken place on-media yet). 
3059 */ 3060 if (ip->flags & HAMMER_INODE_TRUNCATED) { 3061 if (hammer_cursor_ondisk(&cursor) || 3062 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 3063 if (ip->trunc_off <= rec_offset) 3064 rec_len = 0; 3065 else if (ip->trunc_off < rec_offset + rec_len) 3066 rec_len = (int)(ip->trunc_off - rec_offset); 3067 } 3068 } 3069 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 3070 if (hammer_cursor_ondisk(&cursor)) { 3071 if (ip->sync_trunc_off <= rec_offset) 3072 rec_len = 0; 3073 else if (ip->sync_trunc_off < rec_offset + rec_len) 3074 rec_len = (int)(ip->sync_trunc_off - rec_offset); 3075 } 3076 } 3077 3078 /* 3079 * Accumulate information. If we have hit a discontiguous 3080 * block reset base_offset unless we are already beyond the 3081 * requested offset. If we are, that's it, we stop. 3082 */ 3083 if (error) 3084 break; 3085 if (hammer_cursor_ondisk(&cursor)) { 3086 disk_offset = cursor.leaf->data_offset; 3087 if (rec_offset != last_offset || 3088 disk_offset != last_disk_offset) { 3089 if (rec_offset > ap->a_loffset) 3090 break; 3091 base_offset = rec_offset; 3092 base_disk_offset = disk_offset; 3093 } 3094 last_offset = rec_offset + rec_len; 3095 last_disk_offset = disk_offset + rec_len; 3096 3097 if (hammer_live_dedup) 3098 hammer_dedup_cache_add(ip, cursor.leaf); 3099 } 3100 3101 error = hammer_ip_next(&cursor); 3102 } 3103 3104 if (cursor.node) 3105 hammer_cache_node(&ip->cache[1], cursor.node); 3106 3107 hammer_done_cursor(&cursor); 3108 hammer_done_transaction(&trans); 3109 lwkt_reltoken(&hmp->fs_token); 3110 3111 /* 3112 * If we couldn't find any records or the records we did find were 3113 * all behind the requested offset, return failure. A forward 3114 * truncation can leave a hole w/ no on-disk records. 
3115 */ 3116 if (last_offset == 0 || last_offset < ap->a_loffset) 3117 return (EOPNOTSUPP); 3118 3119 /* 3120 * Figure out the block size at the requested offset and adjust 3121 * our limits so the cluster_read() does not create inappropriately 3122 * sized buffer cache buffers. 3123 */ 3124 blksize = hammer_blocksize(ap->a_loffset); 3125 if (hammer_blocksize(base_offset) != blksize) { 3126 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 3127 } 3128 if (last_offset != ap->a_loffset && 3129 hammer_blocksize(last_offset - 1) != blksize) { 3130 last_offset = hammer_blockdemarc(ap->a_loffset, 3131 last_offset - 1); 3132 } 3133 3134 /* 3135 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 3136 * from occuring. 3137 */ 3138 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 3139 3140 if (!hammer_is_zone_large_data(disk_offset)) { 3141 /* 3142 * Only large-data zones can be direct-IOd 3143 */ 3144 error = EOPNOTSUPP; 3145 } else if ((disk_offset & HAMMER_BUFMASK) || 3146 (last_offset - ap->a_loffset) < blksize) { 3147 /* 3148 * doffsetp is not aligned or the forward run size does 3149 * not cover a whole buffer, disallow the direct I/O. 3150 */ 3151 error = EOPNOTSUPP; 3152 } else { 3153 /* 3154 * We're good. 3155 */ 3156 *ap->a_doffsetp = disk_offset; 3157 if (ap->a_runb) { 3158 *ap->a_runb = ap->a_loffset - base_offset; 3159 KKASSERT(*ap->a_runb >= 0); 3160 } 3161 if (ap->a_runp) { 3162 *ap->a_runp = last_offset - ap->a_loffset; 3163 KKASSERT(*ap->a_runp >= 0); 3164 } 3165 error = 0; 3166 } 3167 return(error); 3168 } 3169 3170 /* 3171 * Write to a regular file. Because this is a strategy call the OS is 3172 * trying to actually get data onto the media. 
 */
/*
 * Strategy-write handler for the buffer attached to ap->a_bio.
 *
 * Every path completes the bio: either directly via biodone() or by
 * handing it to hammer_io_direct_write().
 * NOTE(review): presumably hammer_io_direct_write() is responsible for
 * completing the bio it is given -- confirm in the hammer_io code.
 *
 * Returns EROFS when the inode is flagged read-only, 0 when the write is
 * discarded because the inode is being (or has been) deleted, and
 * otherwise the error value filled in by hammer_ip_add_bulk().
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize __debugvar;		/* only consumed by the KKASSERT below */
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The buffer must be exactly the size HAMMER uses for this offset.
	 */
	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/*
	 * Read-only inode: fail the buffer with EROFS.  This check runs
	 * before the fs_token is acquired, so there is nothing to release.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 *
	 * The buffer is completed as fully written (b_resid = 0) without
	 * any record being created, and 0 is returned.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		lwkt_reltoken(&hmp->fs_token);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end.
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 *	 allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffer's
	 *	 worth of data is if the file is small.  We do not try to
	 *	 allocate a fragment (from the small-data zone) at the end of
	 *	 an otherwise large file as this can lead to wildly separated
	 *	 data.
	 *
	 * Concretely: a write at a non-zero offset, or to a file larger
	 * than HAMMER_HBUFSIZE, reserves the whole buffer; otherwise only
	 * the file size, rounded by HAMMER_DATA_DOALIGN_WITH(), is reserved.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_HBUFSIZE)
		bytes = bp->b_bufsize;
	else
		bytes = HAMMER_DATA_DOALIGN_WITH(int, ip->ino_data.size);

	/*
	 * 'error' is only ever assigned through this call; it is what the
	 * function returns at the bottom.
	 */
	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);

	/*
	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	 * in hammer_vop_write().  We must flag the record so the proper
	 * REDO_TERM_WRITE entry is generated during the flush.
	 */
	if (record) {
		if (bp->b_flags & B_VFSFLAG1) {
			record->flags |= HAMMER_RECF_REDO;
			bp->b_flags &= ~B_VFSFLAG1;
		}
		if (record->flags & HAMMER_RECF_DEDUPED) {
			/*
			 * Record was marked deduplicated: no direct write
			 * is issued, the bio is completed here instead.
			 */
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(ap->a_bio);
		} else {
			hammer_io_direct_write(hmp, bio, record);
		}
		/*
		 * Kick off an inode flush once this inode holds more than
		 * one reserved record and the mount-wide count exceeds
		 * hammer_limit_recs.
		 */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/*
		 * No record could be created: fail the buffer with the
		 * error reported by hammer_ip_add_bulk().
		 */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * dounlink - disconnect a directory entry
 *
 * Looks up the name held in nch->ncp inside the directory dvp, removes
 * the matching directory entry and notifies the namecache and any
 * knotes on the target vnode.
 *
 * isdir > 0 requires the target to be a directory (else ENOTDIR),
 * isdir == 0 requires it not to be one (else EISDIR), and isdir < 0
 * skips the type check.  A directory target must also pass
 * hammer_ip_check_directory_empty().
 *
 * The 'cred' and 'flags' arguments are not referenced in this body.
 *
 * Returns EROFS if the directory inode is flagged read-only, otherwise
 * 0 or the error produced by the lookup/removal.  EDEADLK is never
 * returned to the caller; it restarts the operation at 'retry'.
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred,
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	uint32_t max_iterations;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_direntry_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	/*
	 * (Re)build the cursor from scratch.  We come back here whenever
	 * the body below ends with EDEADLK.
	 */
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization |
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  When the loop exits 'error' holds
	 * the termination status: 0 if a matching entry was found (the
	 * cursor is left positioned on it), otherwise whatever the
	 * hammer_ip_*() call that stopped the scan returned (the original
	 * note lists ENOENT or something else).
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		/* name length = record data length minus the fixed header */
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		/*
		 * A directory entry whose inode is missing is still removed:
		 * ENOENT is logged and cleared.  The code below null-checks
		 * 'ip' throughout.
		 * NOTE(review): presumably ip is NULL in this case -- confirm
		 * against hammer_get_inode().
		 */
		if (error == ENOENT) {
			hkprintf("WARNING: Removing dirent w/missing inode "
				"\"%s\"\n"
				"\tobj_id = %016jx\n",
				ncp->nc_name,
				(intmax_t)cursor.data->entry.obj_id);
			error = 0;
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0 && ip) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * The check directory code can loop and deadlock/retry.  Our
		 * own cursor's node locks must be released to avoid a 3-way
		 * deadlock with the flusher if the check directory code
		 * blocks.
		 *
		 * If any changes whatsoever have been made to the cursor
		 * set EDEADLK and retry.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor()
		 *	    function.
		 */
		if (error == 0 && ip && ip->ino_data.obj_type ==
				        HAMMER_OBJTYPE_DIRECTORY) {
			hammer_unlock_cursor(&cursor);
			error = hammer_ip_check_directory_empty(trans, ip);
			hammer_lock_cursor(&cursor);
			/*
			 * RETEST set after relocking overrides whatever the
			 * emptiness check returned.
			 */
			if (cursor.flags & HAMMER_CURSOR_RETEST) {
				hkprintf("Warning: avoided deadlock "
					"on rmdir '%s'\n",
					ncp->nc_name);
				error = EDEADLK;
			}
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_direntry() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_direntry(trans, &cursor,
						dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/*
			 * Tell the namecache that we are now unlinked.
			 */
			cache_unlink(nch);

			/*
			 * NOTE: ip->vp, if non-NULL, cannot be directly
			 *	 referenced without formally acquiring the
			 *	 vp since the vp might have zero refs on it,
			 *	 or in the middle of a reclaim, etc.
			 *
			 * NOTE: The cache_setunresolved() can rip the vp
			 *	 out from under us since the vp may not have
			 *	 any refs, in which case ip->vp will be NULL
			 *	 from the outset.
			 *	 (Original note; the call made above is now
			 *	 cache_unlink().)
			 *
			 * The loop repeats until either a vnode is obtained
			 * and notified or ip->vp is no longer set.  'error'
			 * is overwritten by hammer_get_vnode() here, so the
			 * value returned to the caller reflects the last
			 * hammer_get_vnode() attempt.
			 */
			while (ip && ip->vp) {
				struct vnode *vp;

				error = hammer_get_vnode(ip, &vp);
				if (error == 0 && vp) {
					vn_unlock(vp);
					hammer_knote(ip->vp, NOTE_DELETE);
#if 0
					/*
					 * Don't do this, it can deadlock
					 * on concurrent rm's of hardlinks.
					 * Shouldn't be needed any more.
					 */
					cache_inval_vp(ip->vp, CINV_DESTROY);
#endif
					vrele(vp);
					break;
				}
				hdkprintf("ip/vp race1 avoided\n");
			}
		}
		/* drop the reference obtained from hammer_get_inode() */
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	/*
	 * The cursor has been terminated on every path above, so it is
	 * safe to rebuild it and start over.
	 */
	if (error == EDEADLK)
		goto retry;

	return (error);
}

/************************************************************************
 *			    FIFO AND SPECFS OPS				*
 ************************************************************************
 *
 */

/*
 * Close on a FIFO vnode: forwarded unchanged to fifo_vnode_vops.
 */
static int
hammer_vop_fifoclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
}

/*
 * Read on a FIFO vnode: forwarded to fifo_vnode_vops, whose result is
 * returned as-is.
 */
static int
hammer_vop_fiforead (struct vop_read_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

/*
 * Write on a FIFO vnode: forwarded to fifo_vnode_vops, whose result is
 * returned as-is.
 */
static int
hammer_vop_fifowrite (struct vop_write_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/*
	 * XXX update access time
	 * NOTE(review): for a write this probably means the modification
	 * time -- confirm intent.
	 */
	return (error);
}

/*
 * kqfilter on a FIFO vnode: try fifo_vnode_vops first and fall back to
 * the regular HAMMER kqfilter only if that call returns an error.
 */
static
int
hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	if (error)
		error = hammer_vop_kqfilter(ap);
	return(error);
}

/************************************************************************
 *			    KQFILTER OPS				*
 ************************************************************************
 *
 */
static void filt_hammerdetach(struct knote *kn);
static int filt_hammerread(struct knote *kn, long hint);
static int filt_hammerwrite(struct knote *kn, long hint);
static int filt_hammervnode(struct knote *kn, long hint);

/*
 * One filterops table per supported filter.  All three share the same
 * flags and detach routine and differ only in the last member.
 * NOTE(review): positional initializers -- presumably the member order is
 * (flags, attach, detach, event); confirm against struct filterops.
 */
static struct filterops hammerread_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_hammerdetach, filt_hammerread };
static struct filterops hammerwrite_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_hammerdetach, filt_hammerwrite };
static struct filterops hammervnode_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_hammerdetach, filt_hammervnode };

/*
 * Attach a knote to a HAMMER vnode.
 *
 * Selects the filterops table for EVFILT_READ, EVFILT_WRITE or
 * EVFILT_VNODE, stores the vnode in kn_hook (the filter callbacks read it
 * back from there) and inserts the knote on the vnode's note list.
 *
 * Returns 0 on success or EOPNOTSUPP for any other filter type, in which
 * case the knote is left untouched.
 */
static
int
hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct knote *kn = ap->a_kn;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &hammerread_filtops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &hammerwrite_filtops;
		break;
	case EVFILT_VNODE:
		kn->kn_fop = &hammervnode_filtops;
		break;
	default:
		return (EOPNOTSUPP);
	}

	kn->kn_hook = (caddr_t)vp;

	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);

	return(0);
}

/*
 * Detach: remove the knote from the note list of the vnode that
 * hammer_vop_kqfilter() stored in kn_hook.
 */
static void
filt_hammerdetach(struct knote *kn)
{
	struct vnode *vp = (void *)kn->kn_hook;

	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
}

/*
 * EVFILT_READ event routine.
 *
 * On NOTE_REVOKE the knote is flagged EV_EOF | EV_NODATA | EV_ONESHOT and
 * 1 is returned.  Otherwise kn_data is set to the inode size minus the
 * file's current offset, capped at INTPTR_MAX.  Returns 1 unconditionally
 * when NOTE_OLDAPI is set in kn_sfflags, else non-zero iff kn_data is
 * non-zero.
 */
static int
filt_hammerread(struct knote *kn, long hint)
{
	struct vnode *vp = (void *)kn->kn_hook;
	hammer_inode_t ip = VTOI(vp);
	hammer_mount_t hmp = ip->hmp;
	off_t off;

	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
		return(1);
	}
	/* the size/offset sample is taken while holding the fs_token */
	lwkt_gettoken(&hmp->fs_token);	/* XXX use per-ip-token */
	off = ip->ino_data.size - kn->kn_fp->f_offset;
	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
	lwkt_reltoken(&hmp->fs_token);
	if (kn->kn_sfflags & NOTE_OLDAPI)
		return(1);
	return (kn->kn_data != 0);
}

/*
 * EVFILT_WRITE event routine.  Always reports the event as active
 * (returns 1) with kn_data set to 0; on NOTE_REVOKE it additionally
 * flags the knote EV_EOF | EV_NODATA | EV_ONESHOT.
 */
static int
filt_hammerwrite(struct knote *kn, long hint)
{
	if (hint == NOTE_REVOKE)
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
	kn->kn_data = 0;
	return (1);
}

/*
 * EVFILT_VNODE event routine.  If the hint shares any bit with the
 * knote's kn_sfflags mask, the hint is OR'ed into kn_fflags.
 * NOTE_REVOKE flags the knote EV_EOF | EV_NODATA and returns 1;
 * otherwise the result is non-zero iff any kn_fflags bit is set.
 */
static int
filt_hammervnode(struct knote *kn, long hint)
{
	if (kn->kn_sfflags & hint)
		kn->kn_fflags |= hint;
	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA);
		return (1);
	}
	return (kn->kn_fflags != 0);
}
