1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/mountctl.h> 36 #include <sys/namecache.h> 37 #include <sys/buf2.h> 38 #include <vfs/fifofs/fifo.h> 39 40 #include "hammer.h" 41 42 /* 43 * USERFS VNOPS 44 */ 45 static int hammer_vop_fsync(struct vop_fsync_args *); 46 static int hammer_vop_read(struct vop_read_args *); 47 static int hammer_vop_write(struct vop_write_args *); 48 static int hammer_vop_access(struct vop_access_args *); 49 static int hammer_vop_advlock(struct vop_advlock_args *); 50 static int hammer_vop_close(struct vop_close_args *); 51 static int hammer_vop_ncreate(struct vop_ncreate_args *); 52 static int hammer_vop_getattr(struct vop_getattr_args *); 53 static int hammer_vop_nresolve(struct vop_nresolve_args *); 54 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 55 static int hammer_vop_nlink(struct vop_nlink_args *); 56 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 57 static int hammer_vop_nmknod(struct vop_nmknod_args *); 58 static int hammer_vop_open(struct vop_open_args *); 59 static int hammer_vop_print(struct vop_print_args *); 60 static int hammer_vop_readdir(struct vop_readdir_args *); 61 static int hammer_vop_readlink(struct vop_readlink_args *); 62 static int hammer_vop_nremove(struct vop_nremove_args *); 63 static int hammer_vop_nrename(struct vop_nrename_args *); 64 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 65 static int hammer_vop_markatime(struct vop_markatime_args *); 66 static int hammer_vop_setattr(struct vop_setattr_args *); 67 static int hammer_vop_strategy(struct vop_strategy_args *); 68 static int hammer_vop_bmap(struct vop_bmap_args *ap); 69 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 70 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 71 static int hammer_vop_ioctl(struct vop_ioctl_args *); 72 static int hammer_vop_mountctl(struct vop_mountctl_args *); 73 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 74 75 static int hammer_vop_fifoclose (struct vop_close_args *); 76 static int hammer_vop_fiforead (struct vop_read_args *); 77 static int hammer_vop_fifowrite (struct vop_write_args *); 78 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 79 80 struct vop_ops hammer_vnode_vops = { 81 .vop_default = vop_defaultop, 82 .vop_fsync = hammer_vop_fsync, 83 .vop_getpages = vop_stdgetpages, 84 .vop_putpages = vop_stdputpages, 85 .vop_read = hammer_vop_read, 86 .vop_write = hammer_vop_write, 87 .vop_access = hammer_vop_access, 88 .vop_advlock = hammer_vop_advlock, 89 .vop_close = hammer_vop_close, 90 .vop_ncreate = hammer_vop_ncreate, 91 .vop_getattr = hammer_vop_getattr, 92 .vop_inactive = hammer_vop_inactive, 93 .vop_reclaim = hammer_vop_reclaim, 94 .vop_nresolve = hammer_vop_nresolve, 95 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 96 .vop_nlink = hammer_vop_nlink, 97 .vop_nmkdir = hammer_vop_nmkdir, 98 .vop_nmknod = hammer_vop_nmknod, 99 .vop_open = hammer_vop_open, 100 .vop_pathconf = vop_stdpathconf, 101 .vop_print = hammer_vop_print, 102 .vop_readdir = hammer_vop_readdir, 103 .vop_readlink = hammer_vop_readlink, 104 .vop_nremove = hammer_vop_nremove, 105 .vop_nrename = hammer_vop_nrename, 106 .vop_nrmdir = hammer_vop_nrmdir, 107 .vop_markatime = hammer_vop_markatime, 108 .vop_setattr = hammer_vop_setattr, 109 .vop_bmap = hammer_vop_bmap, 110 .vop_strategy = hammer_vop_strategy, 111 .vop_nsymlink = hammer_vop_nsymlink, 112 .vop_nwhiteout = hammer_vop_nwhiteout, 113 .vop_ioctl = hammer_vop_ioctl, 114 .vop_mountctl = hammer_vop_mountctl, 115 .vop_kqfilter = hammer_vop_kqfilter 116 }; 117 118 struct vop_ops hammer_spec_vops = { 119 .vop_default = vop_defaultop, 120 .vop_fsync = hammer_vop_fsync, 121 .vop_read = vop_stdnoread, 122 .vop_write = vop_stdnowrite, 123 .vop_access = hammer_vop_access, 124 .vop_close = hammer_vop_close, 125 .vop_markatime = hammer_vop_markatime, 126 .vop_getattr = hammer_vop_getattr, 127 .vop_inactive = hammer_vop_inactive, 128 .vop_reclaim = hammer_vop_reclaim, 129 .vop_setattr = hammer_vop_setattr 130 }; 131 132 struct vop_ops hammer_fifo_vops = { 133 .vop_default = fifo_vnoperate, 134 .vop_fsync = hammer_vop_fsync, 135 .vop_read = hammer_vop_fiforead, 136 .vop_write = hammer_vop_fifowrite, 137 .vop_access = hammer_vop_access, 138 .vop_close = hammer_vop_fifoclose, 139 .vop_markatime = hammer_vop_markatime, 140 .vop_getattr = hammer_vop_getattr, 141 .vop_inactive = hammer_vop_inactive, 142 .vop_reclaim = hammer_vop_reclaim, 143 .vop_setattr = hammer_vop_setattr, 144 .vop_kqfilter = hammer_vop_fifokqfilter 145 }; 146 147 static __inline 148 void 149 hammer_knote(struct vnode *vp, int flags) 150 { 151 if (flags) 152 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 153 } 154 155 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 156 struct vnode *dvp, struct ucred *cred, 157 int flags, int isdir); 158 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 159 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 160 161 /* 162 * hammer_vop_fsync { vp, waitfor } 163 * 164 * fsync() an inode to disk and wait for it to be completely committed 165 * such that the information would not be undone if a crash occured after 166 * return. 167 * 168 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 169 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 170 * operation. 171 * 172 * Ultimately the combination of a REDO log and use of fast storage 173 * to front-end cluster caches will make fsync fast, but it aint 174 * here yet. And, in anycase, we need real transactional 175 * all-or-nothing features which are not restricted to a single file. 176 */ 177 static 178 int 179 hammer_vop_fsync(struct vop_fsync_args *ap) 180 { 181 hammer_inode_t ip = VTOI(ap->a_vp); 182 hammer_mount_t hmp = ip->hmp; 183 int waitfor = ap->a_waitfor; 184 int mode; 185 186 lwkt_gettoken(&hmp->fs_token); 187 188 /* 189 * Fsync rule relaxation (default is either full synchronous flush 190 * or REDO semantics with synchronous flush). 191 */ 192 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 193 switch(hammer_fsync_mode) { 194 case 0: 195 mode0: 196 /* no REDO, full synchronous flush */ 197 goto skip; 198 case 1: 199 mode1: 200 /* no REDO, full asynchronous flush */ 201 if (waitfor == MNT_WAIT) 202 waitfor = MNT_NOWAIT; 203 goto skip; 204 case 2: 205 /* REDO semantics, synchronous flush */ 206 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 207 goto mode0; 208 mode = HAMMER_FLUSH_UNDOS_AUTO; 209 break; 210 case 3: 211 /* REDO semantics, relaxed asynchronous flush */ 212 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 213 goto mode1; 214 mode = HAMMER_FLUSH_UNDOS_RELAXED; 215 if (waitfor == MNT_WAIT) 216 waitfor = MNT_NOWAIT; 217 break; 218 case 4: 219 /* ignore the fsync() system call */ 220 lwkt_reltoken(&hmp->fs_token); 221 return(0); 222 default: 223 /* we have to do something */ 224 mode = HAMMER_FLUSH_UNDOS_RELAXED; 225 if (waitfor == MNT_WAIT) 226 waitfor = MNT_NOWAIT; 227 break; 228 } 229 230 /* 231 * Fast fsync only needs to flush the UNDO/REDO fifo if 232 * HAMMER_INODE_REDO is non-zero and the only modifications 233 * made to the file are write or write-extends. 234 */ 235 if ((ip->flags & HAMMER_INODE_REDO) && 236 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) { 237 ++hammer_count_fsyncs; 238 hammer_flusher_flush_undos(hmp, mode); 239 ip->redo_count = 0; 240 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 241 vclrisdirty(ip->vp); 242 lwkt_reltoken(&hmp->fs_token); 243 return(0); 244 } 245 246 /* 247 * REDO is enabled by fsync(), the idea being we really only 248 * want to lay down REDO records when programs are using 249 * fsync() heavily. The first fsync() on the file starts 250 * the gravy train going and later fsync()s keep it hot by 251 * resetting the redo_count. 252 * 253 * We weren't running REDOs before now so we have to fall 254 * through and do a full fsync of what we have. 255 */ 256 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 257 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 258 ip->flags |= HAMMER_INODE_REDO; 259 ip->redo_count = 0; 260 } 261 } 262 skip: 263 264 /* 265 * Do a full flush sequence. 266 * 267 * Attempt to release the vnode while waiting for the inode to 268 * finish flushing. This can really mess up inactive->reclaim 269 * sequences so only do it if the vnode is active. 270 * 271 * WARNING! The VX lock functions must be used. vn_lock() will 272 * fail when this is part of a VOP_RECLAIM sequence. 273 */ 274 ++hammer_count_fsyncs; 275 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 276 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 277 if (waitfor == MNT_WAIT) { 278 int dorelock; 279 280 if ((ap->a_vp->v_flag & VRECLAIMED) == 0) { 281 vn_unlock(ap->a_vp); 282 dorelock = 1; 283 } else { 284 dorelock = 0; 285 } 286 hammer_wait_inode(ip); 287 if (dorelock) 288 vn_relock(ap->a_vp, LK_EXCLUSIVE); 289 } 290 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 291 vclrisdirty(ip->vp); 292 lwkt_reltoken(&hmp->fs_token); 293 return (ip->error); 294 } 295 296 /* 297 * hammer_vop_read { vp, uio, ioflag, cred } 298 * 299 * MPSAFE (for the cache safe does not require fs_token) 300 */ 301 static 302 int 303 hammer_vop_read(struct vop_read_args *ap) 304 { 305 struct hammer_transaction trans; 306 hammer_inode_t ip; 307 hammer_mount_t hmp; 308 off_t offset; 309 struct buf *bp; 310 struct uio *uio; 311 int error; 312 int n; 313 int seqcount; 314 int ioseqcount; 315 int blksize; 316 int bigread; 317 int got_trans; 318 size_t resid; 319 320 if (ap->a_vp->v_type == VDIR) 321 return (EISDIR); 322 if (ap->a_vp->v_type != VREG) 323 return (EINVAL); 324 ip = VTOI(ap->a_vp); 325 hmp = ip->hmp; 326 error = 0; 327 got_trans = 0; 328 uio = ap->a_uio; 329 330 /* 331 * Attempt to shortcut directly to the VM object using lwbufs. 332 * This is much faster than instantiating buffer cache buffers. 333 */ 334 resid = uio->uio_resid; 335 error = vop_helper_read_shortcut(ap); 336 hammer_stats_file_read += resid - uio->uio_resid; 337 if (error) 338 return (error); 339 if (uio->uio_resid == 0) 340 goto finished; 341 342 /* 343 * Allow the UIO's size to override the sequential heuristic. 344 */ 345 blksize = hammer_blocksize(uio->uio_offset); 346 seqcount = howmany(uio->uio_resid, MAXBSIZE); 347 ioseqcount = ap->a_ioflag >> IO_SEQSHIFT; 348 if (seqcount < ioseqcount) 349 seqcount = ioseqcount; 350 351 /* 352 * If reading or writing a huge amount of data we have to break 353 * atomicy and allow the operation to be interrupted by a signal 354 * or it can DOS the machine. 355 */ 356 bigread = (uio->uio_resid > 100 * 1024 * 1024); 357 358 /* 359 * Access the data typically in HAMMER_BUFSIZE blocks via the 360 * buffer cache, but HAMMER may use a variable block size based 361 * on the offset. 362 * 363 * XXX Temporary hack, delay the start transaction while we remain 364 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 365 * locked-shared. 366 */ 367 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 368 int64_t base_offset; 369 int64_t file_limit; 370 371 blksize = hammer_blocksize(uio->uio_offset); 372 offset = (int)uio->uio_offset & (blksize - 1); 373 base_offset = uio->uio_offset - offset; 374 375 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 376 break; 377 378 /* 379 * MPSAFE 380 */ 381 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 382 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 383 bp->b_flags &= ~B_AGE; 384 error = 0; 385 goto skip; 386 } 387 if (ap->a_ioflag & IO_NRDELAY) { 388 bqrelse(bp); 389 return (EWOULDBLOCK); 390 } 391 392 /* 393 * MPUNSAFE 394 */ 395 if (got_trans == 0) { 396 hammer_start_transaction(&trans, ip->hmp); 397 got_trans = 1; 398 } 399 400 /* 401 * NOTE: A valid bp has already been acquired, but was not 402 * B_CACHE. 403 */ 404 if (hammer_cluster_enable) { 405 /* 406 * Use file_limit to prevent cluster_read() from 407 * creating buffers of the wrong block size past 408 * the demarc. 409 */ 410 file_limit = ip->ino_data.size; 411 if (base_offset < HAMMER_XDEMARC && 412 file_limit > HAMMER_XDEMARC) { 413 file_limit = HAMMER_XDEMARC; 414 } 415 error = cluster_readx(ap->a_vp, 416 file_limit, base_offset, 417 blksize, B_NOTMETA, 418 uio->uio_resid, 419 seqcount * MAXBSIZE, 420 &bp); 421 } else { 422 error = breadnx(ap->a_vp, base_offset, 423 blksize, B_NOTMETA, 424 NULL, NULL, 0, &bp); 425 } 426 if (error) { 427 brelse(bp); 428 break; 429 } 430 skip: 431 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IOISSUED)) { 432 hdkprintf("zone2_offset %016jx read file %016jx@%016jx\n", 433 (intmax_t)bp->b_bio2.bio_offset, 434 (intmax_t)ip->obj_id, 435 (intmax_t)bp->b_loffset); 436 } 437 bp->b_flags &= ~B_IOISSUED; 438 if (blksize == HAMMER_XBUFSIZE) 439 bp->b_flags |= B_CLUSTEROK; 440 441 n = blksize - offset; 442 if (n > uio->uio_resid) 443 n = uio->uio_resid; 444 if (n > ip->ino_data.size - uio->uio_offset) 445 n = (int)(ip->ino_data.size - uio->uio_offset); 446 447 /* 448 * Set B_AGE, data has a lower priority than meta-data. 449 * 450 * Use a hold/unlock/drop sequence to run the uiomove 451 * with the buffer unlocked, avoiding deadlocks against 452 * read()s on mmap()'d spaces. 453 */ 454 bp->b_flags |= B_AGE; 455 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio); 456 bqrelse(bp); 457 458 if (error) 459 break; 460 hammer_stats_file_read += n; 461 } 462 463 finished: 464 465 /* 466 * Try to update the atime with just the inode lock for maximum 467 * concurrency. If we can't shortcut it we have to get the full 468 * blown transaction. 469 */ 470 if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) { 471 hammer_start_transaction(&trans, ip->hmp); 472 got_trans = 1; 473 } 474 475 if (got_trans) { 476 if ((ip->flags & HAMMER_INODE_RO) == 0 && 477 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 478 lwkt_gettoken(&hmp->fs_token); 479 ip->ino_data.atime = trans.time; 480 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 481 hammer_done_transaction(&trans); 482 lwkt_reltoken(&hmp->fs_token); 483 } else { 484 hammer_done_transaction(&trans); 485 } 486 } 487 return (error); 488 } 489 490 /* 491 * hammer_vop_write { vp, uio, ioflag, cred } 492 */ 493 static 494 int 495 hammer_vop_write(struct vop_write_args *ap) 496 { 497 struct hammer_transaction trans; 498 hammer_inode_t ip; 499 hammer_mount_t hmp; 500 thread_t td; 501 struct vnode *vp; 502 struct uio *uio; 503 int offset; 504 off_t base_offset; 505 int64_t cluster_eof; 506 struct buf *bp; 507 int kflags; 508 int error; 509 int n; 510 int flags; 511 int seqcount; 512 int bigwrite; 513 514 vp = ap->a_vp; 515 if (vp->v_type != VREG) 516 return (EINVAL); 517 ip = VTOI(ap->a_vp); 518 hmp = ip->hmp; 519 error = 0; 520 kflags = 0; 521 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 522 523 if (ip->flags & HAMMER_INODE_RO) 524 return (EROFS); 525 526 /* 527 * Create a transaction to cover the operations we perform. 528 */ 529 hammer_start_transaction(&trans, hmp); 530 uio = ap->a_uio; 531 532 /* 533 * Use v_lastwrite_ts if file not open for writing 534 * (i.e. a late msync) 535 */ 536 if (uio->uio_segflg == UIO_NOCOPY) { 537 if (vp->v_flag & VLASTWRITETS) { 538 trans.time = vp->v_lastwrite_ts.tv_sec * 1000000 + 539 vp->v_lastwrite_ts.tv_nsec / 1000; 540 } else { 541 trans.time = ip->ino_data.mtime; 542 } 543 } else { 544 vclrflags(vp, VLASTWRITETS); 545 } 546 547 /* 548 * Check append mode 549 */ 550 if (ap->a_ioflag & IO_APPEND) 551 uio->uio_offset = ip->ino_data.size; 552 553 /* 554 * Check for illegal write offsets. Valid range is 0...2^63-1. 555 * 556 * NOTE: the base_off assignment is required to work around what 557 * I consider to be a GCC-4 optimization bug. 558 */ 559 if (uio->uio_offset < 0) { 560 hammer_done_transaction(&trans); 561 return (EFBIG); 562 } 563 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 564 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 565 hammer_done_transaction(&trans); 566 return (EFBIG); 567 } 568 569 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 570 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 571 hammer_done_transaction(&trans); 572 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 573 return (EFBIG); 574 } 575 576 /* 577 * If reading or writing a huge amount of data we have to break 578 * atomicy and allow the operation to be interrupted by a signal 579 * or it can DOS the machine. 580 * 581 * Preset redo_count so we stop generating REDOs earlier if the 582 * limit is exceeded. 583 * 584 * redo_count is heuristical, SMP races are ok 585 */ 586 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 587 if ((ip->flags & HAMMER_INODE_REDO) && 588 ip->redo_count < hammer_limit_redo) { 589 ip->redo_count += uio->uio_resid; 590 } 591 592 /* 593 * Access the data typically in HAMMER_BUFSIZE blocks via the 594 * buffer cache, but HAMMER may use a variable block size based 595 * on the offset. 596 */ 597 while (uio->uio_resid > 0) { 598 int fixsize = 0; 599 int blksize; 600 int blkmask; 601 int trivial; 602 int endofblk; 603 off_t nsize; 604 605 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 606 break; 607 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 608 break; 609 610 blksize = hammer_blocksize(uio->uio_offset); 611 612 /* 613 * Control the number of pending records associated with 614 * this inode. If too many have accumulated start a 615 * flush. Try to maintain a pipeline with the flusher. 616 * 617 * NOTE: It is possible for other sources to grow the 618 * records but not necessarily issue another flush, 619 * so use a timeout and ensure that a re-flush occurs. 620 */ 621 if (ip->rsv_recs >= hammer_limit_inode_recs) { 622 lwkt_gettoken(&hmp->fs_token); 623 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 624 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 625 ip->flags |= HAMMER_INODE_RECSW; 626 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 627 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 628 } 629 lwkt_reltoken(&hmp->fs_token); 630 } 631 632 /* 633 * Do not allow HAMMER to blow out the buffer cache. Very 634 * large UIOs can lockout other processes due to bwillwrite() 635 * mechanics. 636 * 637 * The hammer inode is not locked during these operations. 638 * The vnode is locked which can interfere with the pageout 639 * daemon for non-UIO_NOCOPY writes but should not interfere 640 * with the buffer cache. Even so, we cannot afford to 641 * allow the pageout daemon to build up too many dirty buffer 642 * cache buffers. 643 * 644 * Only call this if we aren't being recursively called from 645 * a virtual disk device (vn), else we may deadlock. 646 */ 647 if ((ap->a_ioflag & IO_RECURSE) == 0) 648 bwillwrite(blksize); 649 650 /* 651 * Calculate the blocksize at the current offset and figure 652 * out how much we can actually write. 653 */ 654 blkmask = blksize - 1; 655 offset = (int)uio->uio_offset & blkmask; 656 base_offset = uio->uio_offset & ~(int64_t)blkmask; 657 n = blksize - offset; 658 if (n > uio->uio_resid) { 659 n = uio->uio_resid; 660 endofblk = 0; 661 } else { 662 endofblk = 1; 663 } 664 nsize = uio->uio_offset + n; 665 if (nsize > ip->ino_data.size) { 666 if (uio->uio_offset > ip->ino_data.size) 667 trivial = 0; 668 else 669 trivial = NVEXTF_TRIVIAL; 670 nvextendbuf(ap->a_vp, 671 ip->ino_data.size, 672 nsize, 673 hammer_blocksize(ip->ino_data.size), 674 hammer_blocksize(nsize), 675 hammer_blockoff(ip->ino_data.size), 676 hammer_blockoff(nsize), 677 trivial); 678 fixsize = 1; 679 kflags |= NOTE_EXTEND; 680 } 681 682 if (uio->uio_segflg == UIO_NOCOPY) { 683 /* 684 * Issuing a write with the same data backing the 685 * buffer. Instantiate the buffer to collect the 686 * backing vm pages, then read-in any missing bits. 687 * 688 * This case is used by vop_stdputpages(). 689 */ 690 bp = getblk(ap->a_vp, base_offset, 691 blksize, GETBLK_BHEAVY, 0); 692 if ((bp->b_flags & B_CACHE) == 0) { 693 bqrelse(bp); 694 error = bread(ap->a_vp, base_offset, 695 blksize, &bp); 696 } 697 } else if (offset == 0 && uio->uio_resid >= blksize) { 698 /* 699 * Even though we are entirely overwriting the buffer 700 * we may still have to zero it out to avoid a 701 * mmap/write visibility issue. 702 */ 703 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 704 if ((bp->b_flags & B_CACHE) == 0) 705 vfs_bio_clrbuf(bp); 706 } else if (base_offset >= ip->ino_data.size) { 707 /* 708 * If the base offset of the buffer is beyond the 709 * file EOF, we don't have to issue a read. 710 */ 711 bp = getblk(ap->a_vp, base_offset, 712 blksize, GETBLK_BHEAVY, 0); 713 vfs_bio_clrbuf(bp); 714 } else { 715 /* 716 * Partial overwrite, read in any missing bits then 717 * replace the portion being written. 718 */ 719 error = bread(ap->a_vp, base_offset, blksize, &bp); 720 if (error == 0) 721 bheavy(bp); 722 } 723 if (error == 0) 724 error = uiomovebp(bp, bp->b_data + offset, n, uio); 725 726 lwkt_gettoken(&hmp->fs_token); 727 728 /* 729 * Generate REDO records if enabled and redo_count will not 730 * exceeded the limit. 731 * 732 * If redo_count exceeds the limit we stop generating records 733 * and clear HAMMER_INODE_REDO. This will cause the next 734 * fsync() to do a full meta-data sync instead of just an 735 * UNDO/REDO fifo update. 736 * 737 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 738 * will still be tracked. The tracks will be terminated 739 * when the related meta-data (including possible data 740 * modifications which are not tracked via REDO) is 741 * flushed. 742 */ 743 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 744 if (ip->redo_count < hammer_limit_redo) { 745 bp->b_flags |= B_VFSFLAG1; 746 error = hammer_generate_redo(&trans, ip, 747 base_offset + offset, 748 HAMMER_REDO_WRITE, 749 bp->b_data + offset, 750 (size_t)n); 751 } else { 752 ip->flags &= ~HAMMER_INODE_REDO; 753 } 754 } 755 756 /* 757 * If we screwed up we have to undo any VM size changes we 758 * made. 759 */ 760 if (error) { 761 brelse(bp); 762 if (fixsize) { 763 nvtruncbuf(ap->a_vp, ip->ino_data.size, 764 hammer_blocksize(ip->ino_data.size), 765 hammer_blockoff(ip->ino_data.size), 766 0); 767 } 768 lwkt_reltoken(&hmp->fs_token); 769 break; 770 } 771 kflags |= NOTE_WRITE; 772 hammer_stats_file_write += n; 773 if (blksize == HAMMER_XBUFSIZE) 774 bp->b_flags |= B_CLUSTEROK; 775 if (ip->ino_data.size < uio->uio_offset) { 776 ip->ino_data.size = uio->uio_offset; 777 flags = HAMMER_INODE_SDIRTY; 778 } else { 779 flags = 0; 780 } 781 ip->ino_data.mtime = trans.time; 782 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 783 hammer_modify_inode(&trans, ip, flags); 784 785 /* 786 * Once we dirty the buffer any cached zone-X offset 787 * becomes invalid. HAMMER NOTE: no-history mode cannot 788 * allow overwriting over the same data sector unless 789 * we provide UNDOs for the old data, which we don't. 790 */ 791 bp->b_bio2.bio_offset = NOOFFSET; 792 793 lwkt_reltoken(&hmp->fs_token); 794 795 /* 796 * Final buffer disposition. 797 * 798 * Because meta-data updates are deferred, HAMMER is 799 * especially sensitive to excessive bdwrite()s because 800 * the I/O stream is not broken up by disk reads. So the 801 * buffer cache simply cannot keep up. 802 * 803 * WARNING! blksize is variable. cluster_write() is 804 * expected to not blow up if it encounters 805 * buffers that do not match the passed blksize. 806 * 807 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 808 * The ip->rsv_recs check should burst-flush the data. 809 * If we queue it immediately the buf could be left 810 * locked on the device queue for a very long time. 811 * 812 * However, failing to flush a dirty buffer out when 813 * issued from the pageout daemon can result in a low 814 * memory deadlock against bio_page_alloc(), so we 815 * have to bawrite() on IO_ASYNC as well. 816 * 817 * NOTE! To avoid degenerate stalls due to mismatched block 818 * sizes we only honor IO_DIRECT on the write which 819 * abuts the end of the buffer. However, we must 820 * honor IO_SYNC in case someone is silly enough to 821 * configure a HAMMER file as swap, or when HAMMER 822 * is serving NFS (for commits). Ick ick. 823 */ 824 bp->b_flags |= B_AGE; 825 if (blksize == HAMMER_XBUFSIZE) 826 bp->b_flags |= B_CLUSTEROK; 827 828 if (ap->a_ioflag & IO_SYNC) { 829 bwrite(bp); 830 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 831 bawrite(bp); 832 } else if (ap->a_ioflag & IO_ASYNC) { 833 bawrite(bp); 834 } else if (hammer_cluster_enable && 835 !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 836 if (base_offset < HAMMER_XDEMARC) 837 cluster_eof = hammer_blockdemarc(base_offset, 838 ip->ino_data.size); 839 else 840 cluster_eof = ip->ino_data.size; 841 cluster_write(bp, cluster_eof, blksize, seqcount); 842 } else { 843 bdwrite(bp); 844 } 845 } 846 hammer_done_transaction(&trans); 847 hammer_knote(ap->a_vp, kflags); 848 849 return (error); 850 } 851 852 /* 853 * hammer_vop_access { vp, mode, cred } 854 * 855 * MPSAFE - does not require fs_token 856 */ 857 static 858 int 859 hammer_vop_access(struct vop_access_args *ap) 860 { 861 hammer_inode_t ip = VTOI(ap->a_vp); 862 uid_t uid; 863 gid_t gid; 864 int error; 865 866 uid = hammer_to_unix_xid(&ip->ino_data.uid); 867 gid = hammer_to_unix_xid(&ip->ino_data.gid); 868 869 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 870 ip->ino_data.uflags); 871 return (error); 872 } 873 874 /* 875 * hammer_vop_advlock { vp, id, op, fl, flags } 876 * 877 * MPSAFE - does not require fs_token 878 */ 879 static 880 int 881 hammer_vop_advlock(struct vop_advlock_args *ap) 882 { 883 hammer_inode_t ip = VTOI(ap->a_vp); 884 885 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 886 } 887 888 /* 889 * hammer_vop_close { vp, fflag } 890 * 891 * We can only sync-on-close for normal closes. XXX disabled for now. 892 */ 893 static 894 int 895 hammer_vop_close(struct vop_close_args *ap) 896 { 897 #if 0 898 struct vnode *vp = ap->a_vp; 899 hammer_inode_t ip = VTOI(vp); 900 int waitfor; 901 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 902 if (vn_islocked(vp) == LK_EXCLUSIVE && 903 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 904 if (ip->flags & HAMMER_INODE_CLOSESYNC) 905 waitfor = MNT_WAIT; 906 else 907 waitfor = MNT_NOWAIT; 908 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 909 HAMMER_INODE_CLOSEASYNC); 910 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 911 } 912 } 913 #endif 914 return (vop_stdclose(ap)); 915 } 916 917 /* 918 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 919 * 920 * The operating system has already ensured that the directory entry 921 * does not exist and done all appropriate namespace locking. 922 */ 923 static 924 int 925 hammer_vop_ncreate(struct vop_ncreate_args *ap) 926 { 927 struct hammer_transaction trans; 928 hammer_inode_t dip; 929 hammer_inode_t nip; 930 struct nchandle *nch; 931 hammer_mount_t hmp; 932 int error; 933 934 nch = ap->a_nch; 935 dip = VTOI(ap->a_dvp); 936 hmp = dip->hmp; 937 938 if (dip->flags & HAMMER_INODE_RO) 939 return (EROFS); 940 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 941 return (error); 942 943 /* 944 * Create a transaction to cover the operations we perform. 945 */ 946 lwkt_gettoken(&hmp->fs_token); 947 hammer_start_transaction(&trans, hmp); 948 949 /* 950 * Create a new filesystem object of the requested type. The 951 * returned inode will be referenced and shared-locked to prevent 952 * it from being moved to the flusher. 953 */ 954 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 955 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 956 NULL, &nip); 957 if (error) { 958 hkprintf("hammer_create_inode error %d\n", error); 959 hammer_done_transaction(&trans); 960 *ap->a_vpp = NULL; 961 lwkt_reltoken(&hmp->fs_token); 962 return (error); 963 } 964 965 /* 966 * Add the new filesystem object to the directory. This will also 967 * bump the inode's link count. 968 */ 969 error = hammer_ip_add_direntry(&trans, dip, 970 nch->ncp->nc_name, nch->ncp->nc_nlen, 971 nip); 972 if (error) 973 hkprintf("hammer_ip_add_direntry error %d\n", error); 974 975 /* 976 * Finish up. 977 */ 978 if (error) { 979 hammer_rel_inode(nip, 0); 980 hammer_done_transaction(&trans); 981 *ap->a_vpp = NULL; 982 } else { 983 error = hammer_get_vnode(nip, ap->a_vpp); 984 hammer_done_transaction(&trans); 985 hammer_rel_inode(nip, 0); 986 if (error == 0) { 987 cache_setunresolved(ap->a_nch); 988 cache_setvp(ap->a_nch, *ap->a_vpp); 989 } 990 hammer_knote(ap->a_dvp, NOTE_WRITE); 991 } 992 lwkt_reltoken(&hmp->fs_token); 993 return (error); 994 } 995 996 /* 997 * hammer_vop_getattr { vp, vap } 998 * 999 * Retrieve an inode's attribute information. When accessing inodes 1000 * historically we fake the atime field to ensure consistent results. 1001 * The atime field is stored in the B-Tree element and allowed to be 1002 * updated without cycling the element. 1003 * 1004 * MPSAFE - does not require fs_token 1005 */ 1006 static 1007 int 1008 hammer_vop_getattr(struct vop_getattr_args *ap) 1009 { 1010 hammer_inode_t ip = VTOI(ap->a_vp); 1011 struct vattr *vap = ap->a_vap; 1012 1013 /* 1014 * We want the fsid to be different when accessing a filesystem 1015 * with different as-of's so programs like diff don't think 1016 * the files are the same. 1017 * 1018 * We also want the fsid to be the same when comparing snapshots, 1019 * or when comparing mirrors (which might be backed by different 1020 * physical devices). HAMMER fsids are based on the PFS's 1021 * shared_uuid field. 1022 * 1023 * XXX there is a chance of collision here. The va_fsid reported 1024 * by stat is different from the more involved fsid used in the 1025 * mount structure. 1026 */ 1027 hammer_lock_sh(&ip->lock); 1028 vap->va_fsid = ip->pfsm->fsid_udev ^ (uint32_t)ip->obj_asof ^ 1029 (uint32_t)(ip->obj_asof >> 32); 1030 1031 vap->va_fileid = ip->ino_leaf.base.obj_id; 1032 vap->va_mode = ip->ino_data.mode; 1033 vap->va_nlink = ip->ino_data.nlinks; 1034 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1035 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1036 vap->va_rmajor = 0; 1037 vap->va_rminor = 0; 1038 vap->va_size = ip->ino_data.size; 1039 1040 /* 1041 * Special case for @@PFS softlinks. The actual size of the 1042 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1043 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1044 * 1045 * Note that userspace hammer command does not allow users to 1046 * create a @@PFS softlink under an existing other PFS (id!=0) 1047 * so the ip localization here for @@PFS softlink is always 0. 1048 */ 1049 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1050 ip->ino_data.size == 10 && 1051 ip->obj_asof == HAMMER_MAX_TID && 1052 ip->obj_localization == HAMMER_DEF_LOCALIZATION && 1053 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1054 if (hammer_is_pfs_slave(&ip->pfsm->pfsd)) 1055 vap->va_size = 26; 1056 else 1057 vap->va_size = 10; 1058 } 1059 1060 /* 1061 * We must provide a consistent atime and mtime for snapshots 1062 * so people can do a 'tar cf - ... | md5' on them and get 1063 * consistent results. 1064 */ 1065 if (ip->flags & HAMMER_INODE_RO) { 1066 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1067 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1068 } else { 1069 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1070 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1071 } 1072 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1073 vap->va_flags = ip->ino_data.uflags; 1074 vap->va_gen = 1; /* hammer inums are unique for all time */ 1075 vap->va_blocksize = HAMMER_BUFSIZE; 1076 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1077 vap->va_bytes = HAMMER_XBUFSIZE64_DOALIGN(ip->ino_data.size); 1078 } else if (ip->ino_data.size > HAMMER_HBUFSIZE) { 1079 vap->va_bytes = HAMMER_BUFSIZE64_DOALIGN(ip->ino_data.size); 1080 } else { 1081 vap->va_bytes = HAMMER_DATA_DOALIGN(ip->ino_data.size); 1082 } 1083 1084 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1085 vap->va_filerev = 0; /* XXX */ 1086 vap->va_uid_uuid = ip->ino_data.uid; 1087 vap->va_gid_uuid = ip->ino_data.gid; 1088 vap->va_fsid_uuid = ip->hmp->fsid; 1089 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1090 VA_FSID_UUID_VALID; 1091 1092 switch (ip->ino_data.obj_type) { 1093 case HAMMER_OBJTYPE_CDEV: 1094 case HAMMER_OBJTYPE_BDEV: 1095 vap->va_rmajor = ip->ino_data.rmajor; 1096 vap->va_rminor = ip->ino_data.rminor; 1097 break; 1098 default: 1099 break; 1100 } 1101 hammer_unlock(&ip->lock); 1102 return(0); 1103 } 1104 1105 /* 1106 * hammer_vop_nresolve { nch, dvp, cred } 1107 * 1108 * Locate the requested directory entry. 1109 */ 1110 static 1111 int 1112 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1113 { 1114 struct hammer_transaction trans; 1115 struct namecache *ncp; 1116 hammer_mount_t hmp; 1117 hammer_inode_t dip; 1118 hammer_inode_t ip; 1119 hammer_tid_t asof; 1120 struct hammer_cursor cursor; 1121 struct vnode *vp; 1122 int64_t namekey; 1123 int error; 1124 int i; 1125 int nlen; 1126 int flags; 1127 int ispfs; 1128 int64_t obj_id; 1129 uint32_t localization; 1130 uint32_t max_iterations; 1131 1132 /* 1133 * Misc initialization, plus handle as-of name extensions. Look for 1134 * the '@@' extension. Note that as-of files and directories cannot 1135 * be modified. 1136 */ 1137 dip = VTOI(ap->a_dvp); 1138 ncp = ap->a_nch->ncp; 1139 asof = dip->obj_asof; 1140 localization = dip->obj_localization; /* for code consistency */ 1141 nlen = ncp->nc_nlen; 1142 flags = dip->flags & HAMMER_INODE_RO; 1143 ispfs = 0; 1144 hmp = dip->hmp; 1145 1146 lwkt_gettoken(&hmp->fs_token); 1147 hammer_simple_transaction(&trans, hmp); 1148 1149 for (i = 0; i < nlen; ++i) { 1150 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1151 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1152 &ispfs, &asof, &localization); 1153 if (error != 0) { 1154 i = nlen; 1155 break; 1156 } 1157 if (asof != HAMMER_MAX_TID) 1158 flags |= HAMMER_INODE_RO; 1159 break; 1160 } 1161 } 1162 nlen = i; 1163 1164 /* 1165 * If this is a PFS we dive into the PFS root inode 1166 */ 1167 if (ispfs && nlen == 0) { 1168 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1169 asof, localization, 1170 flags, &error); 1171 if (error == 0) { 1172 error = hammer_get_vnode(ip, &vp); 1173 hammer_rel_inode(ip, 0); 1174 } else { 1175 vp = NULL; 1176 } 1177 if (error == 0) { 1178 vn_unlock(vp); 1179 cache_setvp(ap->a_nch, vp); 1180 vrele(vp); 1181 } 1182 goto done; 1183 } 1184 1185 /* 1186 * If there is no path component the time extension is relative to dip. 1187 * e.g. "fubar/@@<snapshot>" 1188 * 1189 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1190 * e.g. "fubar/.@@<snapshot>" 1191 * 1192 * ".." is handled by the kernel. We do not currently handle 1193 * "..@<snapshot>". 1194 */ 1195 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1196 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1197 asof, dip->obj_localization, 1198 flags, &error); 1199 if (error == 0) { 1200 error = hammer_get_vnode(ip, &vp); 1201 hammer_rel_inode(ip, 0); 1202 } else { 1203 vp = NULL; 1204 } 1205 if (error == 0) { 1206 vn_unlock(vp); 1207 cache_setvp(ap->a_nch, vp); 1208 vrele(vp); 1209 } 1210 goto done; 1211 } 1212 1213 /* 1214 * Calculate the namekey and setup the key range for the scan. This 1215 * works kinda like a chained hash table where the lower 32 bits 1216 * of the namekey synthesize the chain. 1217 * 1218 * The key range is inclusive of both key_beg and key_end. 1219 */ 1220 namekey = hammer_direntry_namekey(dip, ncp->nc_name, nlen, 1221 &max_iterations); 1222 1223 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1224 cursor.key_beg.localization = dip->obj_localization | 1225 hammer_dir_localization(dip); 1226 cursor.key_beg.obj_id = dip->obj_id; 1227 cursor.key_beg.key = namekey; 1228 cursor.key_beg.create_tid = 0; 1229 cursor.key_beg.delete_tid = 0; 1230 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1231 cursor.key_beg.obj_type = 0; 1232 1233 cursor.key_end = cursor.key_beg; 1234 cursor.key_end.key += max_iterations; 1235 cursor.asof = asof; 1236 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1237 1238 /* 1239 * Scan all matching records (the chain), locate the one matching 1240 * the requested path component. 1241 * 1242 * The hammer_ip_*() functions merge in-memory records with on-disk 1243 * records for the purposes of the search. 1244 */ 1245 obj_id = 0; 1246 localization = HAMMER_DEF_LOCALIZATION; 1247 1248 if (error == 0) { 1249 error = hammer_ip_first(&cursor); 1250 while (error == 0) { 1251 error = hammer_ip_resolve_data(&cursor); 1252 if (error) 1253 break; 1254 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1255 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1256 obj_id = cursor.data->entry.obj_id; 1257 localization = cursor.data->entry.localization; 1258 break; 1259 } 1260 error = hammer_ip_next(&cursor); 1261 } 1262 } 1263 hammer_done_cursor(&cursor); 1264 1265 /* 1266 * Lookup the obj_id. This should always succeed. If it does not 1267 * the filesystem may be damaged and we return a dummy inode. 1268 */ 1269 if (error == 0) { 1270 ip = hammer_get_inode(&trans, dip, obj_id, 1271 asof, localization, 1272 flags, &error); 1273 if (error == ENOENT) { 1274 hkprintf("WARNING: Missing inode for dirent \"%s\"\n" 1275 "\tobj_id = %016jx, asof=%016jx, lo=%08x\n", 1276 ncp->nc_name, 1277 (intmax_t)obj_id, (intmax_t)asof, 1278 localization); 1279 error = 0; 1280 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1281 asof, localization, 1282 flags, &error); 1283 } 1284 if (error == 0) { 1285 error = hammer_get_vnode(ip, &vp); 1286 hammer_rel_inode(ip, 0); 1287 } else { 1288 vp = NULL; 1289 } 1290 if (error == 0) { 1291 vn_unlock(vp); 1292 cache_setvp(ap->a_nch, vp); 1293 vrele(vp); 1294 } 1295 } else if (error == ENOENT) { 1296 cache_setvp(ap->a_nch, NULL); 1297 } 1298 done: 1299 hammer_done_transaction(&trans); 1300 lwkt_reltoken(&hmp->fs_token); 1301 return (error); 1302 } 1303 1304 /* 1305 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1306 * 1307 * Locate the parent directory of a directory vnode. 1308 * 1309 * dvp is referenced but not locked. *vpp must be returned referenced and 1310 * locked. A parent_obj_id of 0 indicates that we are at the root. 1311 * 1312 * NOTE: as-of sequences are not linked into the directory structure. If 1313 * we are at the root with a different asof then the mount point, reload 1314 * the same directory with the mount point's asof. I'm not sure what this 1315 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1316 * get confused, but it hasn't been tested. 1317 */ 1318 static 1319 int 1320 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1321 { 1322 struct hammer_transaction trans; 1323 hammer_inode_t dip; 1324 hammer_inode_t ip; 1325 hammer_mount_t hmp; 1326 int64_t parent_obj_id; 1327 uint32_t parent_obj_localization; 1328 hammer_tid_t asof; 1329 int error; 1330 1331 dip = VTOI(ap->a_dvp); 1332 asof = dip->obj_asof; 1333 hmp = dip->hmp; 1334 1335 /* 1336 * Whos are parent? This could be the root of a pseudo-filesystem 1337 * whos parent is in another localization domain. 1338 */ 1339 lwkt_gettoken(&hmp->fs_token); 1340 parent_obj_id = dip->ino_data.parent_obj_id; 1341 if (dip->obj_id == HAMMER_OBJID_ROOT) 1342 parent_obj_localization = HAMMER_DEF_LOCALIZATION; 1343 else 1344 parent_obj_localization = dip->obj_localization; 1345 1346 /* 1347 * It's probably a PFS root when dip->ino_data.parent_obj_id is 0. 1348 */ 1349 if (parent_obj_id == 0) { 1350 if (dip->obj_id == HAMMER_OBJID_ROOT && 1351 asof != hmp->asof) { 1352 parent_obj_id = dip->obj_id; 1353 asof = hmp->asof; 1354 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1355 ksnprintf(*ap->a_fakename, 19, "0x%016jx", 1356 (intmax_t)dip->obj_asof); 1357 } else { 1358 *ap->a_vpp = NULL; 1359 lwkt_reltoken(&hmp->fs_token); 1360 return ENOENT; 1361 } 1362 } 1363 1364 hammer_simple_transaction(&trans, hmp); 1365 1366 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1367 asof, parent_obj_localization, 1368 dip->flags, &error); 1369 if (ip) { 1370 error = hammer_get_vnode(ip, ap->a_vpp); 1371 hammer_rel_inode(ip, 0); 1372 } else { 1373 *ap->a_vpp = NULL; 1374 } 1375 hammer_done_transaction(&trans); 1376 lwkt_reltoken(&hmp->fs_token); 1377 return (error); 1378 } 1379 1380 /* 1381 * hammer_vop_nlink { nch, dvp, vp, cred } 1382 */ 1383 static 1384 int 1385 hammer_vop_nlink(struct vop_nlink_args *ap) 1386 { 1387 struct hammer_transaction trans; 1388 hammer_inode_t dip; 1389 hammer_inode_t ip; 1390 struct nchandle *nch; 1391 hammer_mount_t hmp; 1392 int error; 1393 1394 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1395 return(EXDEV); 1396 1397 nch = ap->a_nch; 1398 dip = VTOI(ap->a_dvp); 1399 ip = VTOI(ap->a_vp); 1400 hmp = dip->hmp; 1401 1402 if (dip->obj_localization != ip->obj_localization) 1403 return(EXDEV); 1404 1405 if (dip->flags & HAMMER_INODE_RO) 1406 return (EROFS); 1407 if (ip->flags & HAMMER_INODE_RO) 1408 return (EROFS); 1409 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1410 return (error); 1411 1412 /* 1413 * Create a transaction to cover the operations we perform. 1414 */ 1415 lwkt_gettoken(&hmp->fs_token); 1416 hammer_start_transaction(&trans, hmp); 1417 1418 /* 1419 * Add the filesystem object to the directory. Note that neither 1420 * dip nor ip are referenced or locked, but their vnodes are 1421 * referenced. This function will bump the inode's link count. 1422 */ 1423 error = hammer_ip_add_direntry(&trans, dip, 1424 nch->ncp->nc_name, nch->ncp->nc_nlen, 1425 ip); 1426 1427 /* 1428 * Finish up. 1429 */ 1430 if (error == 0) { 1431 cache_setunresolved(nch); 1432 cache_setvp(nch, ap->a_vp); 1433 } 1434 hammer_done_transaction(&trans); 1435 hammer_knote(ap->a_vp, NOTE_LINK); 1436 hammer_knote(ap->a_dvp, NOTE_WRITE); 1437 lwkt_reltoken(&hmp->fs_token); 1438 return (error); 1439 } 1440 1441 /* 1442 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1443 * 1444 * The operating system has already ensured that the directory entry 1445 * does not exist and done all appropriate namespace locking. 1446 */ 1447 static 1448 int 1449 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1450 { 1451 struct hammer_transaction trans; 1452 hammer_inode_t dip; 1453 hammer_inode_t nip; 1454 struct nchandle *nch; 1455 hammer_mount_t hmp; 1456 int error; 1457 1458 nch = ap->a_nch; 1459 dip = VTOI(ap->a_dvp); 1460 hmp = dip->hmp; 1461 1462 if (dip->flags & HAMMER_INODE_RO) 1463 return (EROFS); 1464 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1465 return (error); 1466 1467 /* 1468 * Create a transaction to cover the operations we perform. 1469 */ 1470 lwkt_gettoken(&hmp->fs_token); 1471 hammer_start_transaction(&trans, hmp); 1472 1473 /* 1474 * Create a new filesystem object of the requested type. The 1475 * returned inode will be referenced but not locked. 1476 */ 1477 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1478 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1479 NULL, &nip); 1480 if (error) { 1481 hammer_done_transaction(&trans); 1482 *ap->a_vpp = NULL; 1483 lwkt_reltoken(&hmp->fs_token); 1484 return (error); 1485 } 1486 /* 1487 * Add the new filesystem object to the directory. This will also 1488 * bump the inode's link count. 1489 */ 1490 error = hammer_ip_add_direntry(&trans, dip, 1491 nch->ncp->nc_name, nch->ncp->nc_nlen, 1492 nip); 1493 if (error) 1494 hkprintf("hammer_mkdir (add) error %d\n", error); 1495 1496 /* 1497 * Finish up. 1498 */ 1499 if (error) { 1500 hammer_rel_inode(nip, 0); 1501 *ap->a_vpp = NULL; 1502 } else { 1503 error = hammer_get_vnode(nip, ap->a_vpp); 1504 hammer_rel_inode(nip, 0); 1505 if (error == 0) { 1506 cache_setunresolved(ap->a_nch); 1507 cache_setvp(ap->a_nch, *ap->a_vpp); 1508 } 1509 } 1510 hammer_done_transaction(&trans); 1511 if (error == 0) 1512 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1513 lwkt_reltoken(&hmp->fs_token); 1514 return (error); 1515 } 1516 1517 /* 1518 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1519 * 1520 * The operating system has already ensured that the directory entry 1521 * does not exist and done all appropriate namespace locking. 1522 */ 1523 static 1524 int 1525 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1526 { 1527 struct hammer_transaction trans; 1528 hammer_inode_t dip; 1529 hammer_inode_t nip; 1530 struct nchandle *nch; 1531 hammer_mount_t hmp; 1532 int error; 1533 1534 nch = ap->a_nch; 1535 dip = VTOI(ap->a_dvp); 1536 hmp = dip->hmp; 1537 1538 if (dip->flags & HAMMER_INODE_RO) 1539 return (EROFS); 1540 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1541 return (error); 1542 1543 /* 1544 * Create a transaction to cover the operations we perform. 1545 */ 1546 lwkt_gettoken(&hmp->fs_token); 1547 hammer_start_transaction(&trans, hmp); 1548 1549 /* 1550 * Create a new filesystem object of the requested type. The 1551 * returned inode will be referenced but not locked. 1552 * 1553 * If mknod specifies a directory a pseudo-fs is created. 1554 */ 1555 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1556 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1557 NULL, &nip); 1558 if (error) { 1559 hammer_done_transaction(&trans); 1560 *ap->a_vpp = NULL; 1561 lwkt_reltoken(&hmp->fs_token); 1562 return (error); 1563 } 1564 1565 /* 1566 * Add the new filesystem object to the directory. This will also 1567 * bump the inode's link count. 1568 */ 1569 error = hammer_ip_add_direntry(&trans, dip, 1570 nch->ncp->nc_name, nch->ncp->nc_nlen, 1571 nip); 1572 1573 /* 1574 * Finish up. 1575 */ 1576 if (error) { 1577 hammer_rel_inode(nip, 0); 1578 *ap->a_vpp = NULL; 1579 } else { 1580 error = hammer_get_vnode(nip, ap->a_vpp); 1581 hammer_rel_inode(nip, 0); 1582 if (error == 0) { 1583 cache_setunresolved(ap->a_nch); 1584 cache_setvp(ap->a_nch, *ap->a_vpp); 1585 } 1586 } 1587 hammer_done_transaction(&trans); 1588 if (error == 0) 1589 hammer_knote(ap->a_dvp, NOTE_WRITE); 1590 lwkt_reltoken(&hmp->fs_token); 1591 return (error); 1592 } 1593 1594 /* 1595 * hammer_vop_open { vp, mode, cred, fp } 1596 * 1597 * MPSAFE (does not require fs_token) 1598 */ 1599 static 1600 int 1601 hammer_vop_open(struct vop_open_args *ap) 1602 { 1603 hammer_inode_t ip; 1604 1605 ip = VTOI(ap->a_vp); 1606 1607 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1608 return (EROFS); 1609 return(vop_stdopen(ap)); 1610 } 1611 1612 /* 1613 * hammer_vop_print { vp } 1614 */ 1615 static 1616 int 1617 hammer_vop_print(struct vop_print_args *ap) 1618 { 1619 return EOPNOTSUPP; 1620 } 1621 1622 /* 1623 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1624 */ 1625 static 1626 int 1627 hammer_vop_readdir(struct vop_readdir_args *ap) 1628 { 1629 struct hammer_transaction trans; 1630 struct hammer_cursor cursor; 1631 hammer_inode_t ip; 1632 hammer_mount_t hmp; 1633 struct uio *uio; 1634 hammer_base_elm_t base; 1635 int error; 1636 int cookie_index; 1637 int ncookies; 1638 off_t *cookies; 1639 off_t saveoff; 1640 int r; 1641 int dtype; 1642 1643 ip = VTOI(ap->a_vp); 1644 uio = ap->a_uio; 1645 saveoff = uio->uio_offset; 1646 hmp = ip->hmp; 1647 1648 if (ap->a_ncookies) { 1649 ncookies = uio->uio_resid / 16 + 1; 1650 if (ncookies > 1024) 1651 ncookies = 1024; 1652 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1653 cookie_index = 0; 1654 } else { 1655 ncookies = -1; 1656 cookies = NULL; 1657 cookie_index = 0; 1658 } 1659 1660 lwkt_gettoken(&hmp->fs_token); 1661 hammer_simple_transaction(&trans, hmp); 1662 1663 /* 1664 * Handle artificial entries 1665 * 1666 * It should be noted that the minimum value for a directory 1667 * hash key on-media is 0x0000000100000000, so we can use anything 1668 * less then that to represent our 'special' key space. 1669 */ 1670 error = 0; 1671 if (saveoff == 0) { 1672 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1673 if (r) 1674 goto done; 1675 if (cookies) 1676 cookies[cookie_index] = saveoff; 1677 ++saveoff; 1678 ++cookie_index; 1679 if (cookie_index == ncookies) 1680 goto done; 1681 } 1682 if (saveoff == 1) { 1683 if (ip->ino_data.parent_obj_id) { 1684 r = vop_write_dirent(&error, uio, 1685 ip->ino_data.parent_obj_id, 1686 DT_DIR, 2, ".."); 1687 } else { 1688 r = vop_write_dirent(&error, uio, 1689 ip->obj_id, DT_DIR, 2, ".."); 1690 } 1691 if (r) 1692 goto done; 1693 if (cookies) 1694 cookies[cookie_index] = saveoff; 1695 ++saveoff; 1696 ++cookie_index; 1697 if (cookie_index == ncookies) 1698 goto done; 1699 } 1700 1701 /* 1702 * Key range (begin and end inclusive) to scan. Directory keys 1703 * directly translate to a 64 bit 'seek' position. 1704 */ 1705 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1706 cursor.key_beg.localization = ip->obj_localization | 1707 hammer_dir_localization(ip); 1708 cursor.key_beg.obj_id = ip->obj_id; 1709 cursor.key_beg.create_tid = 0; 1710 cursor.key_beg.delete_tid = 0; 1711 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1712 cursor.key_beg.obj_type = 0; 1713 cursor.key_beg.key = saveoff; 1714 1715 cursor.key_end = cursor.key_beg; 1716 cursor.key_end.key = HAMMER_MAX_KEY; 1717 cursor.asof = ip->obj_asof; 1718 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1719 1720 error = hammer_ip_first(&cursor); 1721 1722 while (error == 0) { 1723 error = hammer_ip_resolve_data(&cursor); 1724 if (error) 1725 break; 1726 base = &cursor.leaf->base; 1727 saveoff = base->key; 1728 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1729 1730 if (base->obj_id != ip->obj_id) 1731 hpanic("bad record at %p", cursor.node); 1732 1733 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1734 r = vop_write_dirent( 1735 &error, uio, cursor.data->entry.obj_id, 1736 dtype, 1737 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1738 (void *)cursor.data->entry.name); 1739 if (r) 1740 break; 1741 ++saveoff; 1742 if (cookies) 1743 cookies[cookie_index] = base->key; 1744 ++cookie_index; 1745 if (cookie_index == ncookies) 1746 break; 1747 error = hammer_ip_next(&cursor); 1748 } 1749 hammer_done_cursor(&cursor); 1750 1751 done: 1752 hammer_done_transaction(&trans); 1753 1754 if (ap->a_eofflag) 1755 *ap->a_eofflag = (error == ENOENT); 1756 uio->uio_offset = saveoff; 1757 if (error && cookie_index == 0) { 1758 if (error == ENOENT) 1759 error = 0; 1760 if (cookies) { 1761 kfree(cookies, M_TEMP); 1762 *ap->a_ncookies = 0; 1763 *ap->a_cookies = NULL; 1764 } 1765 } else { 1766 if (error == ENOENT) 1767 error = 0; 1768 if (cookies) { 1769 *ap->a_ncookies = cookie_index; 1770 *ap->a_cookies = cookies; 1771 } 1772 } 1773 lwkt_reltoken(&hmp->fs_token); 1774 return(error); 1775 } 1776 1777 /* 1778 * hammer_vop_readlink { vp, uio, cred } 1779 */ 1780 static 1781 int 1782 hammer_vop_readlink(struct vop_readlink_args *ap) 1783 { 1784 struct hammer_transaction trans; 1785 struct hammer_cursor cursor; 1786 hammer_inode_t ip; 1787 hammer_mount_t hmp; 1788 char buf[32]; 1789 uint32_t localization; 1790 hammer_pseudofs_inmem_t pfsm; 1791 int error; 1792 1793 ip = VTOI(ap->a_vp); 1794 hmp = ip->hmp; 1795 1796 lwkt_gettoken(&hmp->fs_token); 1797 1798 /* 1799 * Shortcut if the symlink data was stuffed into ino_data. 1800 * 1801 * Also expand special "@@PFS%05d" softlinks (expansion only 1802 * occurs for non-historical (current) accesses made from the 1803 * primary filesystem). 1804 * 1805 * Note that userspace hammer command does not allow users to 1806 * create a @@PFS softlink under an existing other PFS (id!=0) 1807 * so the ip localization here for @@PFS softlink is always 0. 1808 */ 1809 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1810 char *ptr; 1811 int bytes; 1812 1813 ptr = ip->ino_data.ext.symlink; 1814 bytes = (int)ip->ino_data.size; 1815 if (bytes == 10 && 1816 ip->obj_asof == HAMMER_MAX_TID && 1817 ip->obj_localization == HAMMER_DEF_LOCALIZATION && 1818 strncmp(ptr, "@@PFS", 5) == 0) { 1819 hammer_simple_transaction(&trans, hmp); 1820 bcopy(ptr + 5, buf, 5); 1821 buf[5] = 0; 1822 localization = pfs_to_lo(strtoul(buf, NULL, 10)); 1823 pfsm = hammer_load_pseudofs(&trans, localization, 1824 &error); 1825 if (error == 0) { 1826 if (hammer_is_pfs_slave(&pfsm->pfsd)) { 1827 /* vap->va_size == 26 */ 1828 ksnprintf(buf, sizeof(buf), 1829 "@@0x%016jx:%05d", 1830 (intmax_t)pfsm->pfsd.sync_end_tid, 1831 lo_to_pfs(localization)); 1832 } else { 1833 /* vap->va_size == 10 */ 1834 ksnprintf(buf, sizeof(buf), 1835 "@@-1:%05d", 1836 lo_to_pfs(localization)); 1837 } 1838 ptr = buf; 1839 bytes = strlen(buf); 1840 } 1841 if (pfsm) 1842 hammer_rel_pseudofs(hmp, pfsm); 1843 hammer_done_transaction(&trans); 1844 } 1845 error = uiomove(ptr, bytes, ap->a_uio); 1846 lwkt_reltoken(&hmp->fs_token); 1847 return(error); 1848 } 1849 1850 /* 1851 * Long version 1852 */ 1853 hammer_simple_transaction(&trans, hmp); 1854 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1855 1856 /* 1857 * Key range (begin and end inclusive) to scan. Directory keys 1858 * directly translate to a 64 bit 'seek' position. 1859 */ 1860 cursor.key_beg.localization = ip->obj_localization | 1861 HAMMER_LOCALIZE_MISC; 1862 cursor.key_beg.obj_id = ip->obj_id; 1863 cursor.key_beg.create_tid = 0; 1864 cursor.key_beg.delete_tid = 0; 1865 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1866 cursor.key_beg.obj_type = 0; 1867 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1868 cursor.asof = ip->obj_asof; 1869 cursor.flags |= HAMMER_CURSOR_ASOF; 1870 1871 error = hammer_ip_lookup(&cursor); 1872 if (error == 0) { 1873 error = hammer_ip_resolve_data(&cursor); 1874 if (error == 0) { 1875 KKASSERT(cursor.leaf->data_len >= 1876 HAMMER_SYMLINK_NAME_OFF); 1877 error = uiomove(cursor.data->symlink.name, 1878 cursor.leaf->data_len - 1879 HAMMER_SYMLINK_NAME_OFF, 1880 ap->a_uio); 1881 } 1882 } 1883 hammer_done_cursor(&cursor); 1884 hammer_done_transaction(&trans); 1885 lwkt_reltoken(&hmp->fs_token); 1886 return(error); 1887 } 1888 1889 /* 1890 * hammer_vop_nremove { nch, dvp, cred } 1891 */ 1892 static 1893 int 1894 hammer_vop_nremove(struct vop_nremove_args *ap) 1895 { 1896 struct hammer_transaction trans; 1897 hammer_inode_t dip; 1898 hammer_mount_t hmp; 1899 int error; 1900 1901 dip = VTOI(ap->a_dvp); 1902 hmp = dip->hmp; 1903 1904 if (hammer_nohistory(dip) == 0 && 1905 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1906 return (error); 1907 } 1908 1909 lwkt_gettoken(&hmp->fs_token); 1910 hammer_start_transaction(&trans, hmp); 1911 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1912 hammer_done_transaction(&trans); 1913 if (error == 0) 1914 hammer_knote(ap->a_dvp, NOTE_WRITE); 1915 lwkt_reltoken(&hmp->fs_token); 1916 return (error); 1917 } 1918 1919 /* 1920 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1921 */ 1922 static 1923 int 1924 hammer_vop_nrename(struct vop_nrename_args *ap) 1925 { 1926 struct hammer_transaction trans; 1927 struct namecache *fncp; 1928 struct namecache *tncp; 1929 hammer_inode_t fdip; 1930 hammer_inode_t tdip; 1931 hammer_inode_t ip; 1932 hammer_mount_t hmp; 1933 struct hammer_cursor cursor; 1934 int64_t namekey; 1935 uint32_t max_iterations; 1936 int nlen, error; 1937 1938 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1939 return(EXDEV); 1940 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1941 return(EXDEV); 1942 1943 fdip = VTOI(ap->a_fdvp); 1944 tdip = VTOI(ap->a_tdvp); 1945 fncp = ap->a_fnch->ncp; 1946 tncp = ap->a_tnch->ncp; 1947 ip = VTOI(fncp->nc_vp); 1948 KKASSERT(ip != NULL); 1949 1950 hmp = ip->hmp; 1951 1952 if (fdip->obj_localization != tdip->obj_localization) 1953 return(EXDEV); 1954 if (fdip->obj_localization != ip->obj_localization) 1955 return(EXDEV); 1956 1957 if (fdip->flags & HAMMER_INODE_RO) 1958 return (EROFS); 1959 if (tdip->flags & HAMMER_INODE_RO) 1960 return (EROFS); 1961 if (ip->flags & HAMMER_INODE_RO) 1962 return (EROFS); 1963 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1964 return (error); 1965 1966 lwkt_gettoken(&hmp->fs_token); 1967 hammer_start_transaction(&trans, hmp); 1968 1969 /* 1970 * Remove tncp from the target directory and then link ip as 1971 * tncp. XXX pass trans to dounlink 1972 * 1973 * Force the inode sync-time to match the transaction so it is 1974 * in-sync with the creation of the target directory entry. 1975 */ 1976 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1977 ap->a_cred, 0, -1); 1978 if (error == 0 || error == ENOENT) { 1979 error = hammer_ip_add_direntry(&trans, tdip, 1980 tncp->nc_name, tncp->nc_nlen, 1981 ip); 1982 if (error == 0) { 1983 ip->ino_data.parent_obj_id = tdip->obj_id; 1984 ip->ino_data.ctime = trans.time; 1985 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 1986 } 1987 } 1988 if (error) 1989 goto failed; /* XXX */ 1990 1991 /* 1992 * Locate the record in the originating directory and remove it. 1993 * 1994 * Calculate the namekey and setup the key range for the scan. This 1995 * works kinda like a chained hash table where the lower 32 bits 1996 * of the namekey synthesize the chain. 1997 * 1998 * The key range is inclusive of both key_beg and key_end. 1999 */ 2000 namekey = hammer_direntry_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 2001 &max_iterations); 2002 retry: 2003 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 2004 cursor.key_beg.localization = fdip->obj_localization | 2005 hammer_dir_localization(fdip); 2006 cursor.key_beg.obj_id = fdip->obj_id; 2007 cursor.key_beg.key = namekey; 2008 cursor.key_beg.create_tid = 0; 2009 cursor.key_beg.delete_tid = 0; 2010 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2011 cursor.key_beg.obj_type = 0; 2012 2013 cursor.key_end = cursor.key_beg; 2014 cursor.key_end.key += max_iterations; 2015 cursor.asof = fdip->obj_asof; 2016 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2017 2018 /* 2019 * Scan all matching records (the chain), locate the one matching 2020 * the requested path component. 2021 * 2022 * The hammer_ip_*() functions merge in-memory records with on-disk 2023 * records for the purposes of the search. 2024 */ 2025 error = hammer_ip_first(&cursor); 2026 while (error == 0) { 2027 if (hammer_ip_resolve_data(&cursor) != 0) 2028 break; 2029 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2030 KKASSERT(nlen > 0); 2031 if (fncp->nc_nlen == nlen && 2032 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2033 break; 2034 } 2035 error = hammer_ip_next(&cursor); 2036 } 2037 2038 /* 2039 * If all is ok we have to get the inode so we can adjust nlinks. 2040 * 2041 * WARNING: hammer_ip_del_direntry() may have to terminate the 2042 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2043 * twice. 2044 */ 2045 if (error == 0) 2046 error = hammer_ip_del_direntry(&trans, &cursor, fdip, ip); 2047 2048 /* 2049 * XXX A deadlock here will break rename's atomicy for the purposes 2050 * of crash recovery. 2051 */ 2052 if (error == EDEADLK) { 2053 hammer_done_cursor(&cursor); 2054 goto retry; 2055 } 2056 2057 /* 2058 * Cleanup and tell the kernel that the rename succeeded. 2059 * 2060 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2061 * without formally acquiring the vp since the vp might 2062 * have zero refs on it, or in the middle of a reclaim, 2063 * etc. 2064 */ 2065 hammer_done_cursor(&cursor); 2066 if (error == 0) { 2067 cache_rename(ap->a_fnch, ap->a_tnch); 2068 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2069 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2070 while (ip->vp) { 2071 struct vnode *vp; 2072 2073 error = hammer_get_vnode(ip, &vp); 2074 if (error == 0 && vp) { 2075 vn_unlock(vp); 2076 hammer_knote(ip->vp, NOTE_RENAME); 2077 vrele(vp); 2078 break; 2079 } 2080 hdkprintf("ip/vp race2 avoided\n"); 2081 } 2082 } 2083 2084 failed: 2085 hammer_done_transaction(&trans); 2086 lwkt_reltoken(&hmp->fs_token); 2087 return (error); 2088 } 2089 2090 /* 2091 * hammer_vop_nrmdir { nch, dvp, cred } 2092 */ 2093 static 2094 int 2095 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2096 { 2097 struct hammer_transaction trans; 2098 hammer_inode_t dip; 2099 hammer_mount_t hmp; 2100 int error; 2101 2102 dip = VTOI(ap->a_dvp); 2103 hmp = dip->hmp; 2104 2105 if (hammer_nohistory(dip) == 0 && 2106 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2107 return (error); 2108 } 2109 2110 lwkt_gettoken(&hmp->fs_token); 2111 hammer_start_transaction(&trans, hmp); 2112 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2113 hammer_done_transaction(&trans); 2114 if (error == 0) 2115 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2116 lwkt_reltoken(&hmp->fs_token); 2117 return (error); 2118 } 2119 2120 /* 2121 * hammer_vop_markatime { vp, cred } 2122 */ 2123 static 2124 int 2125 hammer_vop_markatime(struct vop_markatime_args *ap) 2126 { 2127 struct hammer_transaction trans; 2128 hammer_inode_t ip; 2129 hammer_mount_t hmp; 2130 2131 ip = VTOI(ap->a_vp); 2132 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2133 return (EROFS); 2134 if (ip->flags & HAMMER_INODE_RO) 2135 return (EROFS); 2136 hmp = ip->hmp; 2137 if (hmp->mp->mnt_flag & MNT_NOATIME) 2138 return (0); 2139 lwkt_gettoken(&hmp->fs_token); 2140 hammer_start_transaction(&trans, hmp); 2141 2142 ip->ino_data.atime = trans.time; 2143 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2144 hammer_done_transaction(&trans); 2145 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2146 lwkt_reltoken(&hmp->fs_token); 2147 return (0); 2148 } 2149 2150 /* 2151 * hammer_vop_setattr { vp, vap, cred } 2152 */ 2153 static 2154 int 2155 hammer_vop_setattr(struct vop_setattr_args *ap) 2156 { 2157 struct hammer_transaction trans; 2158 hammer_inode_t ip; 2159 struct vattr *vap; 2160 hammer_mount_t hmp; 2161 int modflags; 2162 int error; 2163 int truncating; 2164 int blksize; 2165 int kflags; 2166 #if 0 2167 int64_t aligned_size; 2168 #endif 2169 uint32_t flags; 2170 2171 vap = ap->a_vap; 2172 ip = ap->a_vp->v_data; 2173 modflags = 0; 2174 kflags = 0; 2175 hmp = ip->hmp; 2176 2177 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2178 return(EROFS); 2179 if (ip->flags & HAMMER_INODE_RO) 2180 return (EROFS); 2181 if (hammer_nohistory(ip) == 0 && 2182 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2183 return (error); 2184 } 2185 2186 lwkt_gettoken(&hmp->fs_token); 2187 hammer_start_transaction(&trans, hmp); 2188 error = 0; 2189 2190 if (vap->va_flags != VNOVAL) { 2191 flags = ip->ino_data.uflags; 2192 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2193 hammer_to_unix_xid(&ip->ino_data.uid), 2194 ap->a_cred); 2195 if (error == 0) { 2196 if (ip->ino_data.uflags != flags) { 2197 ip->ino_data.uflags = flags; 2198 ip->ino_data.ctime = trans.time; 2199 modflags |= HAMMER_INODE_DDIRTY; 2200 kflags |= NOTE_ATTRIB; 2201 } 2202 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2203 error = 0; 2204 goto done; 2205 } 2206 } 2207 goto done; 2208 } 2209 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2210 error = EPERM; 2211 goto done; 2212 } 2213 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2214 mode_t cur_mode = ip->ino_data.mode; 2215 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2216 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2217 hammer_uuid_t uuid_uid; 2218 hammer_uuid_t uuid_gid; 2219 2220 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2221 ap->a_cred, 2222 &cur_uid, &cur_gid, &cur_mode); 2223 if (error == 0) { 2224 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2225 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2226 if (kuuid_compare(&uuid_uid, &ip->ino_data.uid) || 2227 kuuid_compare(&uuid_gid, &ip->ino_data.gid) || 2228 ip->ino_data.mode != cur_mode) { 2229 ip->ino_data.uid = uuid_uid; 2230 ip->ino_data.gid = uuid_gid; 2231 ip->ino_data.mode = cur_mode; 2232 ip->ino_data.ctime = trans.time; 2233 modflags |= HAMMER_INODE_DDIRTY; 2234 } 2235 kflags |= NOTE_ATTRIB; 2236 } 2237 } 2238 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2239 switch(ap->a_vp->v_type) { 2240 case VREG: 2241 if (vap->va_size == ip->ino_data.size) 2242 break; 2243 2244 /* 2245 * Log the operation if in fast-fsync mode or if 2246 * there are unterminated redo write records present. 2247 * 2248 * The second check is needed so the recovery code 2249 * properly truncates write redos even if nominal 2250 * REDO operations is turned off due to excessive 2251 * writes, because the related records might be 2252 * destroyed and never lay down a TERM_WRITE. 2253 */ 2254 if ((ip->flags & HAMMER_INODE_REDO) || 2255 (ip->flags & HAMMER_INODE_RDIRTY)) { 2256 error = hammer_generate_redo(&trans, ip, 2257 vap->va_size, 2258 HAMMER_REDO_TRUNC, 2259 NULL, 0); 2260 } 2261 blksize = hammer_blocksize(vap->va_size); 2262 2263 /* 2264 * XXX break atomicy, we can deadlock the backend 2265 * if we do not release the lock. Probably not a 2266 * big deal here. 2267 */ 2268 if (vap->va_size < ip->ino_data.size) { 2269 nvtruncbuf(ap->a_vp, vap->va_size, 2270 blksize, 2271 hammer_blockoff(vap->va_size), 2272 0); 2273 truncating = 1; 2274 kflags |= NOTE_WRITE; 2275 } else { 2276 nvextendbuf(ap->a_vp, 2277 ip->ino_data.size, 2278 vap->va_size, 2279 hammer_blocksize(ip->ino_data.size), 2280 hammer_blocksize(vap->va_size), 2281 hammer_blockoff(ip->ino_data.size), 2282 hammer_blockoff(vap->va_size), 2283 0); 2284 truncating = 0; 2285 kflags |= NOTE_WRITE | NOTE_EXTEND; 2286 } 2287 ip->ino_data.size = vap->va_size; 2288 ip->ino_data.mtime = trans.time; 2289 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2290 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2291 vclrflags(ap->a_vp, VLASTWRITETS); 2292 2293 /* 2294 * On-media truncation is cached in the inode until 2295 * the inode is synchronized. We must immediately 2296 * handle any frontend records. 2297 */ 2298 if (truncating) { 2299 hammer_ip_frontend_trunc(ip, vap->va_size); 2300 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2301 ip->flags |= HAMMER_INODE_TRUNCATED; 2302 ip->trunc_off = vap->va_size; 2303 hammer_inode_dirty(ip); 2304 } else if (ip->trunc_off > vap->va_size) { 2305 ip->trunc_off = vap->va_size; 2306 } 2307 } 2308 2309 #if 0 2310 /* 2311 * When truncating, nvtruncbuf() may have cleaned out 2312 * a portion of the last block on-disk in the buffer 2313 * cache. We must clean out any frontend records 2314 * for blocks beyond the new last block. 2315 */ 2316 aligned_size = (vap->va_size + (blksize - 1)) & 2317 ~(int64_t)(blksize - 1); 2318 if (truncating && vap->va_size < aligned_size) { 2319 aligned_size -= blksize; 2320 hammer_ip_frontend_trunc(ip, aligned_size); 2321 } 2322 #endif 2323 break; 2324 case VDATABASE: 2325 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2326 ip->flags |= HAMMER_INODE_TRUNCATED; 2327 ip->trunc_off = vap->va_size; 2328 hammer_inode_dirty(ip); 2329 } else if (ip->trunc_off > vap->va_size) { 2330 ip->trunc_off = vap->va_size; 2331 } 2332 hammer_ip_frontend_trunc(ip, vap->va_size); 2333 ip->ino_data.size = vap->va_size; 2334 ip->ino_data.mtime = trans.time; 2335 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2336 vclrflags(ap->a_vp, VLASTWRITETS); 2337 kflags |= NOTE_ATTRIB; 2338 break; 2339 default: 2340 error = EINVAL; 2341 goto done; 2342 } 2343 break; 2344 } 2345 if (vap->va_atime.tv_sec != VNOVAL) { 2346 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2347 modflags |= HAMMER_INODE_ATIME; 2348 kflags |= NOTE_ATTRIB; 2349 } 2350 if (vap->va_mtime.tv_sec != VNOVAL) { 2351 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2352 modflags |= HAMMER_INODE_MTIME; 2353 kflags |= NOTE_ATTRIB; 2354 vclrflags(ap->a_vp, VLASTWRITETS); 2355 } 2356 if (vap->va_mode != (mode_t)VNOVAL) { 2357 mode_t cur_mode = ip->ino_data.mode; 2358 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2359 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2360 2361 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2362 cur_uid, cur_gid, &cur_mode); 2363 if (error == 0) { 2364 ip->ino_data.mode = cur_mode; 2365 ip->ino_data.ctime = trans.time; 2366 modflags |= HAMMER_INODE_DDIRTY; 2367 kflags |= NOTE_ATTRIB; 2368 } 2369 } 2370 done: 2371 if (error == 0) 2372 hammer_modify_inode(&trans, ip, modflags); 2373 hammer_done_transaction(&trans); 2374 hammer_knote(ap->a_vp, kflags); 2375 lwkt_reltoken(&hmp->fs_token); 2376 return (error); 2377 } 2378 2379 /* 2380 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 2381 */ 2382 static 2383 int 2384 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2385 { 2386 struct hammer_transaction trans; 2387 hammer_inode_t dip; 2388 hammer_inode_t nip; 2389 hammer_record_t record; 2390 struct nchandle *nch; 2391 hammer_mount_t hmp; 2392 int error; 2393 int bytes; 2394 2395 ap->a_vap->va_type = VLNK; 2396 2397 nch = ap->a_nch; 2398 dip = VTOI(ap->a_dvp); 2399 hmp = dip->hmp; 2400 2401 if (dip->flags & HAMMER_INODE_RO) 2402 return (EROFS); 2403 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2404 return (error); 2405 2406 /* 2407 * Create a transaction to cover the operations we perform. 2408 */ 2409 lwkt_gettoken(&hmp->fs_token); 2410 hammer_start_transaction(&trans, hmp); 2411 2412 /* 2413 * Create a new filesystem object of the requested type. The 2414 * returned inode will be referenced but not locked. 2415 */ 2416 2417 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2418 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2419 NULL, &nip); 2420 if (error) { 2421 hammer_done_transaction(&trans); 2422 *ap->a_vpp = NULL; 2423 lwkt_reltoken(&hmp->fs_token); 2424 return (error); 2425 } 2426 2427 /* 2428 * Add a record representing the symlink. symlink stores the link 2429 * as pure data, not a string, and is no \0 terminated. 2430 */ 2431 if (error == 0) { 2432 bytes = strlen(ap->a_target); 2433 2434 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2435 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2436 } else { 2437 record = hammer_alloc_mem_record(nip, bytes); 2438 record->type = HAMMER_MEM_RECORD_GENERAL; 2439 2440 record->leaf.base.localization = nip->obj_localization | 2441 HAMMER_LOCALIZE_MISC; 2442 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2443 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2444 record->leaf.data_len = bytes; 2445 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2446 bcopy(ap->a_target, record->data->symlink.name, bytes); 2447 error = hammer_ip_add_record(&trans, record); 2448 } 2449 2450 /* 2451 * Set the file size to the length of the link. 2452 */ 2453 if (error == 0) { 2454 nip->ino_data.size = bytes; 2455 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2456 } 2457 } 2458 if (error == 0) 2459 error = hammer_ip_add_direntry(&trans, dip, nch->ncp->nc_name, 2460 nch->ncp->nc_nlen, nip); 2461 2462 /* 2463 * Finish up. 2464 */ 2465 if (error) { 2466 hammer_rel_inode(nip, 0); 2467 *ap->a_vpp = NULL; 2468 } else { 2469 error = hammer_get_vnode(nip, ap->a_vpp); 2470 hammer_rel_inode(nip, 0); 2471 if (error == 0) { 2472 cache_setunresolved(ap->a_nch); 2473 cache_setvp(ap->a_nch, *ap->a_vpp); 2474 hammer_knote(ap->a_dvp, NOTE_WRITE); 2475 } 2476 } 2477 hammer_done_transaction(&trans); 2478 lwkt_reltoken(&hmp->fs_token); 2479 return (error); 2480 } 2481 2482 /* 2483 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2484 */ 2485 static 2486 int 2487 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2488 { 2489 struct hammer_transaction trans; 2490 hammer_inode_t dip; 2491 hammer_mount_t hmp; 2492 int error; 2493 2494 dip = VTOI(ap->a_dvp); 2495 hmp = dip->hmp; 2496 2497 if (hammer_nohistory(dip) == 0 && 2498 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2499 return (error); 2500 } 2501 2502 lwkt_gettoken(&hmp->fs_token); 2503 hammer_start_transaction(&trans, hmp); 2504 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2505 ap->a_cred, ap->a_flags, -1); 2506 hammer_done_transaction(&trans); 2507 lwkt_reltoken(&hmp->fs_token); 2508 2509 return (error); 2510 } 2511 2512 /* 2513 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2514 */ 2515 static 2516 int 2517 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2518 { 2519 hammer_inode_t ip = ap->a_vp->v_data; 2520 hammer_mount_t hmp = ip->hmp; 2521 int error; 2522 2523 lwkt_gettoken(&hmp->fs_token); 2524 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2525 ap->a_fflag, ap->a_cred); 2526 lwkt_reltoken(&hmp->fs_token); 2527 return (error); 2528 } 2529 2530 static 2531 int 2532 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2533 { 2534 static const struct mountctl_opt extraopt[] = { 2535 { HMNT_NOHISTORY, "nohistory" }, 2536 { HMNT_MASTERID, "master" }, 2537 { HMNT_NOMIRROR, "nomirror" }, 2538 { 0, NULL} 2539 2540 }; 2541 hammer_mount_t hmp; 2542 struct mount *mp; 2543 int usedbytes; 2544 int error; 2545 2546 error = 0; 2547 usedbytes = 0; 2548 mp = ap->a_head.a_ops->head.vv_mount; 2549 KKASSERT(mp->mnt_data != NULL); 2550 hmp = (hammer_mount_t)mp->mnt_data; 2551 2552 lwkt_gettoken(&hmp->fs_token); 2553 2554 switch(ap->a_op) { 2555 case MOUNTCTL_SET_EXPORT: 2556 if (ap->a_ctllen != sizeof(struct export_args)) 2557 error = EINVAL; 2558 else 2559 error = hammer_vfs_export(mp, ap->a_op, 2560 (const struct export_args *)ap->a_ctl); 2561 break; 2562 case MOUNTCTL_MOUNTFLAGS: 2563 /* 2564 * Call standard mountctl VOP function 2565 * so we get user mount flags. 2566 */ 2567 error = vop_stdmountctl(ap); 2568 if (error) 2569 break; 2570 2571 usedbytes = *ap->a_res; 2572 2573 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2574 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2575 ap->a_buf, 2576 ap->a_buflen - usedbytes, 2577 &error); 2578 } 2579 2580 *ap->a_res += usedbytes; 2581 break; 2582 default: 2583 error = vop_stdmountctl(ap); 2584 break; 2585 } 2586 lwkt_reltoken(&hmp->fs_token); 2587 return(error); 2588 } 2589 2590 /* 2591 * hammer_vop_strategy { vp, bio } 2592 * 2593 * Strategy call, used for regular file read & write only. Note that the 2594 * bp may represent a cluster. 2595 * 2596 * To simplify operation and allow better optimizations in the future, 2597 * this code does not make any assumptions with regards to buffer alignment 2598 * or size. 2599 */ 2600 static 2601 int 2602 hammer_vop_strategy(struct vop_strategy_args *ap) 2603 { 2604 struct buf *bp; 2605 int error; 2606 2607 bp = ap->a_bio->bio_buf; 2608 2609 switch(bp->b_cmd) { 2610 case BUF_CMD_READ: 2611 error = hammer_vop_strategy_read(ap); 2612 break; 2613 case BUF_CMD_WRITE: 2614 error = hammer_vop_strategy_write(ap); 2615 break; 2616 default: 2617 bp->b_error = error = EINVAL; 2618 bp->b_flags |= B_ERROR; 2619 biodone(ap->a_bio); 2620 break; 2621 } 2622 return (error); 2623 } 2624 2625 /* 2626 * Read from a regular file. Iterate the related records and fill in the 2627 * BIO/BUF. Gaps are zero-filled. 2628 * 2629 * The support code in hammer_object.c should be used to deal with mixed 2630 * in-memory and on-disk records. 2631 * 2632 * NOTE: Can be called from the cluster code with an oversized buf. 2633 * 2634 * XXX atime update 2635 */ 2636 static 2637 int 2638 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2639 { 2640 struct hammer_transaction trans; 2641 hammer_inode_t ip; 2642 hammer_inode_t dip; 2643 hammer_mount_t hmp; 2644 struct hammer_cursor cursor; 2645 hammer_base_elm_t base; 2646 hammer_off_t disk_offset; 2647 struct bio *bio; 2648 struct bio *nbio; 2649 struct buf *bp; 2650 int64_t rec_offset; 2651 int64_t ran_end; 2652 int64_t tmp64; 2653 int error; 2654 int boff; 2655 int roff; 2656 int n; 2657 int isdedupable; 2658 2659 bio = ap->a_bio; 2660 bp = bio->bio_buf; 2661 ip = ap->a_vp->v_data; 2662 hmp = ip->hmp; 2663 2664 /* 2665 * The zone-2 disk offset may have been set by the cluster code via 2666 * a BMAP operation, or else should be NOOFFSET. 2667 * 2668 * Checking the high bits for a match against zone-2 should suffice. 2669 * 2670 * In cases where a lot of data duplication is present it may be 2671 * more beneficial to drop through and doubule-buffer through the 2672 * device. 2673 */ 2674 nbio = push_bio(bio); 2675 if (hammer_is_zone_large_data(nbio->bio_offset)) { 2676 if (hammer_double_buffer == 0) { 2677 lwkt_gettoken(&hmp->fs_token); 2678 error = hammer_io_direct_read(hmp, nbio, NULL); 2679 lwkt_reltoken(&hmp->fs_token); 2680 return (error); 2681 } 2682 2683 /* 2684 * Try to shortcut requests for double_buffer mode too. 2685 * Since this mode runs through the device buffer cache 2686 * only compatible buffer sizes (meaning those generated 2687 * by normal filesystem buffers) are legal. 2688 */ 2689 if ((bp->b_flags & B_PAGING) == 0) { 2690 lwkt_gettoken(&hmp->fs_token); 2691 error = hammer_io_indirect_read(hmp, nbio, NULL); 2692 lwkt_reltoken(&hmp->fs_token); 2693 return (error); 2694 } 2695 } 2696 2697 /* 2698 * Well, that sucked. Do it the hard way. If all the stars are 2699 * aligned we may still be able to issue a direct-read. 2700 */ 2701 lwkt_gettoken(&hmp->fs_token); 2702 hammer_simple_transaction(&trans, hmp); 2703 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2704 2705 /* 2706 * Key range (begin and end inclusive) to scan. Note that the key's 2707 * stored in the actual records represent BASE+LEN, not BASE. The 2708 * first record containing bio_offset will have a key > bio_offset. 2709 */ 2710 cursor.key_beg.localization = ip->obj_localization | 2711 HAMMER_LOCALIZE_MISC; 2712 cursor.key_beg.obj_id = ip->obj_id; 2713 cursor.key_beg.create_tid = 0; 2714 cursor.key_beg.delete_tid = 0; 2715 cursor.key_beg.obj_type = 0; 2716 cursor.key_beg.key = bio->bio_offset + 1; 2717 cursor.asof = ip->obj_asof; 2718 cursor.flags |= HAMMER_CURSOR_ASOF; 2719 2720 cursor.key_end = cursor.key_beg; 2721 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2722 #if 0 2723 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2724 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2725 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2726 cursor.key_end.key = HAMMER_MAX_KEY; 2727 } else 2728 #endif 2729 { 2730 ran_end = bio->bio_offset + bp->b_bufsize; 2731 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2732 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2733 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2734 if (tmp64 < ran_end) 2735 cursor.key_end.key = HAMMER_MAX_KEY; 2736 else 2737 cursor.key_end.key = ran_end + MAXPHYS + 1; 2738 } 2739 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2740 2741 /* 2742 * Set NOSWAPCACHE for cursor data extraction if double buffering 2743 * is disabled or (if the file is not marked cacheable via chflags 2744 * and vm.swapcache_use_chflags is enabled). 2745 */ 2746 if (hammer_double_buffer == 0 || 2747 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 && 2748 vm_swapcache_use_chflags)) { 2749 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE; 2750 } 2751 2752 error = hammer_ip_first(&cursor); 2753 boff = 0; 2754 2755 while (error == 0) { 2756 /* 2757 * Get the base file offset of the record. The key for 2758 * data records is (base + bytes) rather then (base). 2759 */ 2760 base = &cursor.leaf->base; 2761 rec_offset = base->key - cursor.leaf->data_len; 2762 2763 /* 2764 * Calculate the gap, if any, and zero-fill it. 2765 * 2766 * n is the offset of the start of the record verses our 2767 * current seek offset in the bio. 2768 */ 2769 n = (int)(rec_offset - (bio->bio_offset + boff)); 2770 if (n > 0) { 2771 if (n > bp->b_bufsize - boff) 2772 n = bp->b_bufsize - boff; 2773 bzero((char *)bp->b_data + boff, n); 2774 boff += n; 2775 n = 0; 2776 } 2777 2778 /* 2779 * Calculate the data offset in the record and the number 2780 * of bytes we can copy. 2781 * 2782 * There are two degenerate cases. First, boff may already 2783 * be at bp->b_bufsize. Secondly, the data offset within 2784 * the record may exceed the record's size. 2785 */ 2786 roff = -n; 2787 rec_offset += roff; 2788 n = cursor.leaf->data_len - roff; 2789 if (n <= 0) { 2790 hdkprintf("bad n=%d roff=%d\n", n, roff); 2791 n = 0; 2792 } else if (n > bp->b_bufsize - boff) { 2793 n = bp->b_bufsize - boff; 2794 } 2795 2796 /* 2797 * Deal with cached truncations. This cool bit of code 2798 * allows truncate()/ftruncate() to avoid having to sync 2799 * the file. 2800 * 2801 * If the frontend is truncated then all backend records are 2802 * subject to the frontend's truncation. 2803 * 2804 * If the backend is truncated then backend records on-disk 2805 * (but not in-memory) are subject to the backend's 2806 * truncation. In-memory records owned by the backend 2807 * represent data written after the truncation point on the 2808 * backend and must not be truncated. 2809 * 2810 * Truncate operations deal with frontend buffer cache 2811 * buffers and frontend-owned in-memory records synchronously. 2812 */ 2813 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2814 if (hammer_cursor_ondisk(&cursor)/* || 2815 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2816 if (ip->trunc_off <= rec_offset) 2817 n = 0; 2818 else if (ip->trunc_off < rec_offset + n) 2819 n = (int)(ip->trunc_off - rec_offset); 2820 } 2821 } 2822 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2823 if (hammer_cursor_ondisk(&cursor)) { 2824 if (ip->sync_trunc_off <= rec_offset) 2825 n = 0; 2826 else if (ip->sync_trunc_off < rec_offset + n) 2827 n = (int)(ip->sync_trunc_off - rec_offset); 2828 } 2829 } 2830 2831 /* 2832 * Try to issue a direct read into our bio if possible, 2833 * otherwise resolve the element data into a hammer_buffer 2834 * and copy. 2835 * 2836 * The buffer on-disk should be zerod past any real 2837 * truncation point, but may not be for any synthesized 2838 * truncation point from above. 2839 * 2840 * NOTE: disk_offset is only valid if the cursor data is 2841 * on-disk. 2842 */ 2843 disk_offset = cursor.leaf->data_offset + roff; 2844 isdedupable = (boff == 0 && n == bp->b_bufsize && 2845 hammer_cursor_ondisk(&cursor) && 2846 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2847 2848 if (isdedupable && hammer_double_buffer == 0) { 2849 /* 2850 * Direct read case 2851 */ 2852 KKASSERT(hammer_is_zone_large_data(disk_offset)); 2853 nbio->bio_offset = disk_offset; 2854 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2855 goto done; 2856 } else if (isdedupable) { 2857 /* 2858 * Async I/O case for reading from backing store 2859 * and copying the data to the filesystem buffer. 2860 */ 2861 KKASSERT(hammer_is_zone_large_data(disk_offset)); 2862 nbio->bio_offset = disk_offset; 2863 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf); 2864 goto done; 2865 } else if (n) { 2866 error = hammer_ip_resolve_data(&cursor); 2867 if (error == 0) { 2868 bcopy((char *)cursor.data + roff, 2869 (char *)bp->b_data + boff, n); 2870 } 2871 } 2872 if (error) 2873 break; 2874 2875 /* 2876 * Iterate until we have filled the request. 2877 */ 2878 boff += n; 2879 if (boff == bp->b_bufsize) 2880 break; 2881 error = hammer_ip_next(&cursor); 2882 } 2883 2884 /* 2885 * There may have been a gap after the last record 2886 */ 2887 if (error == ENOENT) 2888 error = 0; 2889 if (error == 0 && boff != bp->b_bufsize) { 2890 KKASSERT(boff < bp->b_bufsize); 2891 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2892 /* boff = bp->b_bufsize; */ 2893 } 2894 2895 /* 2896 * Disallow swapcache operation on the vnode buffer if double 2897 * buffering is enabled, the swapcache will get the data via 2898 * the block device buffer. 2899 */ 2900 if (hammer_double_buffer) 2901 bp->b_flags |= B_NOTMETA; 2902 2903 /* 2904 * Cleanup 2905 */ 2906 bp->b_resid = 0; 2907 bp->b_error = error; 2908 if (error) 2909 bp->b_flags |= B_ERROR; 2910 biodone(ap->a_bio); 2911 2912 done: 2913 /* 2914 * Cache the b-tree node for the last data read in cache[1]. 2915 * 2916 * If we hit the file EOF then also cache the node in the 2917 * governing directory's cache[3], it will be used to initialize 2918 * the new inode's cache[1] for any inodes looked up via the directory. 2919 * 2920 * This doesn't reduce disk accesses since the B-Tree chain is 2921 * likely cached, but it does reduce cpu overhead when looking 2922 * up file offsets for cpdup/tar/cpio style iterations. 2923 */ 2924 if (cursor.node) 2925 hammer_cache_node(&ip->cache[1], cursor.node); 2926 if (ran_end >= ip->ino_data.size) { 2927 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 2928 ip->obj_asof, ip->obj_localization); 2929 if (dip) { 2930 hammer_cache_node(&dip->cache[3], cursor.node); 2931 hammer_rel_inode(dip, 0); 2932 } 2933 } 2934 hammer_done_cursor(&cursor); 2935 hammer_done_transaction(&trans); 2936 lwkt_reltoken(&hmp->fs_token); 2937 return(error); 2938 } 2939 2940 /* 2941 * BMAP operation - used to support cluster_read() only. 2942 * 2943 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 2944 * 2945 * This routine may return EOPNOTSUPP if the opration is not supported for 2946 * the specified offset. The contents of the pointer arguments do not 2947 * need to be initialized in that case. 2948 * 2949 * If a disk address is available and properly aligned return 0 with 2950 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 2951 * to the run-length relative to that offset. Callers may assume that 2952 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 2953 * large, so return EOPNOTSUPP if it is not sufficiently large. 2954 */ 2955 static 2956 int 2957 hammer_vop_bmap(struct vop_bmap_args *ap) 2958 { 2959 struct hammer_transaction trans; 2960 hammer_inode_t ip; 2961 hammer_mount_t hmp; 2962 struct hammer_cursor cursor; 2963 hammer_base_elm_t base; 2964 int64_t rec_offset; 2965 int64_t ran_end; 2966 int64_t tmp64; 2967 int64_t base_offset; 2968 int64_t base_disk_offset; 2969 int64_t last_offset; 2970 hammer_off_t last_disk_offset; 2971 hammer_off_t disk_offset; 2972 int rec_len; 2973 int error; 2974 int blksize; 2975 2976 ip = ap->a_vp->v_data; 2977 hmp = ip->hmp; 2978 2979 /* 2980 * We can only BMAP regular files. We can't BMAP database files, 2981 * directories, etc. 2982 */ 2983 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 2984 return(EOPNOTSUPP); 2985 2986 /* 2987 * bmap is typically called with runp/runb both NULL when used 2988 * for writing. We do not support BMAP for writing atm. 2989 */ 2990 if (ap->a_cmd != BUF_CMD_READ) 2991 return(EOPNOTSUPP); 2992 2993 /* 2994 * Scan the B-Tree to acquire blockmap addresses, then translate 2995 * to raw addresses. 2996 */ 2997 lwkt_gettoken(&hmp->fs_token); 2998 hammer_simple_transaction(&trans, hmp); 2999 3000 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 3001 3002 /* 3003 * Key range (begin and end inclusive) to scan. Note that the key's 3004 * stored in the actual records represent BASE+LEN, not BASE. The 3005 * first record containing bio_offset will have a key > bio_offset. 3006 */ 3007 cursor.key_beg.localization = ip->obj_localization | 3008 HAMMER_LOCALIZE_MISC; 3009 cursor.key_beg.obj_id = ip->obj_id; 3010 cursor.key_beg.create_tid = 0; 3011 cursor.key_beg.delete_tid = 0; 3012 cursor.key_beg.obj_type = 0; 3013 if (ap->a_runb) 3014 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 3015 else 3016 cursor.key_beg.key = ap->a_loffset + 1; 3017 if (cursor.key_beg.key < 0) 3018 cursor.key_beg.key = 0; 3019 cursor.asof = ip->obj_asof; 3020 cursor.flags |= HAMMER_CURSOR_ASOF; 3021 3022 cursor.key_end = cursor.key_beg; 3023 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 3024 3025 ran_end = ap->a_loffset + MAXPHYS; 3026 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 3027 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 3028 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 3029 if (tmp64 < ran_end) 3030 cursor.key_end.key = HAMMER_MAX_KEY; 3031 else 3032 cursor.key_end.key = ran_end + MAXPHYS + 1; 3033 3034 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 3035 3036 error = hammer_ip_first(&cursor); 3037 base_offset = last_offset = 0; 3038 base_disk_offset = last_disk_offset = 0; 3039 3040 while (error == 0) { 3041 /* 3042 * Get the base file offset of the record. The key for 3043 * data records is (base + bytes) rather then (base). 3044 * 3045 * NOTE: rec_offset + rec_len may exceed the end-of-file. 3046 * The extra bytes should be zero on-disk and the BMAP op 3047 * should still be ok. 3048 */ 3049 base = &cursor.leaf->base; 3050 rec_offset = base->key - cursor.leaf->data_len; 3051 rec_len = cursor.leaf->data_len; 3052 3053 /* 3054 * Incorporate any cached truncation. 3055 * 3056 * NOTE: Modifications to rec_len based on synthesized 3057 * truncation points remove the guarantee that any extended 3058 * data on disk is zero (since the truncations may not have 3059 * taken place on-media yet). 3060 */ 3061 if (ip->flags & HAMMER_INODE_TRUNCATED) { 3062 if (hammer_cursor_ondisk(&cursor) || 3063 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 3064 if (ip->trunc_off <= rec_offset) 3065 rec_len = 0; 3066 else if (ip->trunc_off < rec_offset + rec_len) 3067 rec_len = (int)(ip->trunc_off - rec_offset); 3068 } 3069 } 3070 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 3071 if (hammer_cursor_ondisk(&cursor)) { 3072 if (ip->sync_trunc_off <= rec_offset) 3073 rec_len = 0; 3074 else if (ip->sync_trunc_off < rec_offset + rec_len) 3075 rec_len = (int)(ip->sync_trunc_off - rec_offset); 3076 } 3077 } 3078 3079 /* 3080 * Accumulate information. If we have hit a discontiguous 3081 * block reset base_offset unless we are already beyond the 3082 * requested offset. If we are, that's it, we stop. 3083 */ 3084 if (error) 3085 break; 3086 if (hammer_cursor_ondisk(&cursor)) { 3087 disk_offset = cursor.leaf->data_offset; 3088 if (rec_offset != last_offset || 3089 disk_offset != last_disk_offset) { 3090 if (rec_offset > ap->a_loffset) 3091 break; 3092 base_offset = rec_offset; 3093 base_disk_offset = disk_offset; 3094 } 3095 last_offset = rec_offset + rec_len; 3096 last_disk_offset = disk_offset + rec_len; 3097 } 3098 error = hammer_ip_next(&cursor); 3099 } 3100 3101 if (cursor.node) 3102 hammer_cache_node(&ip->cache[1], cursor.node); 3103 3104 hammer_done_cursor(&cursor); 3105 hammer_done_transaction(&trans); 3106 lwkt_reltoken(&hmp->fs_token); 3107 3108 /* 3109 * If we couldn't find any records or the records we did find were 3110 * all behind the requested offset, return failure. A forward 3111 * truncation can leave a hole w/ no on-disk records. 3112 */ 3113 if (last_offset == 0 || last_offset < ap->a_loffset) 3114 return (EOPNOTSUPP); 3115 3116 /* 3117 * Figure out the block size at the requested offset and adjust 3118 * our limits so the cluster_read() does not create inappropriately 3119 * sized buffer cache buffers. 3120 */ 3121 blksize = hammer_blocksize(ap->a_loffset); 3122 if (hammer_blocksize(base_offset) != blksize) { 3123 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 3124 } 3125 if (last_offset != ap->a_loffset && 3126 hammer_blocksize(last_offset - 1) != blksize) { 3127 last_offset = hammer_blockdemarc(ap->a_loffset, 3128 last_offset - 1); 3129 } 3130 3131 /* 3132 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 3133 * from occuring. 3134 */ 3135 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 3136 3137 if (!hammer_is_zone_large_data(disk_offset)) { 3138 /* 3139 * Only large-data zones can be direct-IOd 3140 */ 3141 error = EOPNOTSUPP; 3142 } else if ((disk_offset & HAMMER_BUFMASK) || 3143 (last_offset - ap->a_loffset) < blksize) { 3144 /* 3145 * doffsetp is not aligned or the forward run size does 3146 * not cover a whole buffer, disallow the direct I/O. 3147 */ 3148 error = EOPNOTSUPP; 3149 } else { 3150 /* 3151 * We're good. 3152 */ 3153 *ap->a_doffsetp = disk_offset; 3154 if (ap->a_runb) { 3155 *ap->a_runb = ap->a_loffset - base_offset; 3156 KKASSERT(*ap->a_runb >= 0); 3157 } 3158 if (ap->a_runp) { 3159 *ap->a_runp = last_offset - ap->a_loffset; 3160 KKASSERT(*ap->a_runp >= 0); 3161 } 3162 error = 0; 3163 } 3164 return(error); 3165 } 3166 3167 /* 3168 * Write to a regular file. Because this is a strategy call the OS is 3169 * trying to actually get data onto the media. 3170 */ 3171 static 3172 int 3173 hammer_vop_strategy_write(struct vop_strategy_args *ap) 3174 { 3175 hammer_record_t record; 3176 hammer_mount_t hmp; 3177 hammer_inode_t ip; 3178 struct bio *bio; 3179 struct buf *bp; 3180 int blksize __debugvar; 3181 int bytes; 3182 int error; 3183 3184 bio = ap->a_bio; 3185 bp = bio->bio_buf; 3186 ip = ap->a_vp->v_data; 3187 hmp = ip->hmp; 3188 3189 blksize = hammer_blocksize(bio->bio_offset); 3190 KKASSERT(bp->b_bufsize == blksize); 3191 3192 if (ip->flags & HAMMER_INODE_RO) { 3193 bp->b_error = EROFS; 3194 bp->b_flags |= B_ERROR; 3195 biodone(ap->a_bio); 3196 return(EROFS); 3197 } 3198 3199 lwkt_gettoken(&hmp->fs_token); 3200 3201 /* 3202 * Disallow swapcache operation on the vnode buffer if double 3203 * buffering is enabled, the swapcache will get the data via 3204 * the block device buffer. 3205 */ 3206 if (hammer_double_buffer) 3207 bp->b_flags |= B_NOTMETA; 3208 3209 /* 3210 * Interlock with inode destruction (no in-kernel or directory 3211 * topology visibility). If we queue new IO while trying to 3212 * destroy the inode we can deadlock the vtrunc call in 3213 * hammer_inode_unloadable_check(). 3214 * 3215 * Besides, there's no point flushing a bp associated with an 3216 * inode that is being destroyed on-media and has no kernel 3217 * references. 3218 */ 3219 if ((ip->flags | ip->sync_flags) & 3220 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 3221 bp->b_resid = 0; 3222 biodone(ap->a_bio); 3223 lwkt_reltoken(&hmp->fs_token); 3224 return(0); 3225 } 3226 3227 /* 3228 * Reserve space and issue a direct-write from the front-end. 3229 * NOTE: The direct_io code will hammer_bread/bcopy smaller 3230 * allocations. 3231 * 3232 * An in-memory record will be installed to reference the storage 3233 * until the flusher can get to it. 3234 * 3235 * Since we own the high level bio the front-end will not try to 3236 * do a direct-read until the write completes. 3237 * 3238 * NOTE: The only time we do not reserve a full-sized buffers 3239 * worth of data is if the file is small. We do not try to 3240 * allocate a fragment (from the small-data zone) at the end of 3241 * an otherwise large file as this can lead to wildly separated 3242 * data. 3243 */ 3244 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 3245 KKASSERT(bio->bio_offset < ip->ino_data.size); 3246 if (bio->bio_offset || ip->ino_data.size > HAMMER_HBUFSIZE) 3247 bytes = bp->b_bufsize; 3248 else 3249 bytes = HAMMER_DATA_DOALIGN_WITH(int, ip->ino_data.size); 3250 3251 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 3252 bytes, &error); 3253 3254 /* 3255 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated 3256 * in hammer_vop_write(). We must flag the record so the proper 3257 * REDO_TERM_WRITE entry is generated during the flush. 3258 */ 3259 if (record) { 3260 if (bp->b_flags & B_VFSFLAG1) { 3261 record->flags |= HAMMER_RECF_REDO; 3262 bp->b_flags &= ~B_VFSFLAG1; 3263 } 3264 hammer_io_direct_write(hmp, bio, record); 3265 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 3266 hammer_flush_inode(ip, 0); 3267 } else { 3268 bp->b_bio2.bio_offset = NOOFFSET; 3269 bp->b_error = error; 3270 bp->b_flags |= B_ERROR; 3271 biodone(ap->a_bio); 3272 } 3273 lwkt_reltoken(&hmp->fs_token); 3274 return(error); 3275 } 3276 3277 /* 3278 * dounlink - disconnect a directory entry 3279 * 3280 * XXX whiteout support not really in yet 3281 */ 3282 static int 3283 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 3284 struct vnode *dvp, struct ucred *cred, 3285 int flags, int isdir) 3286 { 3287 struct namecache *ncp; 3288 hammer_inode_t dip; 3289 hammer_inode_t ip; 3290 hammer_mount_t hmp; 3291 struct hammer_cursor cursor; 3292 int64_t namekey; 3293 uint32_t max_iterations; 3294 int nlen, error; 3295 3296 /* 3297 * Calculate the namekey and setup the key range for the scan. This 3298 * works kinda like a chained hash table where the lower 32 bits 3299 * of the namekey synthesize the chain. 3300 * 3301 * The key range is inclusive of both key_beg and key_end. 3302 */ 3303 dip = VTOI(dvp); 3304 ncp = nch->ncp; 3305 hmp = dip->hmp; 3306 3307 if (dip->flags & HAMMER_INODE_RO) 3308 return (EROFS); 3309 3310 namekey = hammer_direntry_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3311 &max_iterations); 3312 retry: 3313 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3314 cursor.key_beg.localization = dip->obj_localization | 3315 hammer_dir_localization(dip); 3316 cursor.key_beg.obj_id = dip->obj_id; 3317 cursor.key_beg.key = namekey; 3318 cursor.key_beg.create_tid = 0; 3319 cursor.key_beg.delete_tid = 0; 3320 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3321 cursor.key_beg.obj_type = 0; 3322 3323 cursor.key_end = cursor.key_beg; 3324 cursor.key_end.key += max_iterations; 3325 cursor.asof = dip->obj_asof; 3326 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3327 3328 /* 3329 * Scan all matching records (the chain), locate the one matching 3330 * the requested path component. info->last_error contains the 3331 * error code on search termination and could be 0, ENOENT, or 3332 * something else. 3333 * 3334 * The hammer_ip_*() functions merge in-memory records with on-disk 3335 * records for the purposes of the search. 3336 */ 3337 error = hammer_ip_first(&cursor); 3338 3339 while (error == 0) { 3340 error = hammer_ip_resolve_data(&cursor); 3341 if (error) 3342 break; 3343 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3344 KKASSERT(nlen > 0); 3345 if (ncp->nc_nlen == nlen && 3346 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3347 break; 3348 } 3349 error = hammer_ip_next(&cursor); 3350 } 3351 3352 /* 3353 * If all is ok we have to get the inode so we can adjust nlinks. 3354 * To avoid a deadlock with the flusher we must release the inode 3355 * lock on the directory when acquiring the inode for the entry. 3356 * 3357 * If the target is a directory, it must be empty. 3358 */ 3359 if (error == 0) { 3360 hammer_unlock(&cursor.ip->lock); 3361 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3362 hmp->asof, 3363 cursor.data->entry.localization, 3364 0, &error); 3365 hammer_lock_sh(&cursor.ip->lock); 3366 if (error == ENOENT) { 3367 hkprintf("WARNING: Removing dirent w/missing inode " 3368 "\"%s\"\n" 3369 "\tobj_id = %016jx\n", 3370 ncp->nc_name, 3371 (intmax_t)cursor.data->entry.obj_id); 3372 error = 0; 3373 } 3374 3375 /* 3376 * If isdir >= 0 we validate that the entry is or is not a 3377 * directory. If isdir < 0 we don't care. 3378 */ 3379 if (error == 0 && isdir >= 0 && ip) { 3380 if (isdir && 3381 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3382 error = ENOTDIR; 3383 } else if (isdir == 0 && 3384 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3385 error = EISDIR; 3386 } 3387 } 3388 3389 /* 3390 * If we are trying to remove a directory the directory must 3391 * be empty. 3392 * 3393 * The check directory code can loop and deadlock/retry. Our 3394 * own cursor's node locks must be released to avoid a 3-way 3395 * deadlock with the flusher if the check directory code 3396 * blocks. 3397 * 3398 * If any changes whatsoever have been made to the cursor 3399 * set EDEADLK and retry. 3400 * 3401 * WARNING: See warnings in hammer_unlock_cursor() 3402 * function. 3403 */ 3404 if (error == 0 && ip && ip->ino_data.obj_type == 3405 HAMMER_OBJTYPE_DIRECTORY) { 3406 hammer_unlock_cursor(&cursor); 3407 error = hammer_ip_check_directory_empty(trans, ip); 3408 hammer_lock_cursor(&cursor); 3409 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3410 hkprintf("Warning: avoided deadlock " 3411 "on rmdir '%s'\n", 3412 ncp->nc_name); 3413 error = EDEADLK; 3414 } 3415 } 3416 3417 /* 3418 * Delete the directory entry. 3419 * 3420 * WARNING: hammer_ip_del_direntry() may have to terminate 3421 * the cursor to avoid a deadlock. It is ok to call 3422 * hammer_done_cursor() twice. 3423 */ 3424 if (error == 0) { 3425 error = hammer_ip_del_direntry(trans, &cursor, 3426 dip, ip); 3427 } 3428 hammer_done_cursor(&cursor); 3429 if (error == 0) { 3430 /* 3431 * Tell the namecache that we are now unlinked. 3432 */ 3433 cache_unlink(nch); 3434 3435 /* 3436 * NOTE: ip->vp, if non-NULL, cannot be directly 3437 * referenced without formally acquiring the 3438 * vp since the vp might have zero refs on it, 3439 * or in the middle of a reclaim, etc. 3440 * 3441 * NOTE: The cache_setunresolved() can rip the vp 3442 * out from under us since the vp may not have 3443 * any refs, in which case ip->vp will be NULL 3444 * from the outset. 3445 */ 3446 while (ip && ip->vp) { 3447 struct vnode *vp; 3448 3449 error = hammer_get_vnode(ip, &vp); 3450 if (error == 0 && vp) { 3451 vn_unlock(vp); 3452 hammer_knote(ip->vp, NOTE_DELETE); 3453 #if 0 3454 /* 3455 * Don't do this, it can deadlock 3456 * on concurrent rm's of hardlinks. 3457 * Shouldn't be needed any more. 3458 */ 3459 cache_inval_vp(ip->vp, CINV_DESTROY); 3460 #endif 3461 vrele(vp); 3462 break; 3463 } 3464 hdkprintf("ip/vp race1 avoided\n"); 3465 } 3466 } 3467 if (ip) 3468 hammer_rel_inode(ip, 0); 3469 } else { 3470 hammer_done_cursor(&cursor); 3471 } 3472 if (error == EDEADLK) 3473 goto retry; 3474 3475 return (error); 3476 } 3477 3478 /************************************************************************ 3479 * FIFO AND SPECFS OPS * 3480 ************************************************************************ 3481 * 3482 */ 3483 static int 3484 hammer_vop_fifoclose (struct vop_close_args *ap) 3485 { 3486 /* XXX update itimes */ 3487 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3488 } 3489 3490 static int 3491 hammer_vop_fiforead (struct vop_read_args *ap) 3492 { 3493 int error; 3494 3495 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3496 /* XXX update access time */ 3497 return (error); 3498 } 3499 3500 static int 3501 hammer_vop_fifowrite (struct vop_write_args *ap) 3502 { 3503 int error; 3504 3505 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3506 /* XXX update access time */ 3507 return (error); 3508 } 3509 3510 static 3511 int 3512 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3513 { 3514 int error; 3515 3516 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3517 if (error) 3518 error = hammer_vop_kqfilter(ap); 3519 return(error); 3520 } 3521 3522 /************************************************************************ 3523 * KQFILTER OPS * 3524 ************************************************************************ 3525 * 3526 */ 3527 static void filt_hammerdetach(struct knote *kn); 3528 static int filt_hammerread(struct knote *kn, long hint); 3529 static int filt_hammerwrite(struct knote *kn, long hint); 3530 static int filt_hammervnode(struct knote *kn, long hint); 3531 3532 static struct filterops hammerread_filtops = 3533 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3534 NULL, filt_hammerdetach, filt_hammerread }; 3535 static struct filterops hammerwrite_filtops = 3536 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3537 NULL, filt_hammerdetach, filt_hammerwrite }; 3538 static struct filterops hammervnode_filtops = 3539 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3540 NULL, filt_hammerdetach, filt_hammervnode }; 3541 3542 static 3543 int 3544 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3545 { 3546 struct vnode *vp = ap->a_vp; 3547 struct knote *kn = ap->a_kn; 3548 3549 switch (kn->kn_filter) { 3550 case EVFILT_READ: 3551 kn->kn_fop = &hammerread_filtops; 3552 break; 3553 case EVFILT_WRITE: 3554 kn->kn_fop = &hammerwrite_filtops; 3555 break; 3556 case EVFILT_VNODE: 3557 kn->kn_fop = &hammervnode_filtops; 3558 break; 3559 default: 3560 return (EOPNOTSUPP); 3561 } 3562 3563 kn->kn_hook = (caddr_t)vp; 3564 3565 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3566 3567 return(0); 3568 } 3569 3570 static void 3571 filt_hammerdetach(struct knote *kn) 3572 { 3573 struct vnode *vp = (void *)kn->kn_hook; 3574 3575 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3576 } 3577 3578 static int 3579 filt_hammerread(struct knote *kn, long hint) 3580 { 3581 struct vnode *vp = (void *)kn->kn_hook; 3582 hammer_inode_t ip = VTOI(vp); 3583 hammer_mount_t hmp = ip->hmp; 3584 off_t off; 3585 3586 if (hint == NOTE_REVOKE) { 3587 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3588 return(1); 3589 } 3590 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3591 off = ip->ino_data.size - kn->kn_fp->f_offset; 3592 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3593 lwkt_reltoken(&hmp->fs_token); 3594 if (kn->kn_sfflags & NOTE_OLDAPI) 3595 return(1); 3596 return (kn->kn_data != 0); 3597 } 3598 3599 static int 3600 filt_hammerwrite(struct knote *kn, long hint) 3601 { 3602 if (hint == NOTE_REVOKE) 3603 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3604 kn->kn_data = 0; 3605 return (1); 3606 } 3607 3608 static int 3609 filt_hammervnode(struct knote *kn, long hint) 3610 { 3611 if (kn->kn_sfflags & hint) 3612 kn->kn_fflags |= hint; 3613 if (hint == NOTE_REVOKE) { 3614 kn->kn_flags |= (EV_EOF | EV_NODATA); 3615 return (1); 3616 } 3617 return (kn->kn_fflags != 0); 3618 } 3619 3620