1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 */ 34 35 #include <sys/mountctl.h> 36 #include <sys/namecache.h> 37 #include <sys/buf2.h> 38 #include <vfs/fifofs/fifo.h> 39 40 #include "hammer.h" 41 42 /* 43 * USERFS VNOPS 44 */ 45 static int hammer_vop_fsync(struct vop_fsync_args *); 46 static int hammer_vop_read(struct vop_read_args *); 47 static int hammer_vop_write(struct vop_write_args *); 48 static int hammer_vop_access(struct vop_access_args *); 49 static int hammer_vop_advlock(struct vop_advlock_args *); 50 static int hammer_vop_close(struct vop_close_args *); 51 static int hammer_vop_ncreate(struct vop_ncreate_args *); 52 static int hammer_vop_getattr(struct vop_getattr_args *); 53 static int hammer_vop_nresolve(struct vop_nresolve_args *); 54 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 55 static int hammer_vop_nlink(struct vop_nlink_args *); 56 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 57 static int hammer_vop_nmknod(struct vop_nmknod_args *); 58 static int hammer_vop_open(struct vop_open_args *); 59 static int hammer_vop_print(struct vop_print_args *); 60 static int hammer_vop_readdir(struct vop_readdir_args *); 61 static int hammer_vop_readlink(struct vop_readlink_args *); 62 static int hammer_vop_nremove(struct vop_nremove_args *); 63 static int hammer_vop_nrename(struct vop_nrename_args *); 64 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 65 static int hammer_vop_markatime(struct vop_markatime_args *); 66 static int hammer_vop_setattr(struct vop_setattr_args *); 67 static int hammer_vop_strategy(struct vop_strategy_args *); 68 static int hammer_vop_bmap(struct vop_bmap_args *ap); 69 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 70 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 71 static int hammer_vop_ioctl(struct vop_ioctl_args *); 72 static int hammer_vop_mountctl(struct vop_mountctl_args *); 73 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 74 75 static int hammer_vop_fifoclose (struct vop_close_args 
*); 76 static int hammer_vop_fiforead (struct vop_read_args *); 77 static int hammer_vop_fifowrite (struct vop_write_args *); 78 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 79 80 struct vop_ops hammer_vnode_vops = { 81 .vop_default = vop_defaultop, 82 .vop_fsync = hammer_vop_fsync, 83 .vop_getpages = vop_stdgetpages, 84 .vop_putpages = vop_stdputpages, 85 .vop_read = hammer_vop_read, 86 .vop_write = hammer_vop_write, 87 .vop_access = hammer_vop_access, 88 .vop_advlock = hammer_vop_advlock, 89 .vop_close = hammer_vop_close, 90 .vop_ncreate = hammer_vop_ncreate, 91 .vop_getattr = hammer_vop_getattr, 92 .vop_inactive = hammer_vop_inactive, 93 .vop_reclaim = hammer_vop_reclaim, 94 .vop_nresolve = hammer_vop_nresolve, 95 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 96 .vop_nlink = hammer_vop_nlink, 97 .vop_nmkdir = hammer_vop_nmkdir, 98 .vop_nmknod = hammer_vop_nmknod, 99 .vop_open = hammer_vop_open, 100 .vop_pathconf = vop_stdpathconf, 101 .vop_print = hammer_vop_print, 102 .vop_readdir = hammer_vop_readdir, 103 .vop_readlink = hammer_vop_readlink, 104 .vop_nremove = hammer_vop_nremove, 105 .vop_nrename = hammer_vop_nrename, 106 .vop_nrmdir = hammer_vop_nrmdir, 107 .vop_markatime = hammer_vop_markatime, 108 .vop_setattr = hammer_vop_setattr, 109 .vop_bmap = hammer_vop_bmap, 110 .vop_strategy = hammer_vop_strategy, 111 .vop_nsymlink = hammer_vop_nsymlink, 112 .vop_nwhiteout = hammer_vop_nwhiteout, 113 .vop_ioctl = hammer_vop_ioctl, 114 .vop_mountctl = hammer_vop_mountctl, 115 .vop_kqfilter = hammer_vop_kqfilter 116 }; 117 118 struct vop_ops hammer_spec_vops = { 119 .vop_default = vop_defaultop, 120 .vop_fsync = hammer_vop_fsync, 121 .vop_read = vop_stdnoread, 122 .vop_write = vop_stdnowrite, 123 .vop_access = hammer_vop_access, 124 .vop_close = hammer_vop_close, 125 .vop_markatime = hammer_vop_markatime, 126 .vop_getattr = hammer_vop_getattr, 127 .vop_inactive = hammer_vop_inactive, 128 .vop_reclaim = hammer_vop_reclaim, 129 .vop_setattr = 
hammer_vop_setattr 130 }; 131 132 struct vop_ops hammer_fifo_vops = { 133 .vop_default = fifo_vnoperate, 134 .vop_fsync = hammer_vop_fsync, 135 .vop_read = hammer_vop_fiforead, 136 .vop_write = hammer_vop_fifowrite, 137 .vop_access = hammer_vop_access, 138 .vop_close = hammer_vop_fifoclose, 139 .vop_markatime = hammer_vop_markatime, 140 .vop_getattr = hammer_vop_getattr, 141 .vop_inactive = hammer_vop_inactive, 142 .vop_reclaim = hammer_vop_reclaim, 143 .vop_setattr = hammer_vop_setattr, 144 .vop_kqfilter = hammer_vop_fifokqfilter 145 }; 146 147 static __inline 148 void 149 hammer_knote(struct vnode *vp, int flags) 150 { 151 if (flags) 152 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 153 } 154 155 #ifdef DEBUG_TRUNCATE 156 struct hammer_inode *HammerTruncIp; 157 #endif 158 159 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 160 struct vnode *dvp, struct ucred *cred, 161 int flags, int isdir); 162 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 163 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 164 165 /* 166 * hammer_vop_fsync { vp, waitfor } 167 * 168 * fsync() an inode to disk and wait for it to be completely committed 169 * such that the information would not be undone if a crash occured after 170 * return. 171 * 172 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 173 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 174 * operation. 175 * 176 * Ultimately the combination of a REDO log and use of fast storage 177 * to front-end cluster caches will make fsync fast, but it aint 178 * here yet. And, in anycase, we need real transactional 179 * all-or-nothing features which are not restricted to a single file. 
180 */ 181 static 182 int 183 hammer_vop_fsync(struct vop_fsync_args *ap) 184 { 185 hammer_inode_t ip = VTOI(ap->a_vp); 186 hammer_mount_t hmp = ip->hmp; 187 int waitfor = ap->a_waitfor; 188 int mode; 189 190 lwkt_gettoken(&hmp->fs_token); 191 192 /* 193 * Fsync rule relaxation (default is either full synchronous flush 194 * or REDO semantics with synchronous flush). 195 */ 196 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 197 switch(hammer_fsync_mode) { 198 case 0: 199 mode0: 200 /* no REDO, full synchronous flush */ 201 goto skip; 202 case 1: 203 mode1: 204 /* no REDO, full asynchronous flush */ 205 if (waitfor == MNT_WAIT) 206 waitfor = MNT_NOWAIT; 207 goto skip; 208 case 2: 209 /* REDO semantics, synchronous flush */ 210 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 211 goto mode0; 212 mode = HAMMER_FLUSH_UNDOS_AUTO; 213 break; 214 case 3: 215 /* REDO semantics, relaxed asynchronous flush */ 216 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 217 goto mode1; 218 mode = HAMMER_FLUSH_UNDOS_RELAXED; 219 if (waitfor == MNT_WAIT) 220 waitfor = MNT_NOWAIT; 221 break; 222 case 4: 223 /* ignore the fsync() system call */ 224 lwkt_reltoken(&hmp->fs_token); 225 return(0); 226 default: 227 /* we have to do something */ 228 mode = HAMMER_FLUSH_UNDOS_RELAXED; 229 if (waitfor == MNT_WAIT) 230 waitfor = MNT_NOWAIT; 231 break; 232 } 233 234 /* 235 * Fast fsync only needs to flush the UNDO/REDO fifo if 236 * HAMMER_INODE_REDO is non-zero and the only modifications 237 * made to the file are write or write-extends. 
238 */ 239 if ((ip->flags & HAMMER_INODE_REDO) && 240 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) { 241 ++hammer_count_fsyncs; 242 hammer_flusher_flush_undos(hmp, mode); 243 ip->redo_count = 0; 244 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 245 vclrisdirty(ip->vp); 246 lwkt_reltoken(&hmp->fs_token); 247 return(0); 248 } 249 250 /* 251 * REDO is enabled by fsync(), the idea being we really only 252 * want to lay down REDO records when programs are using 253 * fsync() heavily. The first fsync() on the file starts 254 * the gravy train going and later fsync()s keep it hot by 255 * resetting the redo_count. 256 * 257 * We weren't running REDOs before now so we have to fall 258 * through and do a full fsync of what we have. 259 */ 260 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 261 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 262 ip->flags |= HAMMER_INODE_REDO; 263 ip->redo_count = 0; 264 } 265 } 266 skip: 267 268 /* 269 * Do a full flush sequence. 270 * 271 * Attempt to release the vnode while waiting for the inode to 272 * finish flushing. This can really mess up inactive->reclaim 273 * sequences so only do it if the vnode is active. 274 * 275 * WARNING! The VX lock functions must be used. vn_lock() will 276 * fail when this is part of a VOP_RECLAIM sequence. 
277 */ 278 ++hammer_count_fsyncs; 279 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 280 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 281 if (waitfor == MNT_WAIT) { 282 int dorelock; 283 284 if ((ap->a_vp->v_flag & VRECLAIMED) == 0) { 285 vx_unlock(ap->a_vp); 286 dorelock = 1; 287 } else { 288 dorelock = 0; 289 } 290 hammer_wait_inode(ip); 291 if (dorelock) 292 vx_lock(ap->a_vp); 293 } 294 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 295 vclrisdirty(ip->vp); 296 lwkt_reltoken(&hmp->fs_token); 297 return (ip->error); 298 } 299 300 /* 301 * hammer_vop_read { vp, uio, ioflag, cred } 302 * 303 * MPSAFE (for the cache safe does not require fs_token) 304 */ 305 static 306 int 307 hammer_vop_read(struct vop_read_args *ap) 308 { 309 struct hammer_transaction trans; 310 hammer_inode_t ip; 311 hammer_mount_t hmp; 312 off_t offset; 313 struct buf *bp; 314 struct uio *uio; 315 int error; 316 int n; 317 int seqcount; 318 int ioseqcount; 319 int blksize; 320 int bigread; 321 int got_trans; 322 size_t resid; 323 324 if (ap->a_vp->v_type != VREG) 325 return (EINVAL); 326 ip = VTOI(ap->a_vp); 327 hmp = ip->hmp; 328 error = 0; 329 got_trans = 0; 330 uio = ap->a_uio; 331 332 /* 333 * Attempt to shortcut directly to the VM object using lwbufs. 334 * This is much faster than instantiating buffer cache buffers. 335 */ 336 resid = uio->uio_resid; 337 error = vop_helper_read_shortcut(ap); 338 hammer_stats_file_read += resid - uio->uio_resid; 339 if (error) 340 return (error); 341 if (uio->uio_resid == 0) 342 goto finished; 343 344 /* 345 * Allow the UIO's size to override the sequential heuristic. 346 */ 347 blksize = hammer_blocksize(uio->uio_offset); 348 seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; 349 ioseqcount = (ap->a_ioflag >> 16); 350 if (seqcount < ioseqcount) 351 seqcount = ioseqcount; 352 353 /* 354 * If reading or writing a huge amount of data we have to break 355 * atomicy and allow the operation to be interrupted by a signal 356 * or it can DOS the machine. 
357 */ 358 bigread = (uio->uio_resid > 100 * 1024 * 1024); 359 360 /* 361 * Access the data typically in HAMMER_BUFSIZE blocks via the 362 * buffer cache, but HAMMER may use a variable block size based 363 * on the offset. 364 * 365 * XXX Temporary hack, delay the start transaction while we remain 366 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 367 * locked-shared. 368 */ 369 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 370 int64_t base_offset; 371 int64_t file_limit; 372 373 blksize = hammer_blocksize(uio->uio_offset); 374 offset = (int)uio->uio_offset & (blksize - 1); 375 base_offset = uio->uio_offset - offset; 376 377 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 378 break; 379 380 /* 381 * MPSAFE 382 */ 383 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 384 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 385 bp->b_flags &= ~B_AGE; 386 error = 0; 387 goto skip; 388 } 389 if (ap->a_ioflag & IO_NRDELAY) { 390 bqrelse(bp); 391 return (EWOULDBLOCK); 392 } 393 394 /* 395 * MPUNSAFE 396 */ 397 if (got_trans == 0) { 398 hammer_start_transaction(&trans, ip->hmp); 399 got_trans = 1; 400 } 401 402 /* 403 * NOTE: A valid bp has already been acquired, but was not 404 * B_CACHE. 405 */ 406 if (hammer_cluster_enable) { 407 /* 408 * Use file_limit to prevent cluster_read() from 409 * creating buffers of the wrong block size past 410 * the demarc. 
411 */ 412 file_limit = ip->ino_data.size; 413 if (base_offset < HAMMER_XDEMARC && 414 file_limit > HAMMER_XDEMARC) { 415 file_limit = HAMMER_XDEMARC; 416 } 417 error = cluster_readx(ap->a_vp, 418 file_limit, base_offset, 419 blksize, uio->uio_resid, 420 seqcount * BKVASIZE, &bp); 421 } else { 422 error = breadnx(ap->a_vp, base_offset, blksize, 423 NULL, NULL, 0, &bp); 424 } 425 if (error) { 426 brelse(bp); 427 break; 428 } 429 skip: 430 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { 431 hdkprintf("doff %016jx read file %016jx@%016jx\n", 432 (intmax_t)bp->b_bio2.bio_offset, 433 (intmax_t)ip->obj_id, 434 (intmax_t)bp->b_loffset); 435 } 436 bp->b_flags &= ~B_IODEBUG; 437 if (blksize == HAMMER_XBUFSIZE) 438 bp->b_flags |= B_CLUSTEROK; 439 440 n = blksize - offset; 441 if (n > uio->uio_resid) 442 n = uio->uio_resid; 443 if (n > ip->ino_data.size - uio->uio_offset) 444 n = (int)(ip->ino_data.size - uio->uio_offset); 445 446 /* 447 * Set B_AGE, data has a lower priority than meta-data. 448 * 449 * Use a hold/unlock/drop sequence to run the uiomove 450 * with the buffer unlocked, avoiding deadlocks against 451 * read()s on mmap()'d spaces. 452 */ 453 bp->b_flags |= B_AGE; 454 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio); 455 bqrelse(bp); 456 457 if (error) 458 break; 459 hammer_stats_file_read += n; 460 } 461 462 finished: 463 464 /* 465 * Try to update the atime with just the inode lock for maximum 466 * concurrency. If we can't shortcut it we have to get the full 467 * blown transaction. 
468 */ 469 if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) { 470 hammer_start_transaction(&trans, ip->hmp); 471 got_trans = 1; 472 } 473 474 if (got_trans) { 475 if ((ip->flags & HAMMER_INODE_RO) == 0 && 476 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 477 lwkt_gettoken(&hmp->fs_token); 478 ip->ino_data.atime = trans.time; 479 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 480 hammer_done_transaction(&trans); 481 lwkt_reltoken(&hmp->fs_token); 482 } else { 483 hammer_done_transaction(&trans); 484 } 485 } 486 return (error); 487 } 488 489 /* 490 * hammer_vop_write { vp, uio, ioflag, cred } 491 */ 492 static 493 int 494 hammer_vop_write(struct vop_write_args *ap) 495 { 496 struct hammer_transaction trans; 497 struct hammer_inode *ip; 498 hammer_mount_t hmp; 499 thread_t td; 500 struct uio *uio; 501 int offset; 502 off_t base_offset; 503 int64_t cluster_eof; 504 struct buf *bp; 505 int kflags; 506 int error; 507 int n; 508 int flags; 509 int seqcount; 510 int bigwrite; 511 512 if (ap->a_vp->v_type != VREG) 513 return (EINVAL); 514 ip = VTOI(ap->a_vp); 515 hmp = ip->hmp; 516 error = 0; 517 kflags = 0; 518 seqcount = ap->a_ioflag >> 16; 519 520 if (ip->flags & HAMMER_INODE_RO) 521 return (EROFS); 522 523 /* 524 * Create a transaction to cover the operations we perform. 525 */ 526 hammer_start_transaction(&trans, hmp); 527 uio = ap->a_uio; 528 529 /* 530 * Check append mode 531 */ 532 if (ap->a_ioflag & IO_APPEND) 533 uio->uio_offset = ip->ino_data.size; 534 535 /* 536 * Check for illegal write offsets. Valid range is 0...2^63-1. 537 * 538 * NOTE: the base_off assignment is required to work around what 539 * I consider to be a GCC-4 optimization bug. 
540 */ 541 if (uio->uio_offset < 0) { 542 hammer_done_transaction(&trans); 543 return (EFBIG); 544 } 545 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 546 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 547 hammer_done_transaction(&trans); 548 return (EFBIG); 549 } 550 551 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 552 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 553 hammer_done_transaction(&trans); 554 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 555 return (EFBIG); 556 } 557 558 /* 559 * If reading or writing a huge amount of data we have to break 560 * atomicy and allow the operation to be interrupted by a signal 561 * or it can DOS the machine. 562 * 563 * Preset redo_count so we stop generating REDOs earlier if the 564 * limit is exceeded. 565 * 566 * redo_count is heuristical, SMP races are ok 567 */ 568 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 569 if ((ip->flags & HAMMER_INODE_REDO) && 570 ip->redo_count < hammer_limit_redo) { 571 ip->redo_count += uio->uio_resid; 572 } 573 574 /* 575 * Access the data typically in HAMMER_BUFSIZE blocks via the 576 * buffer cache, but HAMMER may use a variable block size based 577 * on the offset. 578 */ 579 while (uio->uio_resid > 0) { 580 int fixsize = 0; 581 int blksize; 582 int blkmask; 583 int trivial; 584 int endofblk; 585 off_t nsize; 586 587 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 588 break; 589 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 590 break; 591 592 blksize = hammer_blocksize(uio->uio_offset); 593 594 /* 595 * Control the number of pending records associated with 596 * this inode. If too many have accumulated start a 597 * flush. Try to maintain a pipeline with the flusher. 598 * 599 * NOTE: It is possible for other sources to grow the 600 * records but not necessarily issue another flush, 601 * so use a timeout and ensure that a re-flush occurs. 
602 */ 603 if (ip->rsv_recs >= hammer_limit_inode_recs) { 604 lwkt_gettoken(&hmp->fs_token); 605 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 606 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 607 ip->flags |= HAMMER_INODE_RECSW; 608 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 609 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 610 } 611 lwkt_reltoken(&hmp->fs_token); 612 } 613 614 /* 615 * Do not allow HAMMER to blow out the buffer cache. Very 616 * large UIOs can lockout other processes due to bwillwrite() 617 * mechanics. 618 * 619 * The hammer inode is not locked during these operations. 620 * The vnode is locked which can interfere with the pageout 621 * daemon for non-UIO_NOCOPY writes but should not interfere 622 * with the buffer cache. Even so, we cannot afford to 623 * allow the pageout daemon to build up too many dirty buffer 624 * cache buffers. 625 * 626 * Only call this if we aren't being recursively called from 627 * a virtual disk device (vn), else we may deadlock. 628 */ 629 if ((ap->a_ioflag & IO_RECURSE) == 0) 630 bwillwrite(blksize); 631 632 /* 633 * Calculate the blocksize at the current offset and figure 634 * out how much we can actually write. 635 */ 636 blkmask = blksize - 1; 637 offset = (int)uio->uio_offset & blkmask; 638 base_offset = uio->uio_offset & ~(int64_t)blkmask; 639 n = blksize - offset; 640 if (n > uio->uio_resid) { 641 n = uio->uio_resid; 642 endofblk = 0; 643 } else { 644 endofblk = 1; 645 } 646 nsize = uio->uio_offset + n; 647 if (nsize > ip->ino_data.size) { 648 if (uio->uio_offset > ip->ino_data.size) 649 trivial = 0; 650 else 651 trivial = 1; 652 nvextendbuf(ap->a_vp, 653 ip->ino_data.size, 654 nsize, 655 hammer_blocksize(ip->ino_data.size), 656 hammer_blocksize(nsize), 657 hammer_blockoff(ip->ino_data.size), 658 hammer_blockoff(nsize), 659 trivial); 660 fixsize = 1; 661 kflags |= NOTE_EXTEND; 662 } 663 664 if (uio->uio_segflg == UIO_NOCOPY) { 665 /* 666 * Issuing a write with the same data backing the 667 * buffer. 
Instantiate the buffer to collect the 668 * backing vm pages, then read-in any missing bits. 669 * 670 * This case is used by vop_stdputpages(). 671 */ 672 bp = getblk(ap->a_vp, base_offset, 673 blksize, GETBLK_BHEAVY, 0); 674 if ((bp->b_flags & B_CACHE) == 0) { 675 bqrelse(bp); 676 error = bread(ap->a_vp, base_offset, 677 blksize, &bp); 678 } 679 } else if (offset == 0 && uio->uio_resid >= blksize) { 680 /* 681 * Even though we are entirely overwriting the buffer 682 * we may still have to zero it out to avoid a 683 * mmap/write visibility issue. 684 */ 685 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 686 if ((bp->b_flags & B_CACHE) == 0) 687 vfs_bio_clrbuf(bp); 688 } else if (base_offset >= ip->ino_data.size) { 689 /* 690 * If the base offset of the buffer is beyond the 691 * file EOF, we don't have to issue a read. 692 */ 693 bp = getblk(ap->a_vp, base_offset, 694 blksize, GETBLK_BHEAVY, 0); 695 vfs_bio_clrbuf(bp); 696 } else { 697 /* 698 * Partial overwrite, read in any missing bits then 699 * replace the portion being written. 700 */ 701 error = bread(ap->a_vp, base_offset, blksize, &bp); 702 if (error == 0) 703 bheavy(bp); 704 } 705 if (error == 0) 706 error = uiomovebp(bp, bp->b_data + offset, n, uio); 707 708 lwkt_gettoken(&hmp->fs_token); 709 710 /* 711 * Generate REDO records if enabled and redo_count will not 712 * exceeded the limit. 713 * 714 * If redo_count exceeds the limit we stop generating records 715 * and clear HAMMER_INODE_REDO. This will cause the next 716 * fsync() to do a full meta-data sync instead of just an 717 * UNDO/REDO fifo update. 718 * 719 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 720 * will still be tracked. The tracks will be terminated 721 * when the related meta-data (including possible data 722 * modifications which are not tracked via REDO) is 723 * flushed. 
724 */ 725 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 726 if (ip->redo_count < hammer_limit_redo) { 727 bp->b_flags |= B_VFSFLAG1; 728 error = hammer_generate_redo(&trans, ip, 729 base_offset + offset, 730 HAMMER_REDO_WRITE, 731 bp->b_data + offset, 732 (size_t)n); 733 } else { 734 ip->flags &= ~HAMMER_INODE_REDO; 735 } 736 } 737 738 /* 739 * If we screwed up we have to undo any VM size changes we 740 * made. 741 */ 742 if (error) { 743 brelse(bp); 744 if (fixsize) { 745 nvtruncbuf(ap->a_vp, ip->ino_data.size, 746 hammer_blocksize(ip->ino_data.size), 747 hammer_blockoff(ip->ino_data.size), 748 0); 749 } 750 lwkt_reltoken(&hmp->fs_token); 751 break; 752 } 753 kflags |= NOTE_WRITE; 754 hammer_stats_file_write += n; 755 if (blksize == HAMMER_XBUFSIZE) 756 bp->b_flags |= B_CLUSTEROK; 757 if (ip->ino_data.size < uio->uio_offset) { 758 ip->ino_data.size = uio->uio_offset; 759 flags = HAMMER_INODE_SDIRTY; 760 } else { 761 flags = 0; 762 } 763 ip->ino_data.mtime = trans.time; 764 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 765 hammer_modify_inode(&trans, ip, flags); 766 767 /* 768 * Once we dirty the buffer any cached zone-X offset 769 * becomes invalid. HAMMER NOTE: no-history mode cannot 770 * allow overwriting over the same data sector unless 771 * we provide UNDOs for the old data, which we don't. 772 */ 773 bp->b_bio2.bio_offset = NOOFFSET; 774 775 lwkt_reltoken(&hmp->fs_token); 776 777 /* 778 * Final buffer disposition. 779 * 780 * Because meta-data updates are deferred, HAMMER is 781 * especially sensitive to excessive bdwrite()s because 782 * the I/O stream is not broken up by disk reads. So the 783 * buffer cache simply cannot keep up. 784 * 785 * WARNING! blksize is variable. cluster_write() is 786 * expected to not blow up if it encounters 787 * buffers that do not match the passed blksize. 788 * 789 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 790 * The ip->rsv_recs check should burst-flush the data. 
791 * If we queue it immediately the buf could be left 792 * locked on the device queue for a very long time. 793 * 794 * However, failing to flush a dirty buffer out when 795 * issued from the pageout daemon can result in a low 796 * memory deadlock against bio_page_alloc(), so we 797 * have to bawrite() on IO_ASYNC as well. 798 * 799 * NOTE! To avoid degenerate stalls due to mismatched block 800 * sizes we only honor IO_DIRECT on the write which 801 * abuts the end of the buffer. However, we must 802 * honor IO_SYNC in case someone is silly enough to 803 * configure a HAMMER file as swap, or when HAMMER 804 * is serving NFS (for commits). Ick ick. 805 */ 806 bp->b_flags |= B_AGE; 807 if (blksize == HAMMER_XBUFSIZE) 808 bp->b_flags |= B_CLUSTEROK; 809 810 if (ap->a_ioflag & IO_SYNC) { 811 bwrite(bp); 812 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 813 bawrite(bp); 814 } else if (ap->a_ioflag & IO_ASYNC) { 815 bawrite(bp); 816 } else if (hammer_cluster_enable && 817 !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 818 if (base_offset < HAMMER_XDEMARC) 819 cluster_eof = hammer_blockdemarc(base_offset, 820 ip->ino_data.size); 821 else 822 cluster_eof = ip->ino_data.size; 823 cluster_write(bp, cluster_eof, blksize, seqcount); 824 } else { 825 bdwrite(bp); 826 } 827 } 828 hammer_done_transaction(&trans); 829 hammer_knote(ap->a_vp, kflags); 830 831 return (error); 832 } 833 834 /* 835 * hammer_vop_access { vp, mode, cred } 836 * 837 * MPSAFE - does not require fs_token 838 */ 839 static 840 int 841 hammer_vop_access(struct vop_access_args *ap) 842 { 843 struct hammer_inode *ip = VTOI(ap->a_vp); 844 uid_t uid; 845 gid_t gid; 846 int error; 847 848 ++hammer_stats_file_iopsr; 849 uid = hammer_to_unix_xid(&ip->ino_data.uid); 850 gid = hammer_to_unix_xid(&ip->ino_data.gid); 851 852 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 853 ip->ino_data.uflags); 854 return (error); 855 } 856 857 /* 858 * hammer_vop_advlock { vp, id, op, fl, flags } 859 * 860 * 
MPSAFE - does not require fs_token 861 */ 862 static 863 int 864 hammer_vop_advlock(struct vop_advlock_args *ap) 865 { 866 hammer_inode_t ip = VTOI(ap->a_vp); 867 868 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 869 } 870 871 /* 872 * hammer_vop_close { vp, fflag } 873 * 874 * We can only sync-on-close for normal closes. XXX disabled for now. 875 */ 876 static 877 int 878 hammer_vop_close(struct vop_close_args *ap) 879 { 880 #if 0 881 struct vnode *vp = ap->a_vp; 882 hammer_inode_t ip = VTOI(vp); 883 int waitfor; 884 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 885 if (vn_islocked(vp) == LK_EXCLUSIVE && 886 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 887 if (ip->flags & HAMMER_INODE_CLOSESYNC) 888 waitfor = MNT_WAIT; 889 else 890 waitfor = MNT_NOWAIT; 891 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 892 HAMMER_INODE_CLOSEASYNC); 893 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 894 } 895 } 896 #endif 897 return (vop_stdclose(ap)); 898 } 899 900 /* 901 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 902 * 903 * The operating system has already ensured that the directory entry 904 * does not exist and done all appropriate namespace locking. 905 */ 906 static 907 int 908 hammer_vop_ncreate(struct vop_ncreate_args *ap) 909 { 910 struct hammer_transaction trans; 911 struct hammer_inode *dip; 912 struct hammer_inode *nip; 913 struct nchandle *nch; 914 hammer_mount_t hmp; 915 int error; 916 917 nch = ap->a_nch; 918 dip = VTOI(ap->a_dvp); 919 hmp = dip->hmp; 920 921 if (dip->flags & HAMMER_INODE_RO) 922 return (EROFS); 923 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 924 return (error); 925 926 /* 927 * Create a transaction to cover the operations we perform. 928 */ 929 lwkt_gettoken(&hmp->fs_token); 930 hammer_start_transaction(&trans, hmp); 931 ++hammer_stats_file_iopsw; 932 933 /* 934 * Create a new filesystem object of the requested type. 
The 935 * returned inode will be referenced and shared-locked to prevent 936 * it from being moved to the flusher. 937 */ 938 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 939 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 940 NULL, &nip); 941 if (error) { 942 hkprintf("hammer_create_inode error %d\n", error); 943 hammer_done_transaction(&trans); 944 *ap->a_vpp = NULL; 945 lwkt_reltoken(&hmp->fs_token); 946 return (error); 947 } 948 949 /* 950 * Add the new filesystem object to the directory. This will also 951 * bump the inode's link count. 952 */ 953 error = hammer_ip_add_directory(&trans, dip, 954 nch->ncp->nc_name, nch->ncp->nc_nlen, 955 nip); 956 if (error) 957 hkprintf("hammer_ip_add_directory error %d\n", error); 958 959 /* 960 * Finish up. 961 */ 962 if (error) { 963 hammer_rel_inode(nip, 0); 964 hammer_done_transaction(&trans); 965 *ap->a_vpp = NULL; 966 } else { 967 error = hammer_get_vnode(nip, ap->a_vpp); 968 hammer_done_transaction(&trans); 969 hammer_rel_inode(nip, 0); 970 if (error == 0) { 971 cache_setunresolved(ap->a_nch); 972 cache_setvp(ap->a_nch, *ap->a_vpp); 973 } 974 hammer_knote(ap->a_dvp, NOTE_WRITE); 975 } 976 lwkt_reltoken(&hmp->fs_token); 977 return (error); 978 } 979 980 /* 981 * hammer_vop_getattr { vp, vap } 982 * 983 * Retrieve an inode's attribute information. When accessing inodes 984 * historically we fake the atime field to ensure consistent results. 985 * The atime field is stored in the B-Tree element and allowed to be 986 * updated without cycling the element. 987 * 988 * MPSAFE - does not require fs_token 989 */ 990 static 991 int 992 hammer_vop_getattr(struct vop_getattr_args *ap) 993 { 994 struct hammer_inode *ip = VTOI(ap->a_vp); 995 struct vattr *vap = ap->a_vap; 996 997 /* 998 * We want the fsid to be different when accessing a filesystem 999 * with different as-of's so programs like diff don't think 1000 * the files are the same. 
1001 * 1002 * We also want the fsid to be the same when comparing snapshots, 1003 * or when comparing mirrors (which might be backed by different 1004 * physical devices). HAMMER fsids are based on the PFS's 1005 * shared_uuid field. 1006 * 1007 * XXX there is a chance of collision here. The va_fsid reported 1008 * by stat is different from the more involved fsid used in the 1009 * mount structure. 1010 */ 1011 ++hammer_stats_file_iopsr; 1012 hammer_lock_sh(&ip->lock); 1013 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 1014 (u_int32_t)(ip->obj_asof >> 32); 1015 1016 vap->va_fileid = ip->ino_leaf.base.obj_id; 1017 vap->va_mode = ip->ino_data.mode; 1018 vap->va_nlink = ip->ino_data.nlinks; 1019 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1020 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1021 vap->va_rmajor = 0; 1022 vap->va_rminor = 0; 1023 vap->va_size = ip->ino_data.size; 1024 1025 /* 1026 * Special case for @@PFS softlinks. The actual size of the 1027 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1028 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1029 * 1030 * Note that userspace hammer command does not allow users to 1031 * create a @@PFS softlink under an existing other PFS (id!=0) 1032 * so the ip localization here for @@PFS softlink is always 0. 1033 */ 1034 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1035 ip->ino_data.size == 10 && 1036 ip->obj_asof == HAMMER_MAX_TID && 1037 ip->obj_localization == 0 && 1038 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1039 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1040 vap->va_size = 26; 1041 else 1042 vap->va_size = 10; 1043 } 1044 1045 /* 1046 * We must provide a consistent atime and mtime for snapshots 1047 * so people can do a 'tar cf - ... | md5' on them and get 1048 * consistent results. 
1049 */ 1050 if (ip->flags & HAMMER_INODE_RO) { 1051 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1052 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1053 } else { 1054 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1055 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1056 } 1057 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1058 vap->va_flags = ip->ino_data.uflags; 1059 vap->va_gen = 1; /* hammer inums are unique for all time */ 1060 vap->va_blocksize = HAMMER_BUFSIZE; 1061 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1062 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1063 ~HAMMER_XBUFMASK64; 1064 } else if (ip->ino_data.size > HAMMER_HBUFSIZE) { 1065 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1066 ~HAMMER_BUFMASK64; 1067 } else { 1068 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1069 } 1070 1071 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1072 vap->va_filerev = 0; /* XXX */ 1073 vap->va_uid_uuid = ip->ino_data.uid; 1074 vap->va_gid_uuid = ip->ino_data.gid; 1075 vap->va_fsid_uuid = ip->hmp->fsid; 1076 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1077 VA_FSID_UUID_VALID; 1078 1079 switch (ip->ino_data.obj_type) { 1080 case HAMMER_OBJTYPE_CDEV: 1081 case HAMMER_OBJTYPE_BDEV: 1082 vap->va_rmajor = ip->ino_data.rmajor; 1083 vap->va_rminor = ip->ino_data.rminor; 1084 break; 1085 default: 1086 break; 1087 } 1088 hammer_unlock(&ip->lock); 1089 return(0); 1090 } 1091 1092 /* 1093 * hammer_vop_nresolve { nch, dvp, cred } 1094 * 1095 * Locate the requested directory entry. 
1096 */ 1097 static 1098 int 1099 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1100 { 1101 struct hammer_transaction trans; 1102 struct namecache *ncp; 1103 hammer_mount_t hmp; 1104 hammer_inode_t dip; 1105 hammer_inode_t ip; 1106 hammer_tid_t asof; 1107 struct hammer_cursor cursor; 1108 struct vnode *vp; 1109 int64_t namekey; 1110 int error; 1111 int i; 1112 int nlen; 1113 int flags; 1114 int ispfs; 1115 int64_t obj_id; 1116 u_int32_t localization; 1117 u_int32_t max_iterations; 1118 1119 /* 1120 * Misc initialization, plus handle as-of name extensions. Look for 1121 * the '@@' extension. Note that as-of files and directories cannot 1122 * be modified. 1123 */ 1124 dip = VTOI(ap->a_dvp); 1125 ncp = ap->a_nch->ncp; 1126 asof = dip->obj_asof; 1127 localization = dip->obj_localization; /* for code consistency */ 1128 nlen = ncp->nc_nlen; 1129 flags = dip->flags & HAMMER_INODE_RO; 1130 ispfs = 0; 1131 hmp = dip->hmp; 1132 1133 lwkt_gettoken(&hmp->fs_token); 1134 hammer_simple_transaction(&trans, hmp); 1135 ++hammer_stats_file_iopsr; 1136 1137 for (i = 0; i < nlen; ++i) { 1138 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1139 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1140 &ispfs, &asof, &localization); 1141 if (error != 0) { 1142 i = nlen; 1143 break; 1144 } 1145 if (asof != HAMMER_MAX_TID) 1146 flags |= HAMMER_INODE_RO; 1147 break; 1148 } 1149 } 1150 nlen = i; 1151 1152 /* 1153 * If this is a PFS softlink we dive into the PFS 1154 */ 1155 if (ispfs && nlen == 0) { 1156 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1157 asof, localization, 1158 flags, &error); 1159 if (error == 0) { 1160 error = hammer_get_vnode(ip, &vp); 1161 hammer_rel_inode(ip, 0); 1162 } else { 1163 vp = NULL; 1164 } 1165 if (error == 0) { 1166 vn_unlock(vp); 1167 cache_setvp(ap->a_nch, vp); 1168 vrele(vp); 1169 } 1170 goto done; 1171 } 1172 1173 /* 1174 * If there is no path component the time extension is relative to dip. 1175 * e.g. 
"fubar/@@<snapshot>" 1176 * 1177 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1178 * e.g. "fubar/.@@<snapshot>" 1179 * 1180 * ".." is handled by the kernel. We do not currently handle 1181 * "..@<snapshot>". 1182 */ 1183 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1184 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1185 asof, dip->obj_localization, 1186 flags, &error); 1187 if (error == 0) { 1188 error = hammer_get_vnode(ip, &vp); 1189 hammer_rel_inode(ip, 0); 1190 } else { 1191 vp = NULL; 1192 } 1193 if (error == 0) { 1194 vn_unlock(vp); 1195 cache_setvp(ap->a_nch, vp); 1196 vrele(vp); 1197 } 1198 goto done; 1199 } 1200 1201 /* 1202 * Calculate the namekey and setup the key range for the scan. This 1203 * works kinda like a chained hash table where the lower 32 bits 1204 * of the namekey synthesize the chain. 1205 * 1206 * The key range is inclusive of both key_beg and key_end. 1207 */ 1208 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 1209 &max_iterations); 1210 1211 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1212 cursor.key_beg.localization = dip->obj_localization + 1213 hammer_dir_localization(dip); 1214 cursor.key_beg.obj_id = dip->obj_id; 1215 cursor.key_beg.key = namekey; 1216 cursor.key_beg.create_tid = 0; 1217 cursor.key_beg.delete_tid = 0; 1218 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1219 cursor.key_beg.obj_type = 0; 1220 1221 cursor.key_end = cursor.key_beg; 1222 cursor.key_end.key += max_iterations; 1223 cursor.asof = asof; 1224 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1225 1226 /* 1227 * Scan all matching records (the chain), locate the one matching 1228 * the requested path component. 1229 * 1230 * The hammer_ip_*() functions merge in-memory records with on-disk 1231 * records for the purposes of the search. 
1232 */ 1233 obj_id = 0; 1234 localization = HAMMER_DEF_LOCALIZATION; 1235 1236 if (error == 0) { 1237 error = hammer_ip_first(&cursor); 1238 while (error == 0) { 1239 error = hammer_ip_resolve_data(&cursor); 1240 if (error) 1241 break; 1242 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1243 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1244 obj_id = cursor.data->entry.obj_id; 1245 localization = cursor.data->entry.localization; 1246 break; 1247 } 1248 error = hammer_ip_next(&cursor); 1249 } 1250 } 1251 hammer_done_cursor(&cursor); 1252 1253 /* 1254 * Lookup the obj_id. This should always succeed. If it does not 1255 * the filesystem may be damaged and we return a dummy inode. 1256 */ 1257 if (error == 0) { 1258 ip = hammer_get_inode(&trans, dip, obj_id, 1259 asof, localization, 1260 flags, &error); 1261 if (error == ENOENT) { 1262 hkprintf("WARNING: Missing inode for dirent \"%s\"\n" 1263 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n", 1264 ncp->nc_name, 1265 (long long)obj_id, (long long)asof, 1266 localization); 1267 error = 0; 1268 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1269 asof, localization, 1270 flags, &error); 1271 } 1272 if (error == 0) { 1273 error = hammer_get_vnode(ip, &vp); 1274 hammer_rel_inode(ip, 0); 1275 } else { 1276 vp = NULL; 1277 } 1278 if (error == 0) { 1279 vn_unlock(vp); 1280 cache_setvp(ap->a_nch, vp); 1281 vrele(vp); 1282 } 1283 } else if (error == ENOENT) { 1284 cache_setvp(ap->a_nch, NULL); 1285 } 1286 done: 1287 hammer_done_transaction(&trans); 1288 lwkt_reltoken(&hmp->fs_token); 1289 return (error); 1290 } 1291 1292 /* 1293 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1294 * 1295 * Locate the parent directory of a directory vnode. 1296 * 1297 * dvp is referenced but not locked. *vpp must be returned referenced and 1298 * locked. A parent_obj_id of 0 does not necessarily indicate that we are 1299 * at the root, instead it could indicate that the directory we were in was 1300 * removed. 
1301 * 1302 * NOTE: as-of sequences are not linked into the directory structure. If 1303 * we are at the root with a different asof then the mount point, reload 1304 * the same directory with the mount point's asof. I'm not sure what this 1305 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1306 * get confused, but it hasn't been tested. 1307 */ 1308 static 1309 int 1310 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1311 { 1312 struct hammer_transaction trans; 1313 struct hammer_inode *dip; 1314 struct hammer_inode *ip; 1315 hammer_mount_t hmp; 1316 int64_t parent_obj_id; 1317 u_int32_t parent_obj_localization; 1318 hammer_tid_t asof; 1319 int error; 1320 1321 dip = VTOI(ap->a_dvp); 1322 asof = dip->obj_asof; 1323 hmp = dip->hmp; 1324 1325 /* 1326 * Whos are parent? This could be the root of a pseudo-filesystem 1327 * whos parent is in another localization domain. 1328 */ 1329 lwkt_gettoken(&hmp->fs_token); 1330 parent_obj_id = dip->ino_data.parent_obj_id; 1331 if (dip->obj_id == HAMMER_OBJID_ROOT) 1332 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1333 else 1334 parent_obj_localization = dip->obj_localization; 1335 1336 /* 1337 * It's probably a PFS root when dip->ino_data.parent_obj_id is 0. 
1338 */ 1339 if (parent_obj_id == 0) { 1340 if (dip->obj_id == HAMMER_OBJID_ROOT && 1341 asof != hmp->asof) { 1342 parent_obj_id = dip->obj_id; 1343 asof = hmp->asof; 1344 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1345 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1346 (long long)dip->obj_asof); 1347 } else { 1348 *ap->a_vpp = NULL; 1349 lwkt_reltoken(&hmp->fs_token); 1350 return ENOENT; 1351 } 1352 } 1353 1354 hammer_simple_transaction(&trans, hmp); 1355 ++hammer_stats_file_iopsr; 1356 1357 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1358 asof, parent_obj_localization, 1359 dip->flags, &error); 1360 if (ip) { 1361 error = hammer_get_vnode(ip, ap->a_vpp); 1362 hammer_rel_inode(ip, 0); 1363 } else { 1364 *ap->a_vpp = NULL; 1365 } 1366 hammer_done_transaction(&trans); 1367 lwkt_reltoken(&hmp->fs_token); 1368 return (error); 1369 } 1370 1371 /* 1372 * hammer_vop_nlink { nch, dvp, vp, cred } 1373 */ 1374 static 1375 int 1376 hammer_vop_nlink(struct vop_nlink_args *ap) 1377 { 1378 struct hammer_transaction trans; 1379 struct hammer_inode *dip; 1380 struct hammer_inode *ip; 1381 struct nchandle *nch; 1382 hammer_mount_t hmp; 1383 int error; 1384 1385 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1386 return(EXDEV); 1387 1388 nch = ap->a_nch; 1389 dip = VTOI(ap->a_dvp); 1390 ip = VTOI(ap->a_vp); 1391 hmp = dip->hmp; 1392 1393 if (dip->obj_localization != ip->obj_localization) 1394 return(EXDEV); 1395 1396 if (dip->flags & HAMMER_INODE_RO) 1397 return (EROFS); 1398 if (ip->flags & HAMMER_INODE_RO) 1399 return (EROFS); 1400 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1401 return (error); 1402 1403 /* 1404 * Create a transaction to cover the operations we perform. 1405 */ 1406 lwkt_gettoken(&hmp->fs_token); 1407 hammer_start_transaction(&trans, hmp); 1408 ++hammer_stats_file_iopsw; 1409 1410 /* 1411 * Add the filesystem object to the directory. Note that neither 1412 * dip nor ip are referenced or locked, but their vnodes are 1413 * referenced. 
This function will bump the inode's link count. 1414 */ 1415 error = hammer_ip_add_directory(&trans, dip, 1416 nch->ncp->nc_name, nch->ncp->nc_nlen, 1417 ip); 1418 1419 /* 1420 * Finish up. 1421 */ 1422 if (error == 0) { 1423 cache_setunresolved(nch); 1424 cache_setvp(nch, ap->a_vp); 1425 } 1426 hammer_done_transaction(&trans); 1427 hammer_knote(ap->a_vp, NOTE_LINK); 1428 hammer_knote(ap->a_dvp, NOTE_WRITE); 1429 lwkt_reltoken(&hmp->fs_token); 1430 return (error); 1431 } 1432 1433 /* 1434 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1435 * 1436 * The operating system has already ensured that the directory entry 1437 * does not exist and done all appropriate namespace locking. 1438 */ 1439 static 1440 int 1441 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1442 { 1443 struct hammer_transaction trans; 1444 struct hammer_inode *dip; 1445 struct hammer_inode *nip; 1446 struct nchandle *nch; 1447 hammer_mount_t hmp; 1448 int error; 1449 1450 nch = ap->a_nch; 1451 dip = VTOI(ap->a_dvp); 1452 hmp = dip->hmp; 1453 1454 if (dip->flags & HAMMER_INODE_RO) 1455 return (EROFS); 1456 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1457 return (error); 1458 1459 /* 1460 * Create a transaction to cover the operations we perform. 1461 */ 1462 lwkt_gettoken(&hmp->fs_token); 1463 hammer_start_transaction(&trans, hmp); 1464 ++hammer_stats_file_iopsw; 1465 1466 /* 1467 * Create a new filesystem object of the requested type. The 1468 * returned inode will be referenced but not locked. 1469 */ 1470 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1471 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1472 NULL, &nip); 1473 if (error) { 1474 hkprintf("hammer_mkdir error %d\n", error); 1475 hammer_done_transaction(&trans); 1476 *ap->a_vpp = NULL; 1477 lwkt_reltoken(&hmp->fs_token); 1478 return (error); 1479 } 1480 /* 1481 * Add the new filesystem object to the directory. This will also 1482 * bump the inode's link count. 
1483 */ 1484 error = hammer_ip_add_directory(&trans, dip, 1485 nch->ncp->nc_name, nch->ncp->nc_nlen, 1486 nip); 1487 if (error) 1488 hkprintf("hammer_mkdir (add) error %d\n", error); 1489 1490 /* 1491 * Finish up. 1492 */ 1493 if (error) { 1494 hammer_rel_inode(nip, 0); 1495 *ap->a_vpp = NULL; 1496 } else { 1497 error = hammer_get_vnode(nip, ap->a_vpp); 1498 hammer_rel_inode(nip, 0); 1499 if (error == 0) { 1500 cache_setunresolved(ap->a_nch); 1501 cache_setvp(ap->a_nch, *ap->a_vpp); 1502 } 1503 } 1504 hammer_done_transaction(&trans); 1505 if (error == 0) 1506 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1507 lwkt_reltoken(&hmp->fs_token); 1508 return (error); 1509 } 1510 1511 /* 1512 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1513 * 1514 * The operating system has already ensured that the directory entry 1515 * does not exist and done all appropriate namespace locking. 1516 */ 1517 static 1518 int 1519 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1520 { 1521 struct hammer_transaction trans; 1522 struct hammer_inode *dip; 1523 struct hammer_inode *nip; 1524 struct nchandle *nch; 1525 hammer_mount_t hmp; 1526 int error; 1527 1528 nch = ap->a_nch; 1529 dip = VTOI(ap->a_dvp); 1530 hmp = dip->hmp; 1531 1532 if (dip->flags & HAMMER_INODE_RO) 1533 return (EROFS); 1534 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1535 return (error); 1536 1537 /* 1538 * Create a transaction to cover the operations we perform. 1539 */ 1540 lwkt_gettoken(&hmp->fs_token); 1541 hammer_start_transaction(&trans, hmp); 1542 ++hammer_stats_file_iopsw; 1543 1544 /* 1545 * Create a new filesystem object of the requested type. The 1546 * returned inode will be referenced but not locked. 1547 * 1548 * If mknod specifies a directory a pseudo-fs is created. 
1549 */ 1550 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1551 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1552 NULL, &nip); 1553 if (error) { 1554 hammer_done_transaction(&trans); 1555 *ap->a_vpp = NULL; 1556 lwkt_reltoken(&hmp->fs_token); 1557 return (error); 1558 } 1559 1560 /* 1561 * Add the new filesystem object to the directory. This will also 1562 * bump the inode's link count. 1563 */ 1564 error = hammer_ip_add_directory(&trans, dip, 1565 nch->ncp->nc_name, nch->ncp->nc_nlen, 1566 nip); 1567 1568 /* 1569 * Finish up. 1570 */ 1571 if (error) { 1572 hammer_rel_inode(nip, 0); 1573 *ap->a_vpp = NULL; 1574 } else { 1575 error = hammer_get_vnode(nip, ap->a_vpp); 1576 hammer_rel_inode(nip, 0); 1577 if (error == 0) { 1578 cache_setunresolved(ap->a_nch); 1579 cache_setvp(ap->a_nch, *ap->a_vpp); 1580 } 1581 } 1582 hammer_done_transaction(&trans); 1583 if (error == 0) 1584 hammer_knote(ap->a_dvp, NOTE_WRITE); 1585 lwkt_reltoken(&hmp->fs_token); 1586 return (error); 1587 } 1588 1589 /* 1590 * hammer_vop_open { vp, mode, cred, fp } 1591 * 1592 * MPSAFE (does not require fs_token) 1593 */ 1594 static 1595 int 1596 hammer_vop_open(struct vop_open_args *ap) 1597 { 1598 hammer_inode_t ip; 1599 1600 ++hammer_stats_file_iopsr; 1601 ip = VTOI(ap->a_vp); 1602 1603 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1604 return (EROFS); 1605 return(vop_stdopen(ap)); 1606 } 1607 1608 /* 1609 * hammer_vop_print { vp } 1610 */ 1611 static 1612 int 1613 hammer_vop_print(struct vop_print_args *ap) 1614 { 1615 return EOPNOTSUPP; 1616 } 1617 1618 /* 1619 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1620 */ 1621 static 1622 int 1623 hammer_vop_readdir(struct vop_readdir_args *ap) 1624 { 1625 struct hammer_transaction trans; 1626 struct hammer_cursor cursor; 1627 struct hammer_inode *ip; 1628 hammer_mount_t hmp; 1629 struct uio *uio; 1630 hammer_base_elm_t base; 1631 int error; 1632 int cookie_index; 1633 int ncookies; 1634 off_t 
*cookies; 1635 off_t saveoff; 1636 int r; 1637 int dtype; 1638 1639 ++hammer_stats_file_iopsr; 1640 ip = VTOI(ap->a_vp); 1641 uio = ap->a_uio; 1642 saveoff = uio->uio_offset; 1643 hmp = ip->hmp; 1644 1645 if (ap->a_ncookies) { 1646 ncookies = uio->uio_resid / 16 + 1; 1647 if (ncookies > 1024) 1648 ncookies = 1024; 1649 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1650 cookie_index = 0; 1651 } else { 1652 ncookies = -1; 1653 cookies = NULL; 1654 cookie_index = 0; 1655 } 1656 1657 lwkt_gettoken(&hmp->fs_token); 1658 hammer_simple_transaction(&trans, hmp); 1659 1660 /* 1661 * Handle artificial entries 1662 * 1663 * It should be noted that the minimum value for a directory 1664 * hash key on-media is 0x0000000100000000, so we can use anything 1665 * less then that to represent our 'special' key space. 1666 */ 1667 error = 0; 1668 if (saveoff == 0) { 1669 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1670 if (r) 1671 goto done; 1672 if (cookies) 1673 cookies[cookie_index] = saveoff; 1674 ++saveoff; 1675 ++cookie_index; 1676 if (cookie_index == ncookies) 1677 goto done; 1678 } 1679 if (saveoff == 1) { 1680 if (ip->ino_data.parent_obj_id) { 1681 r = vop_write_dirent(&error, uio, 1682 ip->ino_data.parent_obj_id, 1683 DT_DIR, 2, ".."); 1684 } else { 1685 r = vop_write_dirent(&error, uio, 1686 ip->obj_id, DT_DIR, 2, ".."); 1687 } 1688 if (r) 1689 goto done; 1690 if (cookies) 1691 cookies[cookie_index] = saveoff; 1692 ++saveoff; 1693 ++cookie_index; 1694 if (cookie_index == ncookies) 1695 goto done; 1696 } 1697 1698 /* 1699 * Key range (begin and end inclusive) to scan. Directory keys 1700 * directly translate to a 64 bit 'seek' position. 
1701 */ 1702 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1703 cursor.key_beg.localization = ip->obj_localization + 1704 hammer_dir_localization(ip); 1705 cursor.key_beg.obj_id = ip->obj_id; 1706 cursor.key_beg.create_tid = 0; 1707 cursor.key_beg.delete_tid = 0; 1708 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1709 cursor.key_beg.obj_type = 0; 1710 cursor.key_beg.key = saveoff; 1711 1712 cursor.key_end = cursor.key_beg; 1713 cursor.key_end.key = HAMMER_MAX_KEY; 1714 cursor.asof = ip->obj_asof; 1715 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1716 1717 error = hammer_ip_first(&cursor); 1718 1719 while (error == 0) { 1720 error = hammer_ip_resolve_data(&cursor); 1721 if (error) 1722 break; 1723 base = &cursor.leaf->base; 1724 saveoff = base->key; 1725 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1726 1727 if (base->obj_id != ip->obj_id) 1728 hpanic("bad record at %p", cursor.node); 1729 1730 /* 1731 * Convert pseudo-filesystems into softlinks 1732 */ 1733 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1734 r = vop_write_dirent( 1735 &error, uio, cursor.data->entry.obj_id, 1736 dtype, 1737 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1738 (void *)cursor.data->entry.name); 1739 if (r) 1740 break; 1741 ++saveoff; 1742 if (cookies) 1743 cookies[cookie_index] = base->key; 1744 ++cookie_index; 1745 if (cookie_index == ncookies) 1746 break; 1747 error = hammer_ip_next(&cursor); 1748 } 1749 hammer_done_cursor(&cursor); 1750 1751 done: 1752 hammer_done_transaction(&trans); 1753 1754 if (ap->a_eofflag) 1755 *ap->a_eofflag = (error == ENOENT); 1756 uio->uio_offset = saveoff; 1757 if (error && cookie_index == 0) { 1758 if (error == ENOENT) 1759 error = 0; 1760 if (cookies) { 1761 kfree(cookies, M_TEMP); 1762 *ap->a_ncookies = 0; 1763 *ap->a_cookies = NULL; 1764 } 1765 } else { 1766 if (error == ENOENT) 1767 error = 0; 1768 if (cookies) { 1769 *ap->a_ncookies = cookie_index; 1770 *ap->a_cookies = cookies; 1771 } 1772 } 
1773 lwkt_reltoken(&hmp->fs_token); 1774 return(error); 1775 } 1776 1777 /* 1778 * hammer_vop_readlink { vp, uio, cred } 1779 */ 1780 static 1781 int 1782 hammer_vop_readlink(struct vop_readlink_args *ap) 1783 { 1784 struct hammer_transaction trans; 1785 struct hammer_cursor cursor; 1786 struct hammer_inode *ip; 1787 hammer_mount_t hmp; 1788 char buf[32]; 1789 u_int32_t localization; 1790 hammer_pseudofs_inmem_t pfsm; 1791 int error; 1792 1793 ip = VTOI(ap->a_vp); 1794 hmp = ip->hmp; 1795 1796 lwkt_gettoken(&hmp->fs_token); 1797 1798 /* 1799 * Shortcut if the symlink data was stuffed into ino_data. 1800 * 1801 * Also expand special "@@PFS%05d" softlinks (expansion only 1802 * occurs for non-historical (current) accesses made from the 1803 * primary filesystem). 1804 * 1805 * Note that userspace hammer command does not allow users to 1806 * create a @@PFS softlink under an existing other PFS (id!=0) 1807 * so the ip localization here for @@PFS softlink is always 0. 1808 */ 1809 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1810 char *ptr; 1811 int bytes; 1812 1813 ptr = ip->ino_data.ext.symlink; 1814 bytes = (int)ip->ino_data.size; 1815 if (bytes == 10 && 1816 ip->obj_asof == HAMMER_MAX_TID && 1817 ip->obj_localization == 0 && 1818 strncmp(ptr, "@@PFS", 5) == 0) { 1819 hammer_simple_transaction(&trans, hmp); 1820 bcopy(ptr + 5, buf, 5); 1821 buf[5] = 0; 1822 localization = strtoul(buf, NULL, 10) << 16; 1823 pfsm = hammer_load_pseudofs(&trans, localization, 1824 &error); 1825 if (error == 0) { 1826 if (pfsm->pfsd.mirror_flags & 1827 HAMMER_PFSD_SLAVE) { 1828 /* vap->va_size == 26 */ 1829 ksnprintf(buf, sizeof(buf), 1830 "@@0x%016llx:%05d", 1831 (long long)pfsm->pfsd.sync_end_tid, 1832 localization >> 16); 1833 } else { 1834 /* vap->va_size == 10 */ 1835 ksnprintf(buf, sizeof(buf), 1836 "@@-1:%05d", 1837 localization >> 16); 1838 #if 0 1839 ksnprintf(buf, sizeof(buf), 1840 "@@0x%016llx:%05d", 1841 (long long)HAMMER_MAX_TID, 1842 localization >> 16); 1843 #endif 
1844 } 1845 ptr = buf; 1846 bytes = strlen(buf); 1847 } 1848 if (pfsm) 1849 hammer_rel_pseudofs(hmp, pfsm); 1850 hammer_done_transaction(&trans); 1851 } 1852 error = uiomove(ptr, bytes, ap->a_uio); 1853 lwkt_reltoken(&hmp->fs_token); 1854 return(error); 1855 } 1856 1857 /* 1858 * Long version 1859 */ 1860 hammer_simple_transaction(&trans, hmp); 1861 ++hammer_stats_file_iopsr; 1862 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1863 1864 /* 1865 * Key range (begin and end inclusive) to scan. Directory keys 1866 * directly translate to a 64 bit 'seek' position. 1867 */ 1868 cursor.key_beg.localization = ip->obj_localization + 1869 HAMMER_LOCALIZE_MISC; 1870 cursor.key_beg.obj_id = ip->obj_id; 1871 cursor.key_beg.create_tid = 0; 1872 cursor.key_beg.delete_tid = 0; 1873 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1874 cursor.key_beg.obj_type = 0; 1875 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1876 cursor.asof = ip->obj_asof; 1877 cursor.flags |= HAMMER_CURSOR_ASOF; 1878 1879 error = hammer_ip_lookup(&cursor); 1880 if (error == 0) { 1881 error = hammer_ip_resolve_data(&cursor); 1882 if (error == 0) { 1883 KKASSERT(cursor.leaf->data_len >= 1884 HAMMER_SYMLINK_NAME_OFF); 1885 error = uiomove(cursor.data->symlink.name, 1886 cursor.leaf->data_len - 1887 HAMMER_SYMLINK_NAME_OFF, 1888 ap->a_uio); 1889 } 1890 } 1891 hammer_done_cursor(&cursor); 1892 hammer_done_transaction(&trans); 1893 lwkt_reltoken(&hmp->fs_token); 1894 return(error); 1895 } 1896 1897 /* 1898 * hammer_vop_nremove { nch, dvp, cred } 1899 */ 1900 static 1901 int 1902 hammer_vop_nremove(struct vop_nremove_args *ap) 1903 { 1904 struct hammer_transaction trans; 1905 struct hammer_inode *dip; 1906 hammer_mount_t hmp; 1907 int error; 1908 1909 dip = VTOI(ap->a_dvp); 1910 hmp = dip->hmp; 1911 1912 if (hammer_nohistory(dip) == 0 && 1913 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1914 return (error); 1915 } 1916 1917 lwkt_gettoken(&hmp->fs_token); 1918 hammer_start_transaction(&trans, 
hmp); 1919 ++hammer_stats_file_iopsw; 1920 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1921 hammer_done_transaction(&trans); 1922 if (error == 0) 1923 hammer_knote(ap->a_dvp, NOTE_WRITE); 1924 lwkt_reltoken(&hmp->fs_token); 1925 return (error); 1926 } 1927 1928 /* 1929 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1930 */ 1931 static 1932 int 1933 hammer_vop_nrename(struct vop_nrename_args *ap) 1934 { 1935 struct hammer_transaction trans; 1936 struct namecache *fncp; 1937 struct namecache *tncp; 1938 struct hammer_inode *fdip; 1939 struct hammer_inode *tdip; 1940 struct hammer_inode *ip; 1941 hammer_mount_t hmp; 1942 struct hammer_cursor cursor; 1943 int64_t namekey; 1944 u_int32_t max_iterations; 1945 int nlen, error; 1946 1947 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1948 return(EXDEV); 1949 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1950 return(EXDEV); 1951 1952 fdip = VTOI(ap->a_fdvp); 1953 tdip = VTOI(ap->a_tdvp); 1954 fncp = ap->a_fnch->ncp; 1955 tncp = ap->a_tnch->ncp; 1956 ip = VTOI(fncp->nc_vp); 1957 KKASSERT(ip != NULL); 1958 1959 hmp = ip->hmp; 1960 1961 if (fdip->obj_localization != tdip->obj_localization) 1962 return(EXDEV); 1963 if (fdip->obj_localization != ip->obj_localization) 1964 return(EXDEV); 1965 1966 if (fdip->flags & HAMMER_INODE_RO) 1967 return (EROFS); 1968 if (tdip->flags & HAMMER_INODE_RO) 1969 return (EROFS); 1970 if (ip->flags & HAMMER_INODE_RO) 1971 return (EROFS); 1972 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1973 return (error); 1974 1975 lwkt_gettoken(&hmp->fs_token); 1976 hammer_start_transaction(&trans, hmp); 1977 ++hammer_stats_file_iopsw; 1978 1979 /* 1980 * Remove tncp from the target directory and then link ip as 1981 * tncp. XXX pass trans to dounlink 1982 * 1983 * Force the inode sync-time to match the transaction so it is 1984 * in-sync with the creation of the target directory entry. 
1985 */ 1986 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1987 ap->a_cred, 0, -1); 1988 if (error == 0 || error == ENOENT) { 1989 error = hammer_ip_add_directory(&trans, tdip, 1990 tncp->nc_name, tncp->nc_nlen, 1991 ip); 1992 if (error == 0) { 1993 ip->ino_data.parent_obj_id = tdip->obj_id; 1994 ip->ino_data.ctime = trans.time; 1995 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 1996 } 1997 } 1998 if (error) 1999 goto failed; /* XXX */ 2000 2001 /* 2002 * Locate the record in the originating directory and remove it. 2003 * 2004 * Calculate the namekey and setup the key range for the scan. This 2005 * works kinda like a chained hash table where the lower 32 bits 2006 * of the namekey synthesize the chain. 2007 * 2008 * The key range is inclusive of both key_beg and key_end. 2009 */ 2010 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 2011 &max_iterations); 2012 retry: 2013 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 2014 cursor.key_beg.localization = fdip->obj_localization + 2015 hammer_dir_localization(fdip); 2016 cursor.key_beg.obj_id = fdip->obj_id; 2017 cursor.key_beg.key = namekey; 2018 cursor.key_beg.create_tid = 0; 2019 cursor.key_beg.delete_tid = 0; 2020 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2021 cursor.key_beg.obj_type = 0; 2022 2023 cursor.key_end = cursor.key_beg; 2024 cursor.key_end.key += max_iterations; 2025 cursor.asof = fdip->obj_asof; 2026 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2027 2028 /* 2029 * Scan all matching records (the chain), locate the one matching 2030 * the requested path component. 2031 * 2032 * The hammer_ip_*() functions merge in-memory records with on-disk 2033 * records for the purposes of the search. 
2034 */ 2035 error = hammer_ip_first(&cursor); 2036 while (error == 0) { 2037 if (hammer_ip_resolve_data(&cursor) != 0) 2038 break; 2039 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2040 KKASSERT(nlen > 0); 2041 if (fncp->nc_nlen == nlen && 2042 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2043 break; 2044 } 2045 error = hammer_ip_next(&cursor); 2046 } 2047 2048 /* 2049 * If all is ok we have to get the inode so we can adjust nlinks. 2050 * 2051 * WARNING: hammer_ip_del_directory() may have to terminate the 2052 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2053 * twice. 2054 */ 2055 if (error == 0) 2056 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 2057 2058 /* 2059 * XXX A deadlock here will break rename's atomicy for the purposes 2060 * of crash recovery. 2061 */ 2062 if (error == EDEADLK) { 2063 hammer_done_cursor(&cursor); 2064 goto retry; 2065 } 2066 2067 /* 2068 * Cleanup and tell the kernel that the rename succeeded. 2069 * 2070 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2071 * without formally acquiring the vp since the vp might 2072 * have zero refs on it, or in the middle of a reclaim, 2073 * etc. 
2074 */ 2075 hammer_done_cursor(&cursor); 2076 if (error == 0) { 2077 cache_rename(ap->a_fnch, ap->a_tnch); 2078 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2079 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2080 while (ip->vp) { 2081 struct vnode *vp; 2082 2083 error = hammer_get_vnode(ip, &vp); 2084 if (error == 0 && vp) { 2085 vn_unlock(vp); 2086 hammer_knote(ip->vp, NOTE_RENAME); 2087 vrele(vp); 2088 break; 2089 } 2090 hdkprintf("ip/vp race2 avoided\n"); 2091 } 2092 } 2093 2094 failed: 2095 hammer_done_transaction(&trans); 2096 lwkt_reltoken(&hmp->fs_token); 2097 return (error); 2098 } 2099 2100 /* 2101 * hammer_vop_nrmdir { nch, dvp, cred } 2102 */ 2103 static 2104 int 2105 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2106 { 2107 struct hammer_transaction trans; 2108 struct hammer_inode *dip; 2109 hammer_mount_t hmp; 2110 int error; 2111 2112 dip = VTOI(ap->a_dvp); 2113 hmp = dip->hmp; 2114 2115 if (hammer_nohistory(dip) == 0 && 2116 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2117 return (error); 2118 } 2119 2120 lwkt_gettoken(&hmp->fs_token); 2121 hammer_start_transaction(&trans, hmp); 2122 ++hammer_stats_file_iopsw; 2123 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2124 hammer_done_transaction(&trans); 2125 if (error == 0) 2126 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2127 lwkt_reltoken(&hmp->fs_token); 2128 return (error); 2129 } 2130 2131 /* 2132 * hammer_vop_markatime { vp, cred } 2133 */ 2134 static 2135 int 2136 hammer_vop_markatime(struct vop_markatime_args *ap) 2137 { 2138 struct hammer_transaction trans; 2139 struct hammer_inode *ip; 2140 hammer_mount_t hmp; 2141 2142 ip = VTOI(ap->a_vp); 2143 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2144 return (EROFS); 2145 if (ip->flags & HAMMER_INODE_RO) 2146 return (EROFS); 2147 hmp = ip->hmp; 2148 if (hmp->mp->mnt_flag & MNT_NOATIME) 2149 return (0); 2150 lwkt_gettoken(&hmp->fs_token); 2151 hammer_start_transaction(&trans, hmp); 2152 ++hammer_stats_file_iopsw; 
2153 2154 ip->ino_data.atime = trans.time; 2155 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2156 hammer_done_transaction(&trans); 2157 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2158 lwkt_reltoken(&hmp->fs_token); 2159 return (0); 2160 } 2161 2162 /* 2163 * hammer_vop_setattr { vp, vap, cred } 2164 */ 2165 static 2166 int 2167 hammer_vop_setattr(struct vop_setattr_args *ap) 2168 { 2169 struct hammer_transaction trans; 2170 struct hammer_inode *ip; 2171 struct vattr *vap; 2172 hammer_mount_t hmp; 2173 int modflags; 2174 int error; 2175 int truncating; 2176 int blksize; 2177 int kflags; 2178 #if 0 2179 int64_t aligned_size; 2180 #endif 2181 u_int32_t flags; 2182 2183 vap = ap->a_vap; 2184 ip = ap->a_vp->v_data; 2185 modflags = 0; 2186 kflags = 0; 2187 hmp = ip->hmp; 2188 2189 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2190 return(EROFS); 2191 if (ip->flags & HAMMER_INODE_RO) 2192 return (EROFS); 2193 if (hammer_nohistory(ip) == 0 && 2194 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2195 return (error); 2196 } 2197 2198 lwkt_gettoken(&hmp->fs_token); 2199 hammer_start_transaction(&trans, hmp); 2200 ++hammer_stats_file_iopsw; 2201 error = 0; 2202 2203 if (vap->va_flags != VNOVAL) { 2204 flags = ip->ino_data.uflags; 2205 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2206 hammer_to_unix_xid(&ip->ino_data.uid), 2207 ap->a_cred); 2208 if (error == 0) { 2209 if (ip->ino_data.uflags != flags) { 2210 ip->ino_data.uflags = flags; 2211 ip->ino_data.ctime = trans.time; 2212 modflags |= HAMMER_INODE_DDIRTY; 2213 kflags |= NOTE_ATTRIB; 2214 } 2215 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2216 error = 0; 2217 goto done; 2218 } 2219 } 2220 goto done; 2221 } 2222 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2223 error = EPERM; 2224 goto done; 2225 } 2226 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2227 mode_t cur_mode = ip->ino_data.mode; 2228 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2229 gid_t 
/*
 * hammer_vop_setattr { vp, vap, cred }
 *
 * Apply the attributes in ap->a_vap that are not VNOVAL to the inode
 * behind ap->a_vp.  Handled in this order:
 *
 *	va_flags	- if set, ONLY the flags are processed; every
 *			  path through that branch ends in 'goto done'.
 *	va_uid/va_gid	- via vop_helper_chown()
 *	va_size		- truncate/extend for VREG and VDATABASE vnodes,
 *			  EINVAL for any other vnode type
 *	va_atime/va_mtime
 *	va_mode		- via vop_helper_chmod()
 *
 * Returns EROFS if the mount or the inode is read-only, EPERM if the
 * inode has IMMUTABLE or APPEND set (and va_flags was not supplied),
 * otherwise 0 or the last error recorded in 'error'.
 *
 * The accumulated modflags are pushed to hammer_modify_inode() only
 * when error == 0; hammer_knote() is called with the accumulated kflags
 * regardless of error.
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct vattr *vap;
	hammer_mount_t hmp;
	int modflags;		/* HAMMER_INODE_* dirty bits to apply at done: */
	int error;
	int truncating;
	int blksize;
	int kflags;		/* NOTE_* kevent bits to post at done: */
#if 0
	int64_t aligned_size;
#endif
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;
	hmp = ip->hmp;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			/*
			 * NOTE(review): this test has no effect as written;
			 * error is already 0 here and the fall-through
			 * path below also does 'goto done'.
			 */
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		/*
		 * NOTE(review): a failure here does not jump to done; the
		 * code falls through and 'error' may be overwritten by the
		 * redo or chmod steps below -- confirm this is intended.
		 */
		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			kflags |= NOTE_ATTRIB;
		}
	}

	/*
	 * Size change.  This 'while' executes its body at most once: the
	 * statement following the switch is an unconditional break.  It is
	 * used as an 'if' that the switch cases can break out of.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			/*
			 * Cannot be true here, the loop condition already
			 * requires the two sizes to differ.
			 */
			if (vap->va_size == ip->ino_data.size)
				break;

			/*
			 * Log the operation if in fast-fsync mode or if
			 * there are unterminated redo write records present.
			 *
			 * The second check is needed so the recovery code
			 * properly truncates write redos even if nominal
			 * REDO operations are turned off due to excessive
			 * writes, because the related records might be
			 * destroyed and never lay down a TERM_WRITE.
			 *
			 * NOTE(review): the returned error is stored but not
			 * checked before the truncation proceeds.
			 */
			if ((ip->flags & HAMMER_INODE_REDO) ||
			    (ip->flags & HAMMER_INODE_RDIRTY)) {
				error = hammer_generate_redo(&trans, ip,
							     vap->va_size,
							     HAMMER_REDO_TRUNC,
							     NULL, 0);
			}
			blksize = hammer_blocksize(vap->va_size);

			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			if (vap->va_size < ip->ino_data.size) {
				nvtruncbuf(ap->a_vp, vap->va_size,
					   blksize,
					   hammer_blockoff(vap->va_size),
					   0);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				nvextendbuf(ap->a_vp,
					    ip->ino_data.size,
					    vap->va_size,
					    hammer_blocksize(ip->ino_data.size),
					    hammer_blocksize(vap->va_size),
					    hammer_blockoff(ip->ino_data.size),
					    hammer_blockoff(vap->va_size),
					    0);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			/* XXX safe to use SDIRTY instead of DDIRTY here? */
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;

			/*
			 * On-media truncation is cached in the inode until
			 * the inode is synchronized.  We must immediately
			 * handle any frontend records.
			 *
			 * trunc_off only ever moves downward while
			 * HAMMER_INODE_TRUNCATED is set.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
					hammer_inode_dirty(ip);
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					hdkprintf("truncate1 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					hdkprintf("truncate2 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					hdkprintf("truncate3 %016llx (ignored)\n",
						(long long)vap->va_size);
#endif
				}
			}

#if 0
			/*
			 * When truncating, nvtruncbuf() may have cleaned out
			 * a portion of the last block on-disk in the buffer
			 * cache.  We must clean out any frontend records
			 * for blocks beyond the new last block.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				aligned_size -= blksize;
				hammer_ip_frontend_trunc(ip, aligned_size);
			}
#endif
			break;
		case VDATABASE:
			/*
			 * Same cached-truncation bookkeeping as the VREG
			 * case, but without any buffer cache adjustment.
			 */
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
				hammer_inode_dirty(ip);
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			/* Size changes are rejected for all other types. */
			error = EINVAL;
			goto done;
		}
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			ip->ino_data.ctime = trans.time;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	/*
	 * The in-memory ino_data fields may already have been changed
	 * above; they are only flagged dirty when no error was recorded.
	 */
	if (error == 0)
		hammer_modify_inode(&trans, ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
target } 2413 */ 2414 static 2415 int 2416 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2417 { 2418 struct hammer_transaction trans; 2419 struct hammer_inode *dip; 2420 struct hammer_inode *nip; 2421 hammer_record_t record; 2422 struct nchandle *nch; 2423 hammer_mount_t hmp; 2424 int error; 2425 int bytes; 2426 2427 ap->a_vap->va_type = VLNK; 2428 2429 nch = ap->a_nch; 2430 dip = VTOI(ap->a_dvp); 2431 hmp = dip->hmp; 2432 2433 if (dip->flags & HAMMER_INODE_RO) 2434 return (EROFS); 2435 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2436 return (error); 2437 2438 /* 2439 * Create a transaction to cover the operations we perform. 2440 */ 2441 lwkt_gettoken(&hmp->fs_token); 2442 hammer_start_transaction(&trans, hmp); 2443 ++hammer_stats_file_iopsw; 2444 2445 /* 2446 * Create a new filesystem object of the requested type. The 2447 * returned inode will be referenced but not locked. 2448 */ 2449 2450 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2451 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2452 NULL, &nip); 2453 if (error) { 2454 hammer_done_transaction(&trans); 2455 *ap->a_vpp = NULL; 2456 lwkt_reltoken(&hmp->fs_token); 2457 return (error); 2458 } 2459 2460 /* 2461 * Add a record representing the symlink. symlink stores the link 2462 * as pure data, not a string, and is no \0 terminated. 
2463 */ 2464 if (error == 0) { 2465 bytes = strlen(ap->a_target); 2466 2467 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2468 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2469 } else { 2470 record = hammer_alloc_mem_record(nip, bytes); 2471 record->type = HAMMER_MEM_RECORD_GENERAL; 2472 2473 record->leaf.base.localization = nip->obj_localization + 2474 HAMMER_LOCALIZE_MISC; 2475 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2476 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2477 record->leaf.data_len = bytes; 2478 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2479 bcopy(ap->a_target, record->data->symlink.name, bytes); 2480 error = hammer_ip_add_record(&trans, record); 2481 } 2482 2483 /* 2484 * Set the file size to the length of the link. 2485 */ 2486 if (error == 0) { 2487 nip->ino_data.size = bytes; 2488 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2489 } 2490 } 2491 if (error == 0) 2492 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2493 nch->ncp->nc_nlen, nip); 2494 2495 /* 2496 * Finish up. 
2497 */ 2498 if (error) { 2499 hammer_rel_inode(nip, 0); 2500 *ap->a_vpp = NULL; 2501 } else { 2502 error = hammer_get_vnode(nip, ap->a_vpp); 2503 hammer_rel_inode(nip, 0); 2504 if (error == 0) { 2505 cache_setunresolved(ap->a_nch); 2506 cache_setvp(ap->a_nch, *ap->a_vpp); 2507 hammer_knote(ap->a_dvp, NOTE_WRITE); 2508 } 2509 } 2510 hammer_done_transaction(&trans); 2511 lwkt_reltoken(&hmp->fs_token); 2512 return (error); 2513 } 2514 2515 /* 2516 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2517 */ 2518 static 2519 int 2520 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2521 { 2522 struct hammer_transaction trans; 2523 struct hammer_inode *dip; 2524 hammer_mount_t hmp; 2525 int error; 2526 2527 dip = VTOI(ap->a_dvp); 2528 hmp = dip->hmp; 2529 2530 if (hammer_nohistory(dip) == 0 && 2531 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2532 return (error); 2533 } 2534 2535 lwkt_gettoken(&hmp->fs_token); 2536 hammer_start_transaction(&trans, hmp); 2537 ++hammer_stats_file_iopsw; 2538 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2539 ap->a_cred, ap->a_flags, -1); 2540 hammer_done_transaction(&trans); 2541 lwkt_reltoken(&hmp->fs_token); 2542 2543 return (error); 2544 } 2545 2546 /* 2547 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2548 */ 2549 static 2550 int 2551 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2552 { 2553 struct hammer_inode *ip = ap->a_vp->v_data; 2554 hammer_mount_t hmp = ip->hmp; 2555 int error; 2556 2557 ++hammer_stats_file_iopsr; 2558 lwkt_gettoken(&hmp->fs_token); 2559 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2560 ap->a_fflag, ap->a_cred); 2561 lwkt_reltoken(&hmp->fs_token); 2562 return (error); 2563 } 2564 2565 static 2566 int 2567 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2568 { 2569 static const struct mountctl_opt extraopt[] = { 2570 { HMNT_NOHISTORY, "nohistory" }, 2571 { HMNT_MASTERID, "master" }, 2572 { HMNT_NOMIRROR, "nomirror" }, 2573 { 0, NULL} 2574 2575 }; 2576 struct hammer_mount 
*hmp; 2577 struct mount *mp; 2578 int usedbytes; 2579 int error; 2580 2581 error = 0; 2582 usedbytes = 0; 2583 mp = ap->a_head.a_ops->head.vv_mount; 2584 KKASSERT(mp->mnt_data != NULL); 2585 hmp = (struct hammer_mount *)mp->mnt_data; 2586 2587 lwkt_gettoken(&hmp->fs_token); 2588 2589 switch(ap->a_op) { 2590 case MOUNTCTL_SET_EXPORT: 2591 if (ap->a_ctllen != sizeof(struct export_args)) 2592 error = EINVAL; 2593 else 2594 error = hammer_vfs_export(mp, ap->a_op, 2595 (const struct export_args *)ap->a_ctl); 2596 break; 2597 case MOUNTCTL_MOUNTFLAGS: 2598 /* 2599 * Call standard mountctl VOP function 2600 * so we get user mount flags. 2601 */ 2602 error = vop_stdmountctl(ap); 2603 if (error) 2604 break; 2605 2606 usedbytes = *ap->a_res; 2607 2608 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2609 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2610 ap->a_buf, 2611 ap->a_buflen - usedbytes, 2612 &error); 2613 } 2614 2615 *ap->a_res += usedbytes; 2616 break; 2617 default: 2618 error = vop_stdmountctl(ap); 2619 break; 2620 } 2621 lwkt_reltoken(&hmp->fs_token); 2622 return(error); 2623 } 2624 2625 /* 2626 * hammer_vop_strategy { vp, bio } 2627 * 2628 * Strategy call, used for regular file read & write only. Note that the 2629 * bp may represent a cluster. 2630 * 2631 * To simplify operation and allow better optimizations in the future, 2632 * this code does not make any assumptions with regards to buffer alignment 2633 * or size. 
2634 */ 2635 static 2636 int 2637 hammer_vop_strategy(struct vop_strategy_args *ap) 2638 { 2639 struct buf *bp; 2640 int error; 2641 2642 bp = ap->a_bio->bio_buf; 2643 2644 switch(bp->b_cmd) { 2645 case BUF_CMD_READ: 2646 error = hammer_vop_strategy_read(ap); 2647 break; 2648 case BUF_CMD_WRITE: 2649 error = hammer_vop_strategy_write(ap); 2650 break; 2651 default: 2652 bp->b_error = error = EINVAL; 2653 bp->b_flags |= B_ERROR; 2654 biodone(ap->a_bio); 2655 break; 2656 } 2657 2658 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2659 2660 return (error); 2661 } 2662 2663 /* 2664 * Read from a regular file. Iterate the related records and fill in the 2665 * BIO/BUF. Gaps are zero-filled. 2666 * 2667 * The support code in hammer_object.c should be used to deal with mixed 2668 * in-memory and on-disk records. 2669 * 2670 * NOTE: Can be called from the cluster code with an oversized buf. 2671 * 2672 * XXX atime update 2673 */ 2674 static 2675 int 2676 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2677 { 2678 struct hammer_transaction trans; 2679 struct hammer_inode *ip; 2680 struct hammer_inode *dip; 2681 hammer_mount_t hmp; 2682 struct hammer_cursor cursor; 2683 hammer_base_elm_t base; 2684 hammer_off_t disk_offset; 2685 struct bio *bio; 2686 struct bio *nbio; 2687 struct buf *bp; 2688 int64_t rec_offset; 2689 int64_t ran_end; 2690 int64_t tmp64; 2691 int error; 2692 int boff; 2693 int roff; 2694 int n; 2695 int isdedupable; 2696 2697 bio = ap->a_bio; 2698 bp = bio->bio_buf; 2699 ip = ap->a_vp->v_data; 2700 hmp = ip->hmp; 2701 2702 /* 2703 * The zone-2 disk offset may have been set by the cluster code via 2704 * a BMAP operation, or else should be NOOFFSET. 2705 * 2706 * Checking the high bits for a match against zone-2 should suffice. 2707 * 2708 * In cases where a lot of data duplication is present it may be 2709 * more beneficial to drop through and doubule-buffer through the 2710 * device. 
/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * Three ways out of this function:
 *
 *   - the early returns, taken when the pushed bio already carries a
 *     large-data zone offset; the read is handed to
 *     hammer_io_direct_read() or hammer_io_indirect_read() without
 *     starting a transaction.
 *   - 'goto done' from inside the record loop, when a single on-disk
 *     record covers the whole buffer; biodone() is NOT called here on
 *     that path.
 *   - the copy path, which fills bp->b_data record by record and then
 *     calls biodone() itself.
 *
 * Returns the error that was also stored in the buf on the copy path,
 * or the return value of the direct/indirect read helper.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;		/* bytes of bp->b_data filled so far */
	int roff;		/* offset into the current record's data */
	int n;
	int isdedupable;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 *
	 * In cases where a lot of data duplication is present it may be
	 * more beneficial to drop through and double-buffer through the
	 * device.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		if (hammer_double_buffer == 0) {
			lwkt_gettoken(&hmp->fs_token);
			error = hammer_io_direct_read(hmp, nbio, NULL);
			lwkt_reltoken(&hmp->fs_token);
			return (error);
		}

		/*
		 * Try to shortcut requests for double_buffer mode too.
		 * Since this mode runs through the device buffer cache
		 * only compatible buffer sizes (meaning those generated
		 * by normal filesystem buffers) are legal.
		 */
		if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
			lwkt_gettoken(&hmp->fs_token);
			error = hammer_io_indirect_read(hmp, nbio, NULL);
			lwkt_reltoken(&hmp->fs_token);
			return (error);
		}
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		/*
		 * ran_end is the file offset just past the buffer.  The
		 * end key is pushed MAXPHYS + 1 beyond it, clamped to the
		 * maximum positive key if that addition wraps.
		 */
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	/*
	 * Set NOSWAPCACHE for cursor data extraction if double buffering
	 * is disabled or (if the file is not marked cacheable via chflags
	 * and vm.swapcache_use_chflags is enabled).
	 */
	if (hammer_double_buffer == 0 ||
	    ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
	     vm_swapcache_use_chflags)) {
		cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
	}

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record versus our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 *
		 * At this point n <= 0, so roff = -n is the number of bytes
		 * at the front of the record that precede the seek offset.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			hdkprintf("bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)/* ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zeroed past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 *
		 * isdedupable is true only when this single on-disk record
		 * supplies the entire buffer (boff == 0, n == b_bufsize)
		 * and its disk offset has no bits set in HAMMER_BUFMASK.
		 *
		 * NOTE: disk_offset is only valid if the cursor data is
		 *	 on-disk.
		 */
		disk_offset = cursor.leaf->data_offset + roff;
		isdedupable = (boff == 0 && n == bp->b_bufsize &&
			       hammer_cursor_ondisk(&cursor) &&
			       ((int)disk_offset & HAMMER_BUFMASK) == 0);

		if (isdedupable && hammer_double_buffer == 0) {
			/*
			 * Direct read case
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
			if (hammer_live_dedup && error == 0)
				hammer_dedup_cache_add(ip, cursor.leaf);
			goto done;
		} else if (isdedupable) {
			/*
			 * Async I/O case for reading from backing store
			 * and copying the data to the filesystem buffer.
			 * live-dedup has to verify the data anyway if it
			 * gets a hit later so we can just add the entry
			 * now.
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
			error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
			goto done;
		} else if (n) {
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				/*
				 * isdedupable is always 0 in this branch
				 * (both branches above test it first), so
				 * this dedup add never runs.
				 */
				if (hammer_live_dedup && isdedupable)
					hammer_dedup_cache_add(ip, cursor.leaf);
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * We have to be sure that the only elements added to the
		 * dedup cache are those which are already on-media.
		 */
		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
			hammer_dedup_cache_add(ip, cursor.leaf);

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Cleanup
	 */
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	/*
	 * Cache the b-tree node for the last data read in cache[1].
	 *
	 * If we hit the file EOF then also cache the node in the
	 * governing directory's cache[3], it will be used to initialize
	 * the new inode's cache[1] for any inodes looked up via the directory.
	 *
	 * This doesn't reduce disk accesses since the B-Tree chain is
	 * likely cached, but it does reduce cpu overhead when looking
	 * up file offsets for cpdup/tar/cpio style iterations.
	 *
	 * NOTE(review): cursor.node is checked for NULL before the first
	 * hammer_cache_node() call but not before the second -- presumably
	 * hammer_cache_node() tolerates a NULL node; confirm.
	 */
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	if (ran_end >= ip->ino_data.size) {
		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
					ip->obj_asof, ip->obj_localization);
		if (dip) {
			hammer_cache_node(&dip->cache[3], cursor.node);
			hammer_rel_inode(dip, 0);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}
/*
 * BMAP operation - used to support cluster_read() only.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 *
 * This routine may return EOPNOTSUPP if the operation is not supported for
 * the specified offset.  The contents of the pointer arguments do not
 * need to be initialized in that case.
 *
 * If a disk address is available and properly aligned return 0 with
 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
 * to the run-length relative to that offset.  Callers may assume that
 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
 * large, so return EOPNOTSUPP if it is not sufficiently large.
 *
 * EOPNOTSUPP is also returned for anything that is not a regular file
 * and for any command other than BUF_CMD_READ.
 */
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int64_t base_offset;		/* file offset where the current run starts */
	int64_t base_disk_offset;	/* disk offset where the current run starts */
	int64_t last_offset;		/* file offset just past the current run */
	hammer_off_t last_disk_offset;	/* disk offset just past the current run */
	hammer_off_t disk_offset;
	int rec_len;
	int error;
	int blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
#if 0
	hkprintf("bmap_beg %016llx ip->cache %p\n",
		(long long)ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 *
	 * When the caller wants a backward run (a_runb != NULL) the scan
	 * starts up to MAXPHYS before the requested offset, clamped at 0.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len    = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 *
		 * The error test below cannot fire: error is 0 on loop
		 * entry and nothing above assigns it.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;

			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
		}

		error = hammer_ip_next(&cursor);
	}

#if 0
	hkprintf("BMAP %016llx:  %016llx - %016llx\n",
		(long long)ap->a_loffset,
		(long long)base_offset,
		(long long)last_offset);
	hkprintf("BMAP %16s:  %016llx - %016llx\n", "",
		(long long)base_disk_offset,
		(long long)last_disk_offset);
#endif

	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		hkprintf("bmap_end2 %016llx ip->cache %p\n",
			(long long)ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 *
	 * The error left over from the scan loop is not examined; it is
	 * overwritten in every branch of the final if/else chain below.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}
/*
 * Write to a regular file.  Because this is a strategy call the OS is
 * trying to actually get data onto the media.
 *
 * The bio is completed on every path: by biodone() here for the
 * read-only, inode-being-deleted, deduped-record and allocation-failure
 * cases, and otherwise by handing it to hammer_io_direct_write().
 *
 * Returns EROFS for a read-only inode, 0 when the write is discarded
 * because the inode is being deleted, otherwise the value
 * hammer_ip_add_bulk() stored in 'error'.
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize __debugvar;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/* blksize is only consumed by the assertion below. */
	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		lwkt_reltoken(&hmp->fs_token);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end.
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffer's
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 *
	 * In the small-file case (offset 0 and size <= HAMMER_HBUFSIZE)
	 * the byte count is the file size rounded up to a multiple of 16.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_HBUFSIZE)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);

	/*
	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	 * in hammer_vop_write().  We must flag the record so the proper
	 * REDO_TERM_WRITE entry is generated during the flush.
	 */
	if (record) {
		if (bp->b_flags & B_VFSFLAG1) {
			record->flags |= HAMMER_RECF_REDO;
			bp->b_flags &= ~B_VFSFLAG1;
		}
		if (record->flags & HAMMER_RECF_DEDUPED) {
			/*
			 * No direct write is issued for a deduped record;
			 * the bio is completed here.
			 */
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(ap->a_bio);
		} else {
			hammer_io_direct_write(hmp, bio, record);
		}
		/*
		 * Kick a flush of this inode once the mount-wide reserved
		 * record count exceeds hammer_limit_recs.
		 */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	lwkt_reltoken(&hmp->fs_token);
	/*
	 * NOTE(review): when a record was returned this relies on
	 * hammer_ip_add_bulk() having stored 0 in 'error' -- confirm.
	 */
	return(error);
}
3316 * 3317 * NOTE: The only time we do not reserve a full-sized buffers 3318 * worth of data is if the file is small. We do not try to 3319 * allocate a fragment (from the small-data zone) at the end of 3320 * an otherwise large file as this can lead to wildly separated 3321 * data. 3322 */ 3323 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 3324 KKASSERT(bio->bio_offset < ip->ino_data.size); 3325 if (bio->bio_offset || ip->ino_data.size > HAMMER_HBUFSIZE) 3326 bytes = bp->b_bufsize; 3327 else 3328 bytes = ((int)ip->ino_data.size + 15) & ~15; 3329 3330 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 3331 bytes, &error); 3332 3333 /* 3334 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated 3335 * in hammer_vop_write(). We must flag the record so the proper 3336 * REDO_TERM_WRITE entry is generated during the flush. 3337 */ 3338 if (record) { 3339 if (bp->b_flags & B_VFSFLAG1) { 3340 record->flags |= HAMMER_RECF_REDO; 3341 bp->b_flags &= ~B_VFSFLAG1; 3342 } 3343 if (record->flags & HAMMER_RECF_DEDUPED) { 3344 bp->b_resid = 0; 3345 hammer_ip_replace_bulk(hmp, record); 3346 biodone(ap->a_bio); 3347 } else { 3348 hammer_io_direct_write(hmp, bio, record); 3349 } 3350 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 3351 hammer_flush_inode(ip, 0); 3352 } else { 3353 bp->b_bio2.bio_offset = NOOFFSET; 3354 bp->b_error = error; 3355 bp->b_flags |= B_ERROR; 3356 biodone(ap->a_bio); 3357 } 3358 lwkt_reltoken(&hmp->fs_token); 3359 return(error); 3360 } 3361 3362 /* 3363 * dounlink - disconnect a directory entry 3364 * 3365 * XXX whiteout support not really in yet 3366 */ 3367 static int 3368 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 3369 struct vnode *dvp, struct ucred *cred, 3370 int flags, int isdir) 3371 { 3372 struct namecache *ncp; 3373 hammer_inode_t dip; 3374 hammer_inode_t ip; 3375 hammer_mount_t hmp; 3376 struct hammer_cursor cursor; 3377 int64_t namekey; 3378 u_int32_t max_iterations; 3379 int nlen, error; 
3380 3381 /* 3382 * Calculate the namekey and setup the key range for the scan. This 3383 * works kinda like a chained hash table where the lower 32 bits 3384 * of the namekey synthesize the chain. 3385 * 3386 * The key range is inclusive of both key_beg and key_end. 3387 */ 3388 dip = VTOI(dvp); 3389 ncp = nch->ncp; 3390 hmp = dip->hmp; 3391 3392 if (dip->flags & HAMMER_INODE_RO) 3393 return (EROFS); 3394 3395 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3396 &max_iterations); 3397 retry: 3398 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3399 cursor.key_beg.localization = dip->obj_localization + 3400 hammer_dir_localization(dip); 3401 cursor.key_beg.obj_id = dip->obj_id; 3402 cursor.key_beg.key = namekey; 3403 cursor.key_beg.create_tid = 0; 3404 cursor.key_beg.delete_tid = 0; 3405 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3406 cursor.key_beg.obj_type = 0; 3407 3408 cursor.key_end = cursor.key_beg; 3409 cursor.key_end.key += max_iterations; 3410 cursor.asof = dip->obj_asof; 3411 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3412 3413 /* 3414 * Scan all matching records (the chain), locate the one matching 3415 * the requested path component. info->last_error contains the 3416 * error code on search termination and could be 0, ENOENT, or 3417 * something else. 3418 * 3419 * The hammer_ip_*() functions merge in-memory records with on-disk 3420 * records for the purposes of the search. 3421 */ 3422 error = hammer_ip_first(&cursor); 3423 3424 while (error == 0) { 3425 error = hammer_ip_resolve_data(&cursor); 3426 if (error) 3427 break; 3428 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3429 KKASSERT(nlen > 0); 3430 if (ncp->nc_nlen == nlen && 3431 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3432 break; 3433 } 3434 error = hammer_ip_next(&cursor); 3435 } 3436 3437 /* 3438 * If all is ok we have to get the inode so we can adjust nlinks. 
3439 * To avoid a deadlock with the flusher we must release the inode 3440 * lock on the directory when acquiring the inode for the entry. 3441 * 3442 * If the target is a directory, it must be empty. 3443 */ 3444 if (error == 0) { 3445 hammer_unlock(&cursor.ip->lock); 3446 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3447 hmp->asof, 3448 cursor.data->entry.localization, 3449 0, &error); 3450 hammer_lock_sh(&cursor.ip->lock); 3451 if (error == ENOENT) { 3452 hkprintf("WARNING: Removing dirent w/missing inode " 3453 "\"%s\"\n" 3454 "\tobj_id = %016llx\n", 3455 ncp->nc_name, 3456 (long long)cursor.data->entry.obj_id); 3457 error = 0; 3458 } 3459 3460 /* 3461 * If isdir >= 0 we validate that the entry is or is not a 3462 * directory. If isdir < 0 we don't care. 3463 */ 3464 if (error == 0 && isdir >= 0 && ip) { 3465 if (isdir && 3466 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3467 error = ENOTDIR; 3468 } else if (isdir == 0 && 3469 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3470 error = EISDIR; 3471 } 3472 } 3473 3474 /* 3475 * If we are trying to remove a directory the directory must 3476 * be empty. 3477 * 3478 * The check directory code can loop and deadlock/retry. Our 3479 * own cursor's node locks must be released to avoid a 3-way 3480 * deadlock with the flusher if the check directory code 3481 * blocks. 3482 * 3483 * If any changes whatsoever have been made to the cursor 3484 * set EDEADLK and retry. 3485 * 3486 * WARNING: See warnings in hammer_unlock_cursor() 3487 * function. 3488 */ 3489 if (error == 0 && ip && ip->ino_data.obj_type == 3490 HAMMER_OBJTYPE_DIRECTORY) { 3491 hammer_unlock_cursor(&cursor); 3492 error = hammer_ip_check_directory_empty(trans, ip); 3493 hammer_lock_cursor(&cursor); 3494 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3495 hkprintf("Warning: avoided deadlock " 3496 "on rmdir '%s'\n", 3497 ncp->nc_name); 3498 error = EDEADLK; 3499 } 3500 } 3501 3502 /* 3503 * Delete the directory entry. 
3504 * 3505 * WARNING: hammer_ip_del_directory() may have to terminate 3506 * the cursor to avoid a deadlock. It is ok to call 3507 * hammer_done_cursor() twice. 3508 */ 3509 if (error == 0) { 3510 error = hammer_ip_del_directory(trans, &cursor, 3511 dip, ip); 3512 } 3513 hammer_done_cursor(&cursor); 3514 if (error == 0) { 3515 /* 3516 * Tell the namecache that we are now unlinked. 3517 */ 3518 cache_unlink(nch); 3519 3520 /* 3521 * NOTE: ip->vp, if non-NULL, cannot be directly 3522 * referenced without formally acquiring the 3523 * vp since the vp might have zero refs on it, 3524 * or in the middle of a reclaim, etc. 3525 * 3526 * NOTE: The cache_setunresolved() can rip the vp 3527 * out from under us since the vp may not have 3528 * any refs, in which case ip->vp will be NULL 3529 * from the outset. 3530 */ 3531 while (ip && ip->vp) { 3532 struct vnode *vp; 3533 3534 error = hammer_get_vnode(ip, &vp); 3535 if (error == 0 && vp) { 3536 vn_unlock(vp); 3537 hammer_knote(ip->vp, NOTE_DELETE); 3538 #if 0 3539 /* 3540 * Don't do this, it can deadlock 3541 * on concurrent rm's of hardlinks. 3542 * Shouldn't be needed any more. 
3543 */ 3544 cache_inval_vp(ip->vp, CINV_DESTROY); 3545 #endif 3546 vrele(vp); 3547 break; 3548 } 3549 hdkprintf("ip/vp race1 avoided\n"); 3550 } 3551 } 3552 if (ip) 3553 hammer_rel_inode(ip, 0); 3554 } else { 3555 hammer_done_cursor(&cursor); 3556 } 3557 if (error == EDEADLK) 3558 goto retry; 3559 3560 return (error); 3561 } 3562 3563 /************************************************************************ 3564 * FIFO AND SPECFS OPS * 3565 ************************************************************************ 3566 * 3567 */ 3568 static int 3569 hammer_vop_fifoclose (struct vop_close_args *ap) 3570 { 3571 /* XXX update itimes */ 3572 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3573 } 3574 3575 static int 3576 hammer_vop_fiforead (struct vop_read_args *ap) 3577 { 3578 int error; 3579 3580 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3581 /* XXX update access time */ 3582 return (error); 3583 } 3584 3585 static int 3586 hammer_vop_fifowrite (struct vop_write_args *ap) 3587 { 3588 int error; 3589 3590 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3591 /* XXX update access time */ 3592 return (error); 3593 } 3594 3595 static 3596 int 3597 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3598 { 3599 int error; 3600 3601 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3602 if (error) 3603 error = hammer_vop_kqfilter(ap); 3604 return(error); 3605 } 3606 3607 /************************************************************************ 3608 * KQFILTER OPS * 3609 ************************************************************************ 3610 * 3611 */ 3612 static void filt_hammerdetach(struct knote *kn); 3613 static int filt_hammerread(struct knote *kn, long hint); 3614 static int filt_hammerwrite(struct knote *kn, long hint); 3615 static int filt_hammervnode(struct knote *kn, long hint); 3616 3617 static struct filterops hammerread_filtops = 3618 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3619 NULL, filt_hammerdetach, filt_hammerread }; 3620 static struct filterops 
hammerwrite_filtops = 3621 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3622 NULL, filt_hammerdetach, filt_hammerwrite }; 3623 static struct filterops hammervnode_filtops = 3624 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3625 NULL, filt_hammerdetach, filt_hammervnode }; 3626 3627 static 3628 int 3629 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3630 { 3631 struct vnode *vp = ap->a_vp; 3632 struct knote *kn = ap->a_kn; 3633 3634 switch (kn->kn_filter) { 3635 case EVFILT_READ: 3636 kn->kn_fop = &hammerread_filtops; 3637 break; 3638 case EVFILT_WRITE: 3639 kn->kn_fop = &hammerwrite_filtops; 3640 break; 3641 case EVFILT_VNODE: 3642 kn->kn_fop = &hammervnode_filtops; 3643 break; 3644 default: 3645 return (EOPNOTSUPP); 3646 } 3647 3648 kn->kn_hook = (caddr_t)vp; 3649 3650 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3651 3652 return(0); 3653 } 3654 3655 static void 3656 filt_hammerdetach(struct knote *kn) 3657 { 3658 struct vnode *vp = (void *)kn->kn_hook; 3659 3660 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3661 } 3662 3663 static int 3664 filt_hammerread(struct knote *kn, long hint) 3665 { 3666 struct vnode *vp = (void *)kn->kn_hook; 3667 hammer_inode_t ip = VTOI(vp); 3668 hammer_mount_t hmp = ip->hmp; 3669 off_t off; 3670 3671 if (hint == NOTE_REVOKE) { 3672 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3673 return(1); 3674 } 3675 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3676 off = ip->ino_data.size - kn->kn_fp->f_offset; 3677 kn->kn_data = (off < INTPTR_MAX) ? 
off : INTPTR_MAX; 3678 lwkt_reltoken(&hmp->fs_token); 3679 if (kn->kn_sfflags & NOTE_OLDAPI) 3680 return(1); 3681 return (kn->kn_data != 0); 3682 } 3683 3684 static int 3685 filt_hammerwrite(struct knote *kn, long hint) 3686 { 3687 if (hint == NOTE_REVOKE) 3688 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3689 kn->kn_data = 0; 3690 return (1); 3691 } 3692 3693 static int 3694 filt_hammervnode(struct knote *kn, long hint) 3695 { 3696 if (kn->kn_sfflags & hint) 3697 kn->kn_fflags |= hint; 3698 if (hint == NOTE_REVOKE) { 3699 kn->kn_flags |= (EV_EOF | EV_NODATA); 3700 return (1); 3701 } 3702 return (kn->kn_fflags != 0); 3703 } 3704 3705