1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/fcntl.h> 36 #include <sys/namecache.h> 37 #include <sys/event.h> 38 #include <sys/dirent.h> 39 #include <sys/file.h> 40 #include <vm/swap_pager.h> 41 #include <vfs/fifofs/fifo.h> 42 43 #include "hammer.h" 44 45 /* 46 * USERFS VNOPS 47 */ 48 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 49 static int hammer_vop_fsync(struct vop_fsync_args *); 50 static int hammer_vop_read(struct vop_read_args *); 51 static int hammer_vop_write(struct vop_write_args *); 52 static int hammer_vop_access(struct vop_access_args *); 53 static int hammer_vop_advlock(struct vop_advlock_args *); 54 static int hammer_vop_close(struct vop_close_args *); 55 static int hammer_vop_ncreate(struct vop_ncreate_args *); 56 static int hammer_vop_getattr(struct vop_getattr_args *); 57 static int hammer_vop_nresolve(struct vop_nresolve_args *); 58 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 59 static int hammer_vop_nlink(struct vop_nlink_args *); 60 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 61 static int hammer_vop_nmknod(struct vop_nmknod_args *); 62 static int hammer_vop_open(struct vop_open_args *); 63 static int hammer_vop_print(struct vop_print_args *); 64 static int hammer_vop_readdir(struct vop_readdir_args *); 65 static int hammer_vop_readlink(struct vop_readlink_args *); 66 static int hammer_vop_nremove(struct vop_nremove_args *); 67 static int hammer_vop_nrename(struct vop_nrename_args *); 68 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 69 static int hammer_vop_markatime(struct vop_markatime_args *); 70 static int hammer_vop_setattr(struct vop_setattr_args *); 71 static int hammer_vop_strategy(struct vop_strategy_args *); 72 static int hammer_vop_bmap(struct vop_bmap_args *ap); 73 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 74 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 75 static int hammer_vop_ioctl(struct vop_ioctl_args *); 76 static int hammer_vop_mountctl(struct vop_mountctl_args *); 77 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 78 79 static int hammer_vop_fifoclose (struct vop_close_args *); 80 static int hammer_vop_fiforead (struct vop_read_args *); 81 static int hammer_vop_fifowrite (struct vop_write_args *); 82 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 83 84 struct vop_ops hammer_vnode_vops = { 85 .vop_default = vop_defaultop, 86 .vop_fsync = hammer_vop_fsync, 87 .vop_getpages = vop_stdgetpages, 88 .vop_putpages = vop_stdputpages, 89 .vop_read = hammer_vop_read, 90 .vop_write = hammer_vop_write, 91 .vop_access = hammer_vop_access, 92 .vop_advlock = hammer_vop_advlock, 93 .vop_close = hammer_vop_close, 94 .vop_ncreate = hammer_vop_ncreate, 95 .vop_getattr = hammer_vop_getattr, 96 .vop_inactive = hammer_vop_inactive, 97 .vop_reclaim = hammer_vop_reclaim, 98 .vop_nresolve = hammer_vop_nresolve, 99 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 100 .vop_nlink = hammer_vop_nlink, 101 .vop_nmkdir = hammer_vop_nmkdir, 102 .vop_nmknod = hammer_vop_nmknod, 103 .vop_open = hammer_vop_open, 104 .vop_pathconf = vop_stdpathconf, 105 .vop_print = hammer_vop_print, 106 .vop_readdir = hammer_vop_readdir, 107 .vop_readlink = hammer_vop_readlink, 108 .vop_nremove = hammer_vop_nremove, 109 .vop_nrename = hammer_vop_nrename, 110 .vop_nrmdir = hammer_vop_nrmdir, 111 .vop_markatime = hammer_vop_markatime, 112 .vop_setattr = hammer_vop_setattr, 113 .vop_bmap = hammer_vop_bmap, 114 .vop_strategy = hammer_vop_strategy, 115 .vop_nsymlink = hammer_vop_nsymlink, 116 .vop_nwhiteout = hammer_vop_nwhiteout, 117 .vop_ioctl = hammer_vop_ioctl, 118 .vop_mountctl = hammer_vop_mountctl, 119 .vop_kqfilter = hammer_vop_kqfilter 120 }; 121 122 struct vop_ops hammer_spec_vops = { 123 .vop_default = vop_defaultop, 124 .vop_fsync = hammer_vop_fsync, 125 .vop_read = vop_stdnoread, 126 .vop_write = vop_stdnowrite, 127 .vop_access = hammer_vop_access, 128 .vop_close = hammer_vop_close, 129 .vop_markatime = hammer_vop_markatime, 130 .vop_getattr = hammer_vop_getattr, 131 .vop_inactive = hammer_vop_inactive, 132 .vop_reclaim = hammer_vop_reclaim, 133 .vop_setattr = hammer_vop_setattr 134 }; 135 136 struct vop_ops hammer_fifo_vops = { 137 .vop_default = fifo_vnoperate, 138 .vop_fsync = hammer_vop_fsync, 139 .vop_read = hammer_vop_fiforead, 140 .vop_write = hammer_vop_fifowrite, 141 .vop_access = hammer_vop_access, 142 .vop_close = hammer_vop_fifoclose, 143 .vop_markatime = hammer_vop_markatime, 144 .vop_getattr = hammer_vop_getattr, 145 .vop_inactive = hammer_vop_inactive, 146 .vop_reclaim = hammer_vop_reclaim, 147 .vop_setattr = hammer_vop_setattr, 148 .vop_kqfilter = hammer_vop_fifokqfilter 149 }; 150 151 static __inline 152 void 153 hammer_knote(struct vnode *vp, int flags) 154 { 155 if (flags) 156 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 157 } 158 159 #ifdef DEBUG_TRUNCATE 160 struct hammer_inode *HammerTruncIp; 161 #endif 162 163 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 164 struct vnode *dvp, struct ucred *cred, 165 int flags, int isdir); 166 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 167 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 168 169 #if 0 170 static 171 int 172 hammer_vop_vnoperate(struct vop_generic_args *) 173 { 174 return (VOCALL(&hammer_vnode_vops, ap)); 175 } 176 #endif 177 178 /* 179 * hammer_vop_fsync { vp, waitfor } 180 * 181 * fsync() an inode to disk and wait for it to be completely committed 182 * such that the information would not be undone if a crash occured after 183 * return. 184 * 185 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 186 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 187 * operation. 188 * 189 * Ultimately the combination of a REDO log and use of fast storage 190 * to front-end cluster caches will make fsync fast, but it aint 191 * here yet. And, in anycase, we need real transactional 192 * all-or-nothing features which are not restricted to a single file. 193 */ 194 static 195 int 196 hammer_vop_fsync(struct vop_fsync_args *ap) 197 { 198 hammer_inode_t ip = VTOI(ap->a_vp); 199 hammer_mount_t hmp = ip->hmp; 200 int waitfor = ap->a_waitfor; 201 int mode; 202 203 lwkt_gettoken(&hmp->fs_token); 204 205 /* 206 * Fsync rule relaxation (default is either full synchronous flush 207 * or REDO semantics with synchronous flush). 208 */ 209 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 210 switch(hammer_fsync_mode) { 211 case 0: 212 mode0: 213 /* no REDO, full synchronous flush */ 214 goto skip; 215 case 1: 216 mode1: 217 /* no REDO, full asynchronous flush */ 218 if (waitfor == MNT_WAIT) 219 waitfor = MNT_NOWAIT; 220 goto skip; 221 case 2: 222 /* REDO semantics, synchronous flush */ 223 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 224 goto mode0; 225 mode = HAMMER_FLUSH_UNDOS_AUTO; 226 break; 227 case 3: 228 /* REDO semantics, relaxed asynchronous flush */ 229 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 230 goto mode1; 231 mode = HAMMER_FLUSH_UNDOS_RELAXED; 232 if (waitfor == MNT_WAIT) 233 waitfor = MNT_NOWAIT; 234 break; 235 case 4: 236 /* ignore the fsync() system call */ 237 lwkt_reltoken(&hmp->fs_token); 238 return(0); 239 default: 240 /* we have to do something */ 241 mode = HAMMER_FLUSH_UNDOS_RELAXED; 242 if (waitfor == MNT_WAIT) 243 waitfor = MNT_NOWAIT; 244 break; 245 } 246 247 /* 248 * Fast fsync only needs to flush the UNDO/REDO fifo if 249 * HAMMER_INODE_REDO is non-zero and the only modifications 250 * made to the file are write or write-extends. 251 */ 252 if ((ip->flags & HAMMER_INODE_REDO) && 253 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) { 254 ++hammer_count_fsyncs; 255 hammer_flusher_flush_undos(hmp, mode); 256 ip->redo_count = 0; 257 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 258 vclrisdirty(ip->vp); 259 lwkt_reltoken(&hmp->fs_token); 260 return(0); 261 } 262 263 /* 264 * REDO is enabled by fsync(), the idea being we really only 265 * want to lay down REDO records when programs are using 266 * fsync() heavily. The first fsync() on the file starts 267 * the gravy train going and later fsync()s keep it hot by 268 * resetting the redo_count. 269 * 270 * We weren't running REDOs before now so we have to fall 271 * through and do a full fsync of what we have. 272 */ 273 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 274 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 275 ip->flags |= HAMMER_INODE_REDO; 276 ip->redo_count = 0; 277 } 278 } 279 skip: 280 281 /* 282 * Do a full flush sequence. 283 * 284 * Attempt to release the vnode while waiting for the inode to 285 * finish flushing. This can really mess up inactive->reclaim 286 * sequences so only do it if the vnode is active. 287 * 288 * WARNING! The VX lock functions must be used. vn_lock() will 289 * fail when this is part of a VOP_RECLAIM sequence. 290 */ 291 ++hammer_count_fsyncs; 292 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 293 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 294 if (waitfor == MNT_WAIT) { 295 int dorelock; 296 297 if ((ap->a_vp->v_flag & VRECLAIMED) == 0) { 298 vx_unlock(ap->a_vp); 299 dorelock = 1; 300 } else { 301 dorelock = 0; 302 } 303 hammer_wait_inode(ip); 304 if (dorelock) 305 vx_lock(ap->a_vp); 306 } 307 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 308 vclrisdirty(ip->vp); 309 lwkt_reltoken(&hmp->fs_token); 310 return (ip->error); 311 } 312 313 /* 314 * hammer_vop_read { vp, uio, ioflag, cred } 315 * 316 * MPSAFE (for the cache safe does not require fs_token) 317 */ 318 static 319 int 320 hammer_vop_read(struct vop_read_args *ap) 321 { 322 struct hammer_transaction trans; 323 hammer_inode_t ip; 324 hammer_mount_t hmp; 325 off_t offset; 326 struct buf *bp; 327 struct uio *uio; 328 int error; 329 int n; 330 int seqcount; 331 int ioseqcount; 332 int blksize; 333 int bigread; 334 int got_trans; 335 size_t resid; 336 337 if (ap->a_vp->v_type != VREG) 338 return (EINVAL); 339 ip = VTOI(ap->a_vp); 340 hmp = ip->hmp; 341 error = 0; 342 got_trans = 0; 343 uio = ap->a_uio; 344 345 /* 346 * Attempt to shortcut directly to the VM object using lwbufs. 347 * This is much faster than instantiating buffer cache buffers. 348 */ 349 resid = uio->uio_resid; 350 error = vop_helper_read_shortcut(ap); 351 hammer_stats_file_read += resid - uio->uio_resid; 352 if (error) 353 return (error); 354 if (uio->uio_resid == 0) 355 goto finished; 356 357 /* 358 * Allow the UIO's size to override the sequential heuristic. 359 */ 360 blksize = hammer_blocksize(uio->uio_offset); 361 seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; 362 ioseqcount = (ap->a_ioflag >> 16); 363 if (seqcount < ioseqcount) 364 seqcount = ioseqcount; 365 366 /* 367 * If reading or writing a huge amount of data we have to break 368 * atomicy and allow the operation to be interrupted by a signal 369 * or it can DOS the machine. 370 */ 371 bigread = (uio->uio_resid > 100 * 1024 * 1024); 372 373 /* 374 * Access the data typically in HAMMER_BUFSIZE blocks via the 375 * buffer cache, but HAMMER may use a variable block size based 376 * on the offset. 377 * 378 * XXX Temporary hack, delay the start transaction while we remain 379 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 380 * locked-shared. 381 */ 382 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 383 int64_t base_offset; 384 int64_t file_limit; 385 386 blksize = hammer_blocksize(uio->uio_offset); 387 offset = (int)uio->uio_offset & (blksize - 1); 388 base_offset = uio->uio_offset - offset; 389 390 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 391 break; 392 393 /* 394 * MPSAFE 395 */ 396 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 397 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 398 bp->b_flags &= ~B_AGE; 399 error = 0; 400 goto skip; 401 } 402 if (ap->a_ioflag & IO_NRDELAY) { 403 bqrelse(bp); 404 return (EWOULDBLOCK); 405 } 406 407 /* 408 * MPUNSAFE 409 */ 410 if (got_trans == 0) { 411 hammer_start_transaction(&trans, ip->hmp); 412 got_trans = 1; 413 } 414 415 /* 416 * NOTE: A valid bp has already been acquired, but was not 417 * B_CACHE. 418 */ 419 if (hammer_cluster_enable) { 420 /* 421 * Use file_limit to prevent cluster_read() from 422 * creating buffers of the wrong block size past 423 * the demarc. 424 */ 425 file_limit = ip->ino_data.size; 426 if (base_offset < HAMMER_XDEMARC && 427 file_limit > HAMMER_XDEMARC) { 428 file_limit = HAMMER_XDEMARC; 429 } 430 error = cluster_readx(ap->a_vp, 431 file_limit, base_offset, 432 blksize, uio->uio_resid, 433 seqcount * BKVASIZE, &bp); 434 } else { 435 error = breadnx(ap->a_vp, base_offset, blksize, 436 NULL, NULL, 0, &bp); 437 } 438 if (error) { 439 brelse(bp); 440 break; 441 } 442 skip: 443 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { 444 kprintf("doff %016jx read file %016jx@%016jx\n", 445 (intmax_t)bp->b_bio2.bio_offset, 446 (intmax_t)ip->obj_id, 447 (intmax_t)bp->b_loffset); 448 } 449 bp->b_flags &= ~B_IODEBUG; 450 if (blksize == HAMMER_XBUFSIZE) 451 bp->b_flags |= B_CLUSTEROK; 452 453 n = blksize - offset; 454 if (n > uio->uio_resid) 455 n = uio->uio_resid; 456 if (n > ip->ino_data.size - uio->uio_offset) 457 n = (int)(ip->ino_data.size - uio->uio_offset); 458 459 /* 460 * Set B_AGE, data has a lower priority than meta-data. 461 * 462 * Use a hold/unlock/drop sequence to run the uiomove 463 * with the buffer unlocked, avoiding deadlocks against 464 * read()s on mmap()'d spaces. 465 */ 466 bp->b_flags |= B_AGE; 467 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio); 468 bqrelse(bp); 469 470 if (error) 471 break; 472 hammer_stats_file_read += n; 473 } 474 475 finished: 476 477 /* 478 * Try to update the atime with just the inode lock for maximum 479 * concurrency. If we can't shortcut it we have to get the full 480 * blown transaction. 481 */ 482 if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) { 483 hammer_start_transaction(&trans, ip->hmp); 484 got_trans = 1; 485 } 486 487 if (got_trans) { 488 if ((ip->flags & HAMMER_INODE_RO) == 0 && 489 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 490 lwkt_gettoken(&hmp->fs_token); 491 ip->ino_data.atime = trans.time; 492 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 493 hammer_done_transaction(&trans); 494 lwkt_reltoken(&hmp->fs_token); 495 } else { 496 hammer_done_transaction(&trans); 497 } 498 } 499 return (error); 500 } 501 502 /* 503 * hammer_vop_write { vp, uio, ioflag, cred } 504 */ 505 static 506 int 507 hammer_vop_write(struct vop_write_args *ap) 508 { 509 struct hammer_transaction trans; 510 struct hammer_inode *ip; 511 hammer_mount_t hmp; 512 thread_t td; 513 struct uio *uio; 514 int offset; 515 off_t base_offset; 516 int64_t cluster_eof; 517 struct buf *bp; 518 int kflags; 519 int error; 520 int n; 521 int flags; 522 int seqcount; 523 int bigwrite; 524 525 if (ap->a_vp->v_type != VREG) 526 return (EINVAL); 527 ip = VTOI(ap->a_vp); 528 hmp = ip->hmp; 529 error = 0; 530 kflags = 0; 531 seqcount = ap->a_ioflag >> 16; 532 533 if (ip->flags & HAMMER_INODE_RO) 534 return (EROFS); 535 536 /* 537 * Create a transaction to cover the operations we perform. 538 */ 539 hammer_start_transaction(&trans, hmp); 540 uio = ap->a_uio; 541 542 /* 543 * Check append mode 544 */ 545 if (ap->a_ioflag & IO_APPEND) 546 uio->uio_offset = ip->ino_data.size; 547 548 /* 549 * Check for illegal write offsets. Valid range is 0...2^63-1. 550 * 551 * NOTE: the base_off assignment is required to work around what 552 * I consider to be a GCC-4 optimization bug. 553 */ 554 if (uio->uio_offset < 0) { 555 hammer_done_transaction(&trans); 556 return (EFBIG); 557 } 558 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 559 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 560 hammer_done_transaction(&trans); 561 return (EFBIG); 562 } 563 564 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 565 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 566 hammer_done_transaction(&trans); 567 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 568 return (EFBIG); 569 } 570 571 /* 572 * If reading or writing a huge amount of data we have to break 573 * atomicy and allow the operation to be interrupted by a signal 574 * or it can DOS the machine. 575 * 576 * Preset redo_count so we stop generating REDOs earlier if the 577 * limit is exceeded. 578 * 579 * redo_count is heuristical, SMP races are ok 580 */ 581 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 582 if ((ip->flags & HAMMER_INODE_REDO) && 583 ip->redo_count < hammer_limit_redo) { 584 ip->redo_count += uio->uio_resid; 585 } 586 587 /* 588 * Access the data typically in HAMMER_BUFSIZE blocks via the 589 * buffer cache, but HAMMER may use a variable block size based 590 * on the offset. 591 */ 592 while (uio->uio_resid > 0) { 593 int fixsize = 0; 594 int blksize; 595 int blkmask; 596 int trivial; 597 int endofblk; 598 off_t nsize; 599 600 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 601 break; 602 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 603 break; 604 605 blksize = hammer_blocksize(uio->uio_offset); 606 607 /* 608 * Control the number of pending records associated with 609 * this inode. If too many have accumulated start a 610 * flush. Try to maintain a pipeline with the flusher. 611 * 612 * NOTE: It is possible for other sources to grow the 613 * records but not necessarily issue another flush, 614 * so use a timeout and ensure that a re-flush occurs. 615 */ 616 if (ip->rsv_recs >= hammer_limit_inode_recs) { 617 lwkt_gettoken(&hmp->fs_token); 618 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 619 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 620 ip->flags |= HAMMER_INODE_RECSW; 621 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 622 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 623 } 624 lwkt_reltoken(&hmp->fs_token); 625 } 626 627 /* 628 * Do not allow HAMMER to blow out the buffer cache. Very 629 * large UIOs can lockout other processes due to bwillwrite() 630 * mechanics. 631 * 632 * The hammer inode is not locked during these operations. 633 * The vnode is locked which can interfere with the pageout 634 * daemon for non-UIO_NOCOPY writes but should not interfere 635 * with the buffer cache. Even so, we cannot afford to 636 * allow the pageout daemon to build up too many dirty buffer 637 * cache buffers. 638 * 639 * Only call this if we aren't being recursively called from 640 * a virtual disk device (vn), else we may deadlock. 641 */ 642 if ((ap->a_ioflag & IO_RECURSE) == 0) 643 bwillwrite(blksize); 644 645 /* 646 * Calculate the blocksize at the current offset and figure 647 * out how much we can actually write. 648 */ 649 blkmask = blksize - 1; 650 offset = (int)uio->uio_offset & blkmask; 651 base_offset = uio->uio_offset & ~(int64_t)blkmask; 652 n = blksize - offset; 653 if (n > uio->uio_resid) { 654 n = uio->uio_resid; 655 endofblk = 0; 656 } else { 657 endofblk = 1; 658 } 659 nsize = uio->uio_offset + n; 660 if (nsize > ip->ino_data.size) { 661 if (uio->uio_offset > ip->ino_data.size) 662 trivial = 0; 663 else 664 trivial = 1; 665 nvextendbuf(ap->a_vp, 666 ip->ino_data.size, 667 nsize, 668 hammer_blocksize(ip->ino_data.size), 669 hammer_blocksize(nsize), 670 hammer_blockoff(ip->ino_data.size), 671 hammer_blockoff(nsize), 672 trivial); 673 fixsize = 1; 674 kflags |= NOTE_EXTEND; 675 } 676 677 if (uio->uio_segflg == UIO_NOCOPY) { 678 /* 679 * Issuing a write with the same data backing the 680 * buffer. Instantiate the buffer to collect the 681 * backing vm pages, then read-in any missing bits. 682 * 683 * This case is used by vop_stdputpages(). 684 */ 685 bp = getblk(ap->a_vp, base_offset, 686 blksize, GETBLK_BHEAVY, 0); 687 if ((bp->b_flags & B_CACHE) == 0) { 688 bqrelse(bp); 689 error = bread(ap->a_vp, base_offset, 690 blksize, &bp); 691 } 692 } else if (offset == 0 && uio->uio_resid >= blksize) { 693 /* 694 * Even though we are entirely overwriting the buffer 695 * we may still have to zero it out to avoid a 696 * mmap/write visibility issue. 697 */ 698 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 699 if ((bp->b_flags & B_CACHE) == 0) 700 vfs_bio_clrbuf(bp); 701 } else if (base_offset >= ip->ino_data.size) { 702 /* 703 * If the base offset of the buffer is beyond the 704 * file EOF, we don't have to issue a read. 705 */ 706 bp = getblk(ap->a_vp, base_offset, 707 blksize, GETBLK_BHEAVY, 0); 708 vfs_bio_clrbuf(bp); 709 } else { 710 /* 711 * Partial overwrite, read in any missing bits then 712 * replace the portion being written. 713 */ 714 error = bread(ap->a_vp, base_offset, blksize, &bp); 715 if (error == 0) 716 bheavy(bp); 717 } 718 if (error == 0) 719 error = uiomovebp(bp, bp->b_data + offset, n, uio); 720 721 lwkt_gettoken(&hmp->fs_token); 722 723 /* 724 * Generate REDO records if enabled and redo_count will not 725 * exceeded the limit. 726 * 727 * If redo_count exceeds the limit we stop generating records 728 * and clear HAMMER_INODE_REDO. This will cause the next 729 * fsync() to do a full meta-data sync instead of just an 730 * UNDO/REDO fifo update. 731 * 732 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 733 * will still be tracked. The tracks will be terminated 734 * when the related meta-data (including possible data 735 * modifications which are not tracked via REDO) is 736 * flushed. 737 */ 738 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 739 if (ip->redo_count < hammer_limit_redo) { 740 bp->b_flags |= B_VFSFLAG1; 741 error = hammer_generate_redo(&trans, ip, 742 base_offset + offset, 743 HAMMER_REDO_WRITE, 744 bp->b_data + offset, 745 (size_t)n); 746 } else { 747 ip->flags &= ~HAMMER_INODE_REDO; 748 } 749 } 750 751 /* 752 * If we screwed up we have to undo any VM size changes we 753 * made. 754 */ 755 if (error) { 756 brelse(bp); 757 if (fixsize) { 758 nvtruncbuf(ap->a_vp, ip->ino_data.size, 759 hammer_blocksize(ip->ino_data.size), 760 hammer_blockoff(ip->ino_data.size), 761 0); 762 } 763 lwkt_reltoken(&hmp->fs_token); 764 break; 765 } 766 kflags |= NOTE_WRITE; 767 hammer_stats_file_write += n; 768 if (blksize == HAMMER_XBUFSIZE) 769 bp->b_flags |= B_CLUSTEROK; 770 if (ip->ino_data.size < uio->uio_offset) { 771 ip->ino_data.size = uio->uio_offset; 772 flags = HAMMER_INODE_SDIRTY; 773 } else { 774 flags = 0; 775 } 776 ip->ino_data.mtime = trans.time; 777 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 778 hammer_modify_inode(&trans, ip, flags); 779 780 /* 781 * Once we dirty the buffer any cached zone-X offset 782 * becomes invalid. HAMMER NOTE: no-history mode cannot 783 * allow overwriting over the same data sector unless 784 * we provide UNDOs for the old data, which we don't. 785 */ 786 bp->b_bio2.bio_offset = NOOFFSET; 787 788 lwkt_reltoken(&hmp->fs_token); 789 790 /* 791 * Final buffer disposition. 792 * 793 * Because meta-data updates are deferred, HAMMER is 794 * especially sensitive to excessive bdwrite()s because 795 * the I/O stream is not broken up by disk reads. So the 796 * buffer cache simply cannot keep up. 797 * 798 * WARNING! blksize is variable. cluster_write() is 799 * expected to not blow up if it encounters 800 * buffers that do not match the passed blksize. 801 * 802 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 803 * The ip->rsv_recs check should burst-flush the data. 804 * If we queue it immediately the buf could be left 805 * locked on the device queue for a very long time. 806 * 807 * However, failing to flush a dirty buffer out when 808 * issued from the pageout daemon can result in a low 809 * memory deadlock against bio_page_alloc(), so we 810 * have to bawrite() on IO_ASYNC as well. 811 * 812 * NOTE! To avoid degenerate stalls due to mismatched block 813 * sizes we only honor IO_DIRECT on the write which 814 * abuts the end of the buffer. However, we must 815 * honor IO_SYNC in case someone is silly enough to 816 * configure a HAMMER file as swap, or when HAMMER 817 * is serving NFS (for commits). Ick ick. 818 */ 819 bp->b_flags |= B_AGE; 820 if (blksize == HAMMER_XBUFSIZE) 821 bp->b_flags |= B_CLUSTEROK; 822 823 if (ap->a_ioflag & IO_SYNC) { 824 bwrite(bp); 825 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 826 bawrite(bp); 827 } else if (ap->a_ioflag & IO_ASYNC) { 828 bawrite(bp); 829 } else if (hammer_cluster_enable && 830 !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 831 if (base_offset < HAMMER_XDEMARC) 832 cluster_eof = hammer_blockdemarc(base_offset, 833 ip->ino_data.size); 834 else 835 cluster_eof = ip->ino_data.size; 836 cluster_write(bp, cluster_eof, blksize, seqcount); 837 } else { 838 bdwrite(bp); 839 } 840 } 841 hammer_done_transaction(&trans); 842 hammer_knote(ap->a_vp, kflags); 843 844 return (error); 845 } 846 847 /* 848 * hammer_vop_access { vp, mode, cred } 849 * 850 * MPSAFE - does not require fs_token 851 */ 852 static 853 int 854 hammer_vop_access(struct vop_access_args *ap) 855 { 856 struct hammer_inode *ip = VTOI(ap->a_vp); 857 uid_t uid; 858 gid_t gid; 859 int error; 860 861 ++hammer_stats_file_iopsr; 862 uid = hammer_to_unix_xid(&ip->ino_data.uid); 863 gid = hammer_to_unix_xid(&ip->ino_data.gid); 864 865 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 866 ip->ino_data.uflags); 867 return (error); 868 } 869 870 /* 871 * hammer_vop_advlock { vp, id, op, fl, flags } 872 * 873 * MPSAFE - does not require fs_token 874 */ 875 static 876 int 877 hammer_vop_advlock(struct vop_advlock_args *ap) 878 { 879 hammer_inode_t ip = VTOI(ap->a_vp); 880 881 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 882 } 883 884 /* 885 * hammer_vop_close { vp, fflag } 886 * 887 * We can only sync-on-close for normal closes. XXX disabled for now. 888 */ 889 static 890 int 891 hammer_vop_close(struct vop_close_args *ap) 892 { 893 #if 0 894 struct vnode *vp = ap->a_vp; 895 hammer_inode_t ip = VTOI(vp); 896 int waitfor; 897 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 898 if (vn_islocked(vp) == LK_EXCLUSIVE && 899 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 900 if (ip->flags & HAMMER_INODE_CLOSESYNC) 901 waitfor = MNT_WAIT; 902 else 903 waitfor = MNT_NOWAIT; 904 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 905 HAMMER_INODE_CLOSEASYNC); 906 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 907 } 908 } 909 #endif 910 return (vop_stdclose(ap)); 911 } 912 913 /* 914 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 915 * 916 * The operating system has already ensured that the directory entry 917 * does not exist and done all appropriate namespace locking. 918 */ 919 static 920 int 921 hammer_vop_ncreate(struct vop_ncreate_args *ap) 922 { 923 struct hammer_transaction trans; 924 struct hammer_inode *dip; 925 struct hammer_inode *nip; 926 struct nchandle *nch; 927 hammer_mount_t hmp; 928 int error; 929 930 nch = ap->a_nch; 931 dip = VTOI(ap->a_dvp); 932 hmp = dip->hmp; 933 934 if (dip->flags & HAMMER_INODE_RO) 935 return (EROFS); 936 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 937 return (error); 938 939 /* 940 * Create a transaction to cover the operations we perform. 941 */ 942 lwkt_gettoken(&hmp->fs_token); 943 hammer_start_transaction(&trans, hmp); 944 ++hammer_stats_file_iopsw; 945 946 /* 947 * Create a new filesystem object of the requested type. The 948 * returned inode will be referenced and shared-locked to prevent 949 * it from being moved to the flusher. 950 */ 951 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 952 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 953 NULL, &nip); 954 if (error) { 955 hkprintf("hammer_create_inode error %d\n", error); 956 hammer_done_transaction(&trans); 957 *ap->a_vpp = NULL; 958 lwkt_reltoken(&hmp->fs_token); 959 return (error); 960 } 961 962 /* 963 * Add the new filesystem object to the directory. This will also 964 * bump the inode's link count. 965 */ 966 error = hammer_ip_add_directory(&trans, dip, 967 nch->ncp->nc_name, nch->ncp->nc_nlen, 968 nip); 969 if (error) 970 hkprintf("hammer_ip_add_directory error %d\n", error); 971 972 /* 973 * Finish up. 974 */ 975 if (error) { 976 hammer_rel_inode(nip, 0); 977 hammer_done_transaction(&trans); 978 *ap->a_vpp = NULL; 979 } else { 980 error = hammer_get_vnode(nip, ap->a_vpp); 981 hammer_done_transaction(&trans); 982 hammer_rel_inode(nip, 0); 983 if (error == 0) { 984 cache_setunresolved(ap->a_nch); 985 cache_setvp(ap->a_nch, *ap->a_vpp); 986 } 987 hammer_knote(ap->a_dvp, NOTE_WRITE); 988 } 989 lwkt_reltoken(&hmp->fs_token); 990 return (error); 991 } 992 993 /* 994 * hammer_vop_getattr { vp, vap } 995 * 996 * Retrieve an inode's attribute information. When accessing inodes 997 * historically we fake the atime field to ensure consistent results. 998 * The atime field is stored in the B-Tree element and allowed to be 999 * updated without cycling the element. 1000 * 1001 * MPSAFE - does not require fs_token 1002 */ 1003 static 1004 int 1005 hammer_vop_getattr(struct vop_getattr_args *ap) 1006 { 1007 struct hammer_inode *ip = VTOI(ap->a_vp); 1008 struct vattr *vap = ap->a_vap; 1009 1010 /* 1011 * We want the fsid to be different when accessing a filesystem 1012 * with different as-of's so programs like diff don't think 1013 * the files are the same. 1014 * 1015 * We also want the fsid to be the same when comparing snapshots, 1016 * or when comparing mirrors (which might be backed by different 1017 * physical devices). HAMMER fsids are based on the PFS's 1018 * shared_uuid field. 1019 * 1020 * XXX there is a chance of collision here. The va_fsid reported 1021 * by stat is different from the more involved fsid used in the 1022 * mount structure. 1023 */ 1024 ++hammer_stats_file_iopsr; 1025 hammer_lock_sh(&ip->lock); 1026 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 1027 (u_int32_t)(ip->obj_asof >> 32); 1028 1029 vap->va_fileid = ip->ino_leaf.base.obj_id; 1030 vap->va_mode = ip->ino_data.mode; 1031 vap->va_nlink = ip->ino_data.nlinks; 1032 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1033 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1034 vap->va_rmajor = 0; 1035 vap->va_rminor = 0; 1036 vap->va_size = ip->ino_data.size; 1037 1038 /* 1039 * Special case for @@PFS softlinks. The actual size of the 1040 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1041 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1042 * 1043 * Note that userspace hammer command does not allow users to 1044 * create a @@PFS softlink under an existing other PFS (id!=0) 1045 * so the ip localization here for @@PFS softlink is always 0. 1046 */ 1047 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1048 ip->ino_data.size == 10 && 1049 ip->obj_asof == HAMMER_MAX_TID && 1050 ip->obj_localization == 0 && 1051 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1052 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1053 vap->va_size = 26; 1054 else 1055 vap->va_size = 10; 1056 } 1057 1058 /* 1059 * We must provide a consistent atime and mtime for snapshots 1060 * so people can do a 'tar cf - ... | md5' on them and get 1061 * consistent results. 1062 */ 1063 if (ip->flags & HAMMER_INODE_RO) { 1064 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1065 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1066 } else { 1067 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1068 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1069 } 1070 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1071 vap->va_flags = ip->ino_data.uflags; 1072 vap->va_gen = 1; /* hammer inums are unique for all time */ 1073 vap->va_blocksize = HAMMER_BUFSIZE; 1074 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1075 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1076 ~HAMMER_XBUFMASK64; 1077 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 1078 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1079 ~HAMMER_BUFMASK64; 1080 } else { 1081 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1082 } 1083 1084 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1085 vap->va_filerev = 0; /* XXX */ 1086 vap->va_uid_uuid = ip->ino_data.uid; 1087 vap->va_gid_uuid = ip->ino_data.gid; 1088 vap->va_fsid_uuid = ip->hmp->fsid; 1089 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1090 VA_FSID_UUID_VALID; 1091 1092 switch (ip->ino_data.obj_type) { 1093 case HAMMER_OBJTYPE_CDEV: 1094 case HAMMER_OBJTYPE_BDEV: 1095 vap->va_rmajor = ip->ino_data.rmajor; 1096 vap->va_rminor = ip->ino_data.rminor; 1097 break; 1098 default: 1099 break; 1100 } 1101 hammer_unlock(&ip->lock); 1102 return(0); 1103 } 1104 1105 /* 1106 * hammer_vop_nresolve { nch, dvp, cred } 1107 * 1108 * Locate the requested directory entry. 1109 */ 1110 static 1111 int 1112 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1113 { 1114 struct hammer_transaction trans; 1115 struct namecache *ncp; 1116 hammer_mount_t hmp; 1117 hammer_inode_t dip; 1118 hammer_inode_t ip; 1119 hammer_tid_t asof; 1120 struct hammer_cursor cursor; 1121 struct vnode *vp; 1122 int64_t namekey; 1123 int error; 1124 int i; 1125 int nlen; 1126 int flags; 1127 int ispfs; 1128 int64_t obj_id; 1129 u_int32_t localization; 1130 u_int32_t max_iterations; 1131 1132 /* 1133 * Misc initialization, plus handle as-of name extensions. Look for 1134 * the '@@' extension. Note that as-of files and directories cannot 1135 * be modified. 1136 */ 1137 dip = VTOI(ap->a_dvp); 1138 ncp = ap->a_nch->ncp; 1139 asof = dip->obj_asof; 1140 localization = dip->obj_localization; /* for code consistency */ 1141 nlen = ncp->nc_nlen; 1142 flags = dip->flags & HAMMER_INODE_RO; 1143 ispfs = 0; 1144 hmp = dip->hmp; 1145 1146 lwkt_gettoken(&hmp->fs_token); 1147 hammer_simple_transaction(&trans, hmp); 1148 ++hammer_stats_file_iopsr; 1149 1150 for (i = 0; i < nlen; ++i) { 1151 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1152 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1153 &ispfs, &asof, &localization); 1154 if (error != 0) { 1155 i = nlen; 1156 break; 1157 } 1158 if (asof != HAMMER_MAX_TID) 1159 flags |= HAMMER_INODE_RO; 1160 break; 1161 } 1162 } 1163 nlen = i; 1164 1165 /* 1166 * If this is a PFS softlink we dive into the PFS 1167 */ 1168 if (ispfs && nlen == 0) { 1169 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1170 asof, localization, 1171 flags, &error); 1172 if (error == 0) { 1173 error = hammer_get_vnode(ip, &vp); 1174 hammer_rel_inode(ip, 0); 1175 } else { 1176 vp = NULL; 1177 } 1178 if (error == 0) { 1179 vn_unlock(vp); 1180 cache_setvp(ap->a_nch, vp); 1181 vrele(vp); 1182 } 1183 goto done; 1184 } 1185 1186 /* 1187 * If there is no path component the time extension is relative to dip. 1188 * e.g. "fubar/@@<snapshot>" 1189 * 1190 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1191 * e.g. "fubar/.@@<snapshot>" 1192 * 1193 * ".." is handled by the kernel. We do not currently handle 1194 * "..@<snapshot>". 1195 */ 1196 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1197 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1198 asof, dip->obj_localization, 1199 flags, &error); 1200 if (error == 0) { 1201 error = hammer_get_vnode(ip, &vp); 1202 hammer_rel_inode(ip, 0); 1203 } else { 1204 vp = NULL; 1205 } 1206 if (error == 0) { 1207 vn_unlock(vp); 1208 cache_setvp(ap->a_nch, vp); 1209 vrele(vp); 1210 } 1211 goto done; 1212 } 1213 1214 /* 1215 * Calculate the namekey and setup the key range for the scan. This 1216 * works kinda like a chained hash table where the lower 32 bits 1217 * of the namekey synthesize the chain. 1218 * 1219 * The key range is inclusive of both key_beg and key_end. 1220 */ 1221 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 1222 &max_iterations); 1223 1224 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1225 cursor.key_beg.localization = dip->obj_localization + 1226 hammer_dir_localization(dip); 1227 cursor.key_beg.obj_id = dip->obj_id; 1228 cursor.key_beg.key = namekey; 1229 cursor.key_beg.create_tid = 0; 1230 cursor.key_beg.delete_tid = 0; 1231 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1232 cursor.key_beg.obj_type = 0; 1233 1234 cursor.key_end = cursor.key_beg; 1235 cursor.key_end.key += max_iterations; 1236 cursor.asof = asof; 1237 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1238 1239 /* 1240 * Scan all matching records (the chain), locate the one matching 1241 * the requested path component. 1242 * 1243 * The hammer_ip_*() functions merge in-memory records with on-disk 1244 * records for the purposes of the search. 1245 */ 1246 obj_id = 0; 1247 localization = HAMMER_DEF_LOCALIZATION; 1248 1249 if (error == 0) { 1250 error = hammer_ip_first(&cursor); 1251 while (error == 0) { 1252 error = hammer_ip_resolve_data(&cursor); 1253 if (error) 1254 break; 1255 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1256 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1257 obj_id = cursor.data->entry.obj_id; 1258 localization = cursor.data->entry.localization; 1259 break; 1260 } 1261 error = hammer_ip_next(&cursor); 1262 } 1263 } 1264 hammer_done_cursor(&cursor); 1265 1266 /* 1267 * Lookup the obj_id. This should always succeed. If it does not 1268 * the filesystem may be damaged and we return a dummy inode. 1269 */ 1270 if (error == 0) { 1271 ip = hammer_get_inode(&trans, dip, obj_id, 1272 asof, localization, 1273 flags, &error); 1274 if (error == ENOENT) { 1275 kprintf("HAMMER: WARNING: Missing " 1276 "inode for dirent \"%s\"\n" 1277 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n", 1278 ncp->nc_name, 1279 (long long)obj_id, (long long)asof, 1280 localization); 1281 error = 0; 1282 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1283 asof, localization, 1284 flags, &error); 1285 } 1286 if (error == 0) { 1287 error = hammer_get_vnode(ip, &vp); 1288 hammer_rel_inode(ip, 0); 1289 } else { 1290 vp = NULL; 1291 } 1292 if (error == 0) { 1293 vn_unlock(vp); 1294 cache_setvp(ap->a_nch, vp); 1295 vrele(vp); 1296 } 1297 } else if (error == ENOENT) { 1298 cache_setvp(ap->a_nch, NULL); 1299 } 1300 done: 1301 hammer_done_transaction(&trans); 1302 lwkt_reltoken(&hmp->fs_token); 1303 return (error); 1304 } 1305 1306 /* 1307 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1308 * 1309 * Locate the parent directory of a directory vnode. 1310 * 1311 * dvp is referenced but not locked. *vpp must be returned referenced and 1312 * locked. A parent_obj_id of 0 does not necessarily indicate that we are 1313 * at the root, instead it could indicate that the directory we were in was 1314 * removed. 1315 * 1316 * NOTE: as-of sequences are not linked into the directory structure. If 1317 * we are at the root with a different asof then the mount point, reload 1318 * the same directory with the mount point's asof. I'm not sure what this 1319 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1320 * get confused, but it hasn't been tested. 1321 */ 1322 static 1323 int 1324 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1325 { 1326 struct hammer_transaction trans; 1327 struct hammer_inode *dip; 1328 struct hammer_inode *ip; 1329 hammer_mount_t hmp; 1330 int64_t parent_obj_id; 1331 u_int32_t parent_obj_localization; 1332 hammer_tid_t asof; 1333 int error; 1334 1335 dip = VTOI(ap->a_dvp); 1336 asof = dip->obj_asof; 1337 hmp = dip->hmp; 1338 1339 /* 1340 * Whos are parent? This could be the root of a pseudo-filesystem 1341 * whos parent is in another localization domain. 1342 */ 1343 lwkt_gettoken(&hmp->fs_token); 1344 parent_obj_id = dip->ino_data.parent_obj_id; 1345 if (dip->obj_id == HAMMER_OBJID_ROOT) 1346 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1347 else 1348 parent_obj_localization = dip->obj_localization; 1349 1350 /* 1351 * It's probably a PFS root when dip->ino_data.parent_obj_id is 0. 1352 */ 1353 if (parent_obj_id == 0) { 1354 if (dip->obj_id == HAMMER_OBJID_ROOT && 1355 asof != hmp->asof) { 1356 parent_obj_id = dip->obj_id; 1357 asof = hmp->asof; 1358 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1359 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1360 (long long)dip->obj_asof); 1361 } else { 1362 *ap->a_vpp = NULL; 1363 lwkt_reltoken(&hmp->fs_token); 1364 return ENOENT; 1365 } 1366 } 1367 1368 hammer_simple_transaction(&trans, hmp); 1369 ++hammer_stats_file_iopsr; 1370 1371 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1372 asof, parent_obj_localization, 1373 dip->flags, &error); 1374 if (ip) { 1375 error = hammer_get_vnode(ip, ap->a_vpp); 1376 hammer_rel_inode(ip, 0); 1377 } else { 1378 *ap->a_vpp = NULL; 1379 } 1380 hammer_done_transaction(&trans); 1381 lwkt_reltoken(&hmp->fs_token); 1382 return (error); 1383 } 1384 1385 /* 1386 * hammer_vop_nlink { nch, dvp, vp, cred } 1387 */ 1388 static 1389 int 1390 hammer_vop_nlink(struct vop_nlink_args *ap) 1391 { 1392 struct hammer_transaction trans; 1393 struct hammer_inode *dip; 1394 struct hammer_inode *ip; 1395 struct nchandle *nch; 1396 hammer_mount_t hmp; 1397 int error; 1398 1399 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1400 return(EXDEV); 1401 1402 nch = ap->a_nch; 1403 dip = VTOI(ap->a_dvp); 1404 ip = VTOI(ap->a_vp); 1405 hmp = dip->hmp; 1406 1407 if (dip->obj_localization != ip->obj_localization) 1408 return(EXDEV); 1409 1410 if (dip->flags & HAMMER_INODE_RO) 1411 return (EROFS); 1412 if (ip->flags & HAMMER_INODE_RO) 1413 return (EROFS); 1414 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1415 return (error); 1416 1417 /* 1418 * Create a transaction to cover the operations we perform. 1419 */ 1420 lwkt_gettoken(&hmp->fs_token); 1421 hammer_start_transaction(&trans, hmp); 1422 ++hammer_stats_file_iopsw; 1423 1424 /* 1425 * Add the filesystem object to the directory. Note that neither 1426 * dip nor ip are referenced or locked, but their vnodes are 1427 * referenced. This function will bump the inode's link count. 1428 */ 1429 error = hammer_ip_add_directory(&trans, dip, 1430 nch->ncp->nc_name, nch->ncp->nc_nlen, 1431 ip); 1432 1433 /* 1434 * Finish up. 1435 */ 1436 if (error == 0) { 1437 cache_setunresolved(nch); 1438 cache_setvp(nch, ap->a_vp); 1439 } 1440 hammer_done_transaction(&trans); 1441 hammer_knote(ap->a_vp, NOTE_LINK); 1442 hammer_knote(ap->a_dvp, NOTE_WRITE); 1443 lwkt_reltoken(&hmp->fs_token); 1444 return (error); 1445 } 1446 1447 /* 1448 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1449 * 1450 * The operating system has already ensured that the directory entry 1451 * does not exist and done all appropriate namespace locking. 1452 */ 1453 static 1454 int 1455 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1456 { 1457 struct hammer_transaction trans; 1458 struct hammer_inode *dip; 1459 struct hammer_inode *nip; 1460 struct nchandle *nch; 1461 hammer_mount_t hmp; 1462 int error; 1463 1464 nch = ap->a_nch; 1465 dip = VTOI(ap->a_dvp); 1466 hmp = dip->hmp; 1467 1468 if (dip->flags & HAMMER_INODE_RO) 1469 return (EROFS); 1470 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1471 return (error); 1472 1473 /* 1474 * Create a transaction to cover the operations we perform. 1475 */ 1476 lwkt_gettoken(&hmp->fs_token); 1477 hammer_start_transaction(&trans, hmp); 1478 ++hammer_stats_file_iopsw; 1479 1480 /* 1481 * Create a new filesystem object of the requested type. The 1482 * returned inode will be referenced but not locked. 1483 */ 1484 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1485 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1486 NULL, &nip); 1487 if (error) { 1488 hkprintf("hammer_mkdir error %d\n", error); 1489 hammer_done_transaction(&trans); 1490 *ap->a_vpp = NULL; 1491 lwkt_reltoken(&hmp->fs_token); 1492 return (error); 1493 } 1494 /* 1495 * Add the new filesystem object to the directory. This will also 1496 * bump the inode's link count. 1497 */ 1498 error = hammer_ip_add_directory(&trans, dip, 1499 nch->ncp->nc_name, nch->ncp->nc_nlen, 1500 nip); 1501 if (error) 1502 hkprintf("hammer_mkdir (add) error %d\n", error); 1503 1504 /* 1505 * Finish up. 1506 */ 1507 if (error) { 1508 hammer_rel_inode(nip, 0); 1509 *ap->a_vpp = NULL; 1510 } else { 1511 error = hammer_get_vnode(nip, ap->a_vpp); 1512 hammer_rel_inode(nip, 0); 1513 if (error == 0) { 1514 cache_setunresolved(ap->a_nch); 1515 cache_setvp(ap->a_nch, *ap->a_vpp); 1516 } 1517 } 1518 hammer_done_transaction(&trans); 1519 if (error == 0) 1520 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1521 lwkt_reltoken(&hmp->fs_token); 1522 return (error); 1523 } 1524 1525 /* 1526 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1527 * 1528 * The operating system has already ensured that the directory entry 1529 * does not exist and done all appropriate namespace locking. 1530 */ 1531 static 1532 int 1533 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1534 { 1535 struct hammer_transaction trans; 1536 struct hammer_inode *dip; 1537 struct hammer_inode *nip; 1538 struct nchandle *nch; 1539 hammer_mount_t hmp; 1540 int error; 1541 1542 nch = ap->a_nch; 1543 dip = VTOI(ap->a_dvp); 1544 hmp = dip->hmp; 1545 1546 if (dip->flags & HAMMER_INODE_RO) 1547 return (EROFS); 1548 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1549 return (error); 1550 1551 /* 1552 * Create a transaction to cover the operations we perform. 1553 */ 1554 lwkt_gettoken(&hmp->fs_token); 1555 hammer_start_transaction(&trans, hmp); 1556 ++hammer_stats_file_iopsw; 1557 1558 /* 1559 * Create a new filesystem object of the requested type. The 1560 * returned inode will be referenced but not locked. 1561 * 1562 * If mknod specifies a directory a pseudo-fs is created. 1563 */ 1564 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1565 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1566 NULL, &nip); 1567 if (error) { 1568 hammer_done_transaction(&trans); 1569 *ap->a_vpp = NULL; 1570 lwkt_reltoken(&hmp->fs_token); 1571 return (error); 1572 } 1573 1574 /* 1575 * Add the new filesystem object to the directory. This will also 1576 * bump the inode's link count. 1577 */ 1578 error = hammer_ip_add_directory(&trans, dip, 1579 nch->ncp->nc_name, nch->ncp->nc_nlen, 1580 nip); 1581 1582 /* 1583 * Finish up. 1584 */ 1585 if (error) { 1586 hammer_rel_inode(nip, 0); 1587 *ap->a_vpp = NULL; 1588 } else { 1589 error = hammer_get_vnode(nip, ap->a_vpp); 1590 hammer_rel_inode(nip, 0); 1591 if (error == 0) { 1592 cache_setunresolved(ap->a_nch); 1593 cache_setvp(ap->a_nch, *ap->a_vpp); 1594 } 1595 } 1596 hammer_done_transaction(&trans); 1597 if (error == 0) 1598 hammer_knote(ap->a_dvp, NOTE_WRITE); 1599 lwkt_reltoken(&hmp->fs_token); 1600 return (error); 1601 } 1602 1603 /* 1604 * hammer_vop_open { vp, mode, cred, fp } 1605 * 1606 * MPSAFE (does not require fs_token) 1607 */ 1608 static 1609 int 1610 hammer_vop_open(struct vop_open_args *ap) 1611 { 1612 hammer_inode_t ip; 1613 1614 ++hammer_stats_file_iopsr; 1615 ip = VTOI(ap->a_vp); 1616 1617 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1618 return (EROFS); 1619 return(vop_stdopen(ap)); 1620 } 1621 1622 /* 1623 * hammer_vop_print { vp } 1624 */ 1625 static 1626 int 1627 hammer_vop_print(struct vop_print_args *ap) 1628 { 1629 return EOPNOTSUPP; 1630 } 1631 1632 /* 1633 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1634 */ 1635 static 1636 int 1637 hammer_vop_readdir(struct vop_readdir_args *ap) 1638 { 1639 struct hammer_transaction trans; 1640 struct hammer_cursor cursor; 1641 struct hammer_inode *ip; 1642 hammer_mount_t hmp; 1643 struct uio *uio; 1644 hammer_base_elm_t base; 1645 int error; 1646 int cookie_index; 1647 int ncookies; 1648 off_t *cookies; 1649 off_t saveoff; 1650 int r; 1651 int dtype; 1652 1653 ++hammer_stats_file_iopsr; 1654 ip = VTOI(ap->a_vp); 1655 uio = ap->a_uio; 1656 saveoff = uio->uio_offset; 1657 hmp = ip->hmp; 1658 1659 if (ap->a_ncookies) { 1660 ncookies = uio->uio_resid / 16 + 1; 1661 if (ncookies > 1024) 1662 ncookies = 1024; 1663 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1664 cookie_index = 0; 1665 } else { 1666 ncookies = -1; 1667 cookies = NULL; 1668 cookie_index = 0; 1669 } 1670 1671 lwkt_gettoken(&hmp->fs_token); 1672 hammer_simple_transaction(&trans, hmp); 1673 1674 /* 1675 * Handle artificial entries 1676 * 1677 * It should be noted that the minimum value for a directory 1678 * hash key on-media is 0x0000000100000000, so we can use anything 1679 * less then that to represent our 'special' key space. 1680 */ 1681 error = 0; 1682 if (saveoff == 0) { 1683 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1684 if (r) 1685 goto done; 1686 if (cookies) 1687 cookies[cookie_index] = saveoff; 1688 ++saveoff; 1689 ++cookie_index; 1690 if (cookie_index == ncookies) 1691 goto done; 1692 } 1693 if (saveoff == 1) { 1694 if (ip->ino_data.parent_obj_id) { 1695 r = vop_write_dirent(&error, uio, 1696 ip->ino_data.parent_obj_id, 1697 DT_DIR, 2, ".."); 1698 } else { 1699 r = vop_write_dirent(&error, uio, 1700 ip->obj_id, DT_DIR, 2, ".."); 1701 } 1702 if (r) 1703 goto done; 1704 if (cookies) 1705 cookies[cookie_index] = saveoff; 1706 ++saveoff; 1707 ++cookie_index; 1708 if (cookie_index == ncookies) 1709 goto done; 1710 } 1711 1712 /* 1713 * Key range (begin and end inclusive) to scan. Directory keys 1714 * directly translate to a 64 bit 'seek' position. 1715 */ 1716 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1717 cursor.key_beg.localization = ip->obj_localization + 1718 hammer_dir_localization(ip); 1719 cursor.key_beg.obj_id = ip->obj_id; 1720 cursor.key_beg.create_tid = 0; 1721 cursor.key_beg.delete_tid = 0; 1722 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1723 cursor.key_beg.obj_type = 0; 1724 cursor.key_beg.key = saveoff; 1725 1726 cursor.key_end = cursor.key_beg; 1727 cursor.key_end.key = HAMMER_MAX_KEY; 1728 cursor.asof = ip->obj_asof; 1729 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1730 1731 error = hammer_ip_first(&cursor); 1732 1733 while (error == 0) { 1734 error = hammer_ip_resolve_data(&cursor); 1735 if (error) 1736 break; 1737 base = &cursor.leaf->base; 1738 saveoff = base->key; 1739 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1740 1741 if (base->obj_id != ip->obj_id) 1742 panic("readdir: bad record at %p", cursor.node); 1743 1744 /* 1745 * Convert pseudo-filesystems into softlinks 1746 */ 1747 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1748 r = vop_write_dirent( 1749 &error, uio, cursor.data->entry.obj_id, 1750 dtype, 1751 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1752 (void *)cursor.data->entry.name); 1753 if (r) 1754 break; 1755 ++saveoff; 1756 if (cookies) 1757 cookies[cookie_index] = base->key; 1758 ++cookie_index; 1759 if (cookie_index == ncookies) 1760 break; 1761 error = hammer_ip_next(&cursor); 1762 } 1763 hammer_done_cursor(&cursor); 1764 1765 done: 1766 hammer_done_transaction(&trans); 1767 1768 if (ap->a_eofflag) 1769 *ap->a_eofflag = (error == ENOENT); 1770 uio->uio_offset = saveoff; 1771 if (error && cookie_index == 0) { 1772 if (error == ENOENT) 1773 error = 0; 1774 if (cookies) { 1775 kfree(cookies, M_TEMP); 1776 *ap->a_ncookies = 0; 1777 *ap->a_cookies = NULL; 1778 } 1779 } else { 1780 if (error == ENOENT) 1781 error = 0; 1782 if (cookies) { 1783 *ap->a_ncookies = cookie_index; 1784 *ap->a_cookies = cookies; 1785 } 1786 } 1787 lwkt_reltoken(&hmp->fs_token); 1788 return(error); 1789 } 1790 1791 /* 1792 * hammer_vop_readlink { vp, uio, cred } 1793 */ 1794 static 1795 int 1796 hammer_vop_readlink(struct vop_readlink_args *ap) 1797 { 1798 struct hammer_transaction trans; 1799 struct hammer_cursor cursor; 1800 struct hammer_inode *ip; 1801 hammer_mount_t hmp; 1802 char buf[32]; 1803 u_int32_t localization; 1804 hammer_pseudofs_inmem_t pfsm; 1805 int error; 1806 1807 ip = VTOI(ap->a_vp); 1808 hmp = ip->hmp; 1809 1810 lwkt_gettoken(&hmp->fs_token); 1811 1812 /* 1813 * Shortcut if the symlink data was stuffed into ino_data. 1814 * 1815 * Also expand special "@@PFS%05d" softlinks (expansion only 1816 * occurs for non-historical (current) accesses made from the 1817 * primary filesystem). 1818 * 1819 * Note that userspace hammer command does not allow users to 1820 * create a @@PFS softlink under an existing other PFS (id!=0) 1821 * so the ip localization here for @@PFS softlink is always 0. 1822 */ 1823 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1824 char *ptr; 1825 int bytes; 1826 1827 ptr = ip->ino_data.ext.symlink; 1828 bytes = (int)ip->ino_data.size; 1829 if (bytes == 10 && 1830 ip->obj_asof == HAMMER_MAX_TID && 1831 ip->obj_localization == 0 && 1832 strncmp(ptr, "@@PFS", 5) == 0) { 1833 hammer_simple_transaction(&trans, hmp); 1834 bcopy(ptr + 5, buf, 5); 1835 buf[5] = 0; 1836 localization = strtoul(buf, NULL, 10) << 16; 1837 pfsm = hammer_load_pseudofs(&trans, localization, 1838 &error); 1839 if (error == 0) { 1840 if (pfsm->pfsd.mirror_flags & 1841 HAMMER_PFSD_SLAVE) { 1842 /* vap->va_size == 26 */ 1843 ksnprintf(buf, sizeof(buf), 1844 "@@0x%016llx:%05d", 1845 (long long)pfsm->pfsd.sync_end_tid, 1846 localization >> 16); 1847 } else { 1848 /* vap->va_size == 10 */ 1849 ksnprintf(buf, sizeof(buf), 1850 "@@-1:%05d", 1851 localization >> 16); 1852 #if 0 1853 ksnprintf(buf, sizeof(buf), 1854 "@@0x%016llx:%05d", 1855 (long long)HAMMER_MAX_TID, 1856 localization >> 16); 1857 #endif 1858 } 1859 ptr = buf; 1860 bytes = strlen(buf); 1861 } 1862 if (pfsm) 1863 hammer_rel_pseudofs(hmp, pfsm); 1864 hammer_done_transaction(&trans); 1865 } 1866 error = uiomove(ptr, bytes, ap->a_uio); 1867 lwkt_reltoken(&hmp->fs_token); 1868 return(error); 1869 } 1870 1871 /* 1872 * Long version 1873 */ 1874 hammer_simple_transaction(&trans, hmp); 1875 ++hammer_stats_file_iopsr; 1876 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1877 1878 /* 1879 * Key range (begin and end inclusive) to scan. Directory keys 1880 * directly translate to a 64 bit 'seek' position. 1881 */ 1882 cursor.key_beg.localization = ip->obj_localization + 1883 HAMMER_LOCALIZE_MISC; 1884 cursor.key_beg.obj_id = ip->obj_id; 1885 cursor.key_beg.create_tid = 0; 1886 cursor.key_beg.delete_tid = 0; 1887 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1888 cursor.key_beg.obj_type = 0; 1889 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1890 cursor.asof = ip->obj_asof; 1891 cursor.flags |= HAMMER_CURSOR_ASOF; 1892 1893 error = hammer_ip_lookup(&cursor); 1894 if (error == 0) { 1895 error = hammer_ip_resolve_data(&cursor); 1896 if (error == 0) { 1897 KKASSERT(cursor.leaf->data_len >= 1898 HAMMER_SYMLINK_NAME_OFF); 1899 error = uiomove(cursor.data->symlink.name, 1900 cursor.leaf->data_len - 1901 HAMMER_SYMLINK_NAME_OFF, 1902 ap->a_uio); 1903 } 1904 } 1905 hammer_done_cursor(&cursor); 1906 hammer_done_transaction(&trans); 1907 lwkt_reltoken(&hmp->fs_token); 1908 return(error); 1909 } 1910 1911 /* 1912 * hammer_vop_nremove { nch, dvp, cred } 1913 */ 1914 static 1915 int 1916 hammer_vop_nremove(struct vop_nremove_args *ap) 1917 { 1918 struct hammer_transaction trans; 1919 struct hammer_inode *dip; 1920 hammer_mount_t hmp; 1921 int error; 1922 1923 dip = VTOI(ap->a_dvp); 1924 hmp = dip->hmp; 1925 1926 if (hammer_nohistory(dip) == 0 && 1927 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1928 return (error); 1929 } 1930 1931 lwkt_gettoken(&hmp->fs_token); 1932 hammer_start_transaction(&trans, hmp); 1933 ++hammer_stats_file_iopsw; 1934 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1935 hammer_done_transaction(&trans); 1936 if (error == 0) 1937 hammer_knote(ap->a_dvp, NOTE_WRITE); 1938 lwkt_reltoken(&hmp->fs_token); 1939 return (error); 1940 } 1941 1942 /* 1943 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1944 */ 1945 static 1946 int 1947 hammer_vop_nrename(struct vop_nrename_args *ap) 1948 { 1949 struct hammer_transaction trans; 1950 struct namecache *fncp; 1951 struct namecache *tncp; 1952 struct hammer_inode *fdip; 1953 struct hammer_inode *tdip; 1954 struct hammer_inode *ip; 1955 hammer_mount_t hmp; 1956 struct hammer_cursor cursor; 1957 int64_t namekey; 1958 u_int32_t max_iterations; 1959 int nlen, error; 1960 1961 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1962 return(EXDEV); 1963 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1964 return(EXDEV); 1965 1966 fdip = VTOI(ap->a_fdvp); 1967 tdip = VTOI(ap->a_tdvp); 1968 fncp = ap->a_fnch->ncp; 1969 tncp = ap->a_tnch->ncp; 1970 ip = VTOI(fncp->nc_vp); 1971 KKASSERT(ip != NULL); 1972 1973 hmp = ip->hmp; 1974 1975 if (fdip->obj_localization != tdip->obj_localization) 1976 return(EXDEV); 1977 if (fdip->obj_localization != ip->obj_localization) 1978 return(EXDEV); 1979 1980 if (fdip->flags & HAMMER_INODE_RO) 1981 return (EROFS); 1982 if (tdip->flags & HAMMER_INODE_RO) 1983 return (EROFS); 1984 if (ip->flags & HAMMER_INODE_RO) 1985 return (EROFS); 1986 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1987 return (error); 1988 1989 lwkt_gettoken(&hmp->fs_token); 1990 hammer_start_transaction(&trans, hmp); 1991 ++hammer_stats_file_iopsw; 1992 1993 /* 1994 * Remove tncp from the target directory and then link ip as 1995 * tncp. XXX pass trans to dounlink 1996 * 1997 * Force the inode sync-time to match the transaction so it is 1998 * in-sync with the creation of the target directory entry. 1999 */ 2000 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 2001 ap->a_cred, 0, -1); 2002 if (error == 0 || error == ENOENT) { 2003 error = hammer_ip_add_directory(&trans, tdip, 2004 tncp->nc_name, tncp->nc_nlen, 2005 ip); 2006 if (error == 0) { 2007 ip->ino_data.parent_obj_id = tdip->obj_id; 2008 ip->ino_data.ctime = trans.time; 2009 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 2010 } 2011 } 2012 if (error) 2013 goto failed; /* XXX */ 2014 2015 /* 2016 * Locate the record in the originating directory and remove it. 2017 * 2018 * Calculate the namekey and setup the key range for the scan. This 2019 * works kinda like a chained hash table where the lower 32 bits 2020 * of the namekey synthesize the chain. 2021 * 2022 * The key range is inclusive of both key_beg and key_end. 2023 */ 2024 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 2025 &max_iterations); 2026 retry: 2027 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 2028 cursor.key_beg.localization = fdip->obj_localization + 2029 hammer_dir_localization(fdip); 2030 cursor.key_beg.obj_id = fdip->obj_id; 2031 cursor.key_beg.key = namekey; 2032 cursor.key_beg.create_tid = 0; 2033 cursor.key_beg.delete_tid = 0; 2034 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2035 cursor.key_beg.obj_type = 0; 2036 2037 cursor.key_end = cursor.key_beg; 2038 cursor.key_end.key += max_iterations; 2039 cursor.asof = fdip->obj_asof; 2040 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2041 2042 /* 2043 * Scan all matching records (the chain), locate the one matching 2044 * the requested path component. 2045 * 2046 * The hammer_ip_*() functions merge in-memory records with on-disk 2047 * records for the purposes of the search. 2048 */ 2049 error = hammer_ip_first(&cursor); 2050 while (error == 0) { 2051 if (hammer_ip_resolve_data(&cursor) != 0) 2052 break; 2053 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2054 KKASSERT(nlen > 0); 2055 if (fncp->nc_nlen == nlen && 2056 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2057 break; 2058 } 2059 error = hammer_ip_next(&cursor); 2060 } 2061 2062 /* 2063 * If all is ok we have to get the inode so we can adjust nlinks. 2064 * 2065 * WARNING: hammer_ip_del_directory() may have to terminate the 2066 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2067 * twice. 2068 */ 2069 if (error == 0) 2070 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 2071 2072 /* 2073 * XXX A deadlock here will break rename's atomicy for the purposes 2074 * of crash recovery. 2075 */ 2076 if (error == EDEADLK) { 2077 hammer_done_cursor(&cursor); 2078 goto retry; 2079 } 2080 2081 /* 2082 * Cleanup and tell the kernel that the rename succeeded. 2083 * 2084 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2085 * without formally acquiring the vp since the vp might 2086 * have zero refs on it, or in the middle of a reclaim, 2087 * etc. 2088 */ 2089 hammer_done_cursor(&cursor); 2090 if (error == 0) { 2091 cache_rename(ap->a_fnch, ap->a_tnch); 2092 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2093 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2094 while (ip->vp) { 2095 struct vnode *vp; 2096 2097 error = hammer_get_vnode(ip, &vp); 2098 if (error == 0 && vp) { 2099 vn_unlock(vp); 2100 hammer_knote(ip->vp, NOTE_RENAME); 2101 vrele(vp); 2102 break; 2103 } 2104 kprintf("Debug: HAMMER ip/vp race2 avoided\n"); 2105 } 2106 } 2107 2108 failed: 2109 hammer_done_transaction(&trans); 2110 lwkt_reltoken(&hmp->fs_token); 2111 return (error); 2112 } 2113 2114 /* 2115 * hammer_vop_nrmdir { nch, dvp, cred } 2116 */ 2117 static 2118 int 2119 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2120 { 2121 struct hammer_transaction trans; 2122 struct hammer_inode *dip; 2123 hammer_mount_t hmp; 2124 int error; 2125 2126 dip = VTOI(ap->a_dvp); 2127 hmp = dip->hmp; 2128 2129 if (hammer_nohistory(dip) == 0 && 2130 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2131 return (error); 2132 } 2133 2134 lwkt_gettoken(&hmp->fs_token); 2135 hammer_start_transaction(&trans, hmp); 2136 ++hammer_stats_file_iopsw; 2137 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2138 hammer_done_transaction(&trans); 2139 if (error == 0) 2140 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2141 lwkt_reltoken(&hmp->fs_token); 2142 return (error); 2143 } 2144 2145 /* 2146 * hammer_vop_markatime { vp, cred } 2147 */ 2148 static 2149 int 2150 hammer_vop_markatime(struct vop_markatime_args *ap) 2151 { 2152 struct hammer_transaction trans; 2153 struct hammer_inode *ip; 2154 hammer_mount_t hmp; 2155 2156 ip = VTOI(ap->a_vp); 2157 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2158 return (EROFS); 2159 if (ip->flags & HAMMER_INODE_RO) 2160 return (EROFS); 2161 hmp = ip->hmp; 2162 if (hmp->mp->mnt_flag & MNT_NOATIME) 2163 return (0); 2164 lwkt_gettoken(&hmp->fs_token); 2165 hammer_start_transaction(&trans, hmp); 2166 ++hammer_stats_file_iopsw; 2167 2168 ip->ino_data.atime = trans.time; 2169 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2170 hammer_done_transaction(&trans); 2171 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2172 lwkt_reltoken(&hmp->fs_token); 2173 return (0); 2174 } 2175 2176 /* 2177 * hammer_vop_setattr { vp, vap, cred } 2178 */ 2179 static 2180 int 2181 hammer_vop_setattr(struct vop_setattr_args *ap) 2182 { 2183 struct hammer_transaction trans; 2184 struct hammer_inode *ip; 2185 struct vattr *vap; 2186 hammer_mount_t hmp; 2187 int modflags; 2188 int error; 2189 int truncating; 2190 int blksize; 2191 int kflags; 2192 #if 0 2193 int64_t aligned_size; 2194 #endif 2195 u_int32_t flags; 2196 2197 vap = ap->a_vap; 2198 ip = ap->a_vp->v_data; 2199 modflags = 0; 2200 kflags = 0; 2201 hmp = ip->hmp; 2202 2203 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2204 return(EROFS); 2205 if (ip->flags & HAMMER_INODE_RO) 2206 return (EROFS); 2207 if (hammer_nohistory(ip) == 0 && 2208 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2209 return (error); 2210 } 2211 2212 lwkt_gettoken(&hmp->fs_token); 2213 hammer_start_transaction(&trans, hmp); 2214 ++hammer_stats_file_iopsw; 2215 error = 0; 2216 2217 if (vap->va_flags != VNOVAL) { 2218 flags = ip->ino_data.uflags; 2219 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2220 hammer_to_unix_xid(&ip->ino_data.uid), 2221 ap->a_cred); 2222 if (error == 0) { 2223 if (ip->ino_data.uflags != flags) { 2224 ip->ino_data.uflags = flags; 2225 ip->ino_data.ctime = trans.time; 2226 modflags |= HAMMER_INODE_DDIRTY; 2227 kflags |= NOTE_ATTRIB; 2228 } 2229 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2230 error = 0; 2231 goto done; 2232 } 2233 } 2234 goto done; 2235 } 2236 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2237 error = EPERM; 2238 goto done; 2239 } 2240 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2241 mode_t cur_mode = ip->ino_data.mode; 2242 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2243 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2244 uuid_t uuid_uid; 2245 uuid_t uuid_gid; 2246 2247 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2248 ap->a_cred, 2249 &cur_uid, &cur_gid, &cur_mode); 2250 if (error == 0) { 2251 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2252 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2253 if (bcmp(&uuid_uid, &ip->ino_data.uid, 2254 sizeof(uuid_uid)) || 2255 bcmp(&uuid_gid, &ip->ino_data.gid, 2256 sizeof(uuid_gid)) || 2257 ip->ino_data.mode != cur_mode) { 2258 ip->ino_data.uid = uuid_uid; 2259 ip->ino_data.gid = uuid_gid; 2260 ip->ino_data.mode = cur_mode; 2261 ip->ino_data.ctime = trans.time; 2262 modflags |= HAMMER_INODE_DDIRTY; 2263 } 2264 kflags |= NOTE_ATTRIB; 2265 } 2266 } 2267 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2268 switch(ap->a_vp->v_type) { 2269 case VREG: 2270 if (vap->va_size == ip->ino_data.size) 2271 break; 2272 2273 /* 2274 * Log the operation if in fast-fsync mode or if 2275 * there are unterminated redo write records present. 2276 * 2277 * The second check is needed so the recovery code 2278 * properly truncates write redos even if nominal 2279 * REDO operations is turned off due to excessive 2280 * writes, because the related records might be 2281 * destroyed and never lay down a TERM_WRITE. 2282 */ 2283 if ((ip->flags & HAMMER_INODE_REDO) || 2284 (ip->flags & HAMMER_INODE_RDIRTY)) { 2285 error = hammer_generate_redo(&trans, ip, 2286 vap->va_size, 2287 HAMMER_REDO_TRUNC, 2288 NULL, 0); 2289 } 2290 blksize = hammer_blocksize(vap->va_size); 2291 2292 /* 2293 * XXX break atomicy, we can deadlock the backend 2294 * if we do not release the lock. Probably not a 2295 * big deal here. 2296 */ 2297 if (vap->va_size < ip->ino_data.size) { 2298 nvtruncbuf(ap->a_vp, vap->va_size, 2299 blksize, 2300 hammer_blockoff(vap->va_size), 2301 0); 2302 truncating = 1; 2303 kflags |= NOTE_WRITE; 2304 } else { 2305 nvextendbuf(ap->a_vp, 2306 ip->ino_data.size, 2307 vap->va_size, 2308 hammer_blocksize(ip->ino_data.size), 2309 hammer_blocksize(vap->va_size), 2310 hammer_blockoff(ip->ino_data.size), 2311 hammer_blockoff(vap->va_size), 2312 0); 2313 truncating = 0; 2314 kflags |= NOTE_WRITE | NOTE_EXTEND; 2315 } 2316 ip->ino_data.size = vap->va_size; 2317 ip->ino_data.mtime = trans.time; 2318 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2319 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2320 2321 /* 2322 * On-media truncation is cached in the inode until 2323 * the inode is synchronized. We must immediately 2324 * handle any frontend records. 2325 */ 2326 if (truncating) { 2327 hammer_ip_frontend_trunc(ip, vap->va_size); 2328 #ifdef DEBUG_TRUNCATE 2329 if (HammerTruncIp == NULL) 2330 HammerTruncIp = ip; 2331 #endif 2332 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2333 ip->flags |= HAMMER_INODE_TRUNCATED; 2334 ip->trunc_off = vap->va_size; 2335 hammer_inode_dirty(ip); 2336 #ifdef DEBUG_TRUNCATE 2337 if (ip == HammerTruncIp) 2338 kprintf("truncate1 %016llx\n", 2339 (long long)ip->trunc_off); 2340 #endif 2341 } else if (ip->trunc_off > vap->va_size) { 2342 ip->trunc_off = vap->va_size; 2343 #ifdef DEBUG_TRUNCATE 2344 if (ip == HammerTruncIp) 2345 kprintf("truncate2 %016llx\n", 2346 (long long)ip->trunc_off); 2347 #endif 2348 } else { 2349 #ifdef DEBUG_TRUNCATE 2350 if (ip == HammerTruncIp) 2351 kprintf("truncate3 %016llx (ignored)\n", 2352 (long long)vap->va_size); 2353 #endif 2354 } 2355 } 2356 2357 #if 0 2358 /* 2359 * When truncating, nvtruncbuf() may have cleaned out 2360 * a portion of the last block on-disk in the buffer 2361 * cache. We must clean out any frontend records 2362 * for blocks beyond the new last block. 2363 */ 2364 aligned_size = (vap->va_size + (blksize - 1)) & 2365 ~(int64_t)(blksize - 1); 2366 if (truncating && vap->va_size < aligned_size) { 2367 aligned_size -= blksize; 2368 hammer_ip_frontend_trunc(ip, aligned_size); 2369 } 2370 #endif 2371 break; 2372 case VDATABASE: 2373 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2374 ip->flags |= HAMMER_INODE_TRUNCATED; 2375 ip->trunc_off = vap->va_size; 2376 hammer_inode_dirty(ip); 2377 } else if (ip->trunc_off > vap->va_size) { 2378 ip->trunc_off = vap->va_size; 2379 } 2380 hammer_ip_frontend_trunc(ip, vap->va_size); 2381 ip->ino_data.size = vap->va_size; 2382 ip->ino_data.mtime = trans.time; 2383 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2384 kflags |= NOTE_ATTRIB; 2385 break; 2386 default: 2387 error = EINVAL; 2388 goto done; 2389 } 2390 break; 2391 } 2392 if (vap->va_atime.tv_sec != VNOVAL) { 2393 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2394 modflags |= HAMMER_INODE_ATIME; 2395 kflags |= NOTE_ATTRIB; 2396 } 2397 if (vap->va_mtime.tv_sec != VNOVAL) { 2398 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2399 modflags |= HAMMER_INODE_MTIME; 2400 kflags |= NOTE_ATTRIB; 2401 } 2402 if (vap->va_mode != (mode_t)VNOVAL) { 2403 mode_t cur_mode = ip->ino_data.mode; 2404 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2405 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2406 2407 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2408 cur_uid, cur_gid, &cur_mode); 2409 if (error == 0 && ip->ino_data.mode != cur_mode) { 2410 ip->ino_data.mode = cur_mode; 2411 ip->ino_data.ctime = trans.time; 2412 modflags |= HAMMER_INODE_DDIRTY; 2413 kflags |= NOTE_ATTRIB; 2414 } 2415 } 2416 done: 2417 if (error == 0) 2418 hammer_modify_inode(&trans, ip, modflags); 2419 hammer_done_transaction(&trans); 2420 hammer_knote(ap->a_vp, kflags); 2421 lwkt_reltoken(&hmp->fs_token); 2422 return (error); 2423 } 2424 2425 /* 2426 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 2427 */ 2428 static 2429 int 2430 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2431 { 2432 struct hammer_transaction trans; 2433 struct hammer_inode *dip; 2434 struct hammer_inode *nip; 2435 hammer_record_t record; 2436 struct nchandle *nch; 2437 hammer_mount_t hmp; 2438 int error; 2439 int bytes; 2440 2441 ap->a_vap->va_type = VLNK; 2442 2443 nch = ap->a_nch; 2444 dip = VTOI(ap->a_dvp); 2445 hmp = dip->hmp; 2446 2447 if (dip->flags & HAMMER_INODE_RO) 2448 return (EROFS); 2449 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2450 return (error); 2451 2452 /* 2453 * Create a transaction to cover the operations we perform. 2454 */ 2455 lwkt_gettoken(&hmp->fs_token); 2456 hammer_start_transaction(&trans, hmp); 2457 ++hammer_stats_file_iopsw; 2458 2459 /* 2460 * Create a new filesystem object of the requested type. The 2461 * returned inode will be referenced but not locked. 2462 */ 2463 2464 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2465 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2466 NULL, &nip); 2467 if (error) { 2468 hammer_done_transaction(&trans); 2469 *ap->a_vpp = NULL; 2470 lwkt_reltoken(&hmp->fs_token); 2471 return (error); 2472 } 2473 2474 /* 2475 * Add a record representing the symlink. symlink stores the link 2476 * as pure data, not a string, and is no \0 terminated. 2477 */ 2478 if (error == 0) { 2479 bytes = strlen(ap->a_target); 2480 2481 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2482 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2483 } else { 2484 record = hammer_alloc_mem_record(nip, bytes); 2485 record->type = HAMMER_MEM_RECORD_GENERAL; 2486 2487 record->leaf.base.localization = nip->obj_localization + 2488 HAMMER_LOCALIZE_MISC; 2489 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2490 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2491 record->leaf.data_len = bytes; 2492 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2493 bcopy(ap->a_target, record->data->symlink.name, bytes); 2494 error = hammer_ip_add_record(&trans, record); 2495 } 2496 2497 /* 2498 * Set the file size to the length of the link. 2499 */ 2500 if (error == 0) { 2501 nip->ino_data.size = bytes; 2502 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2503 } 2504 } 2505 if (error == 0) 2506 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2507 nch->ncp->nc_nlen, nip); 2508 2509 /* 2510 * Finish up. 2511 */ 2512 if (error) { 2513 hammer_rel_inode(nip, 0); 2514 *ap->a_vpp = NULL; 2515 } else { 2516 error = hammer_get_vnode(nip, ap->a_vpp); 2517 hammer_rel_inode(nip, 0); 2518 if (error == 0) { 2519 cache_setunresolved(ap->a_nch); 2520 cache_setvp(ap->a_nch, *ap->a_vpp); 2521 hammer_knote(ap->a_dvp, NOTE_WRITE); 2522 } 2523 } 2524 hammer_done_transaction(&trans); 2525 lwkt_reltoken(&hmp->fs_token); 2526 return (error); 2527 } 2528 2529 /* 2530 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2531 */ 2532 static 2533 int 2534 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2535 { 2536 struct hammer_transaction trans; 2537 struct hammer_inode *dip; 2538 hammer_mount_t hmp; 2539 int error; 2540 2541 dip = VTOI(ap->a_dvp); 2542 hmp = dip->hmp; 2543 2544 if (hammer_nohistory(dip) == 0 && 2545 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2546 return (error); 2547 } 2548 2549 lwkt_gettoken(&hmp->fs_token); 2550 hammer_start_transaction(&trans, hmp); 2551 ++hammer_stats_file_iopsw; 2552 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2553 ap->a_cred, ap->a_flags, -1); 2554 hammer_done_transaction(&trans); 2555 lwkt_reltoken(&hmp->fs_token); 2556 2557 return (error); 2558 } 2559 2560 /* 2561 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2562 */ 2563 static 2564 int 2565 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2566 { 2567 struct hammer_inode *ip = ap->a_vp->v_data; 2568 hammer_mount_t hmp = ip->hmp; 2569 int error; 2570 2571 ++hammer_stats_file_iopsr; 2572 lwkt_gettoken(&hmp->fs_token); 2573 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2574 ap->a_fflag, ap->a_cred); 2575 lwkt_reltoken(&hmp->fs_token); 2576 return (error); 2577 } 2578 2579 static 2580 int 2581 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2582 { 2583 static const struct mountctl_opt extraopt[] = { 2584 { HMNT_NOHISTORY, "nohistory" }, 2585 { HMNT_MASTERID, "master" }, 2586 { 0, NULL} 2587 2588 }; 2589 struct hammer_mount *hmp; 2590 struct mount *mp; 2591 int usedbytes; 2592 int error; 2593 2594 error = 0; 2595 usedbytes = 0; 2596 mp = ap->a_head.a_ops->head.vv_mount; 2597 KKASSERT(mp->mnt_data != NULL); 2598 hmp = (struct hammer_mount *)mp->mnt_data; 2599 2600 lwkt_gettoken(&hmp->fs_token); 2601 2602 switch(ap->a_op) { 2603 case MOUNTCTL_SET_EXPORT: 2604 if (ap->a_ctllen != sizeof(struct export_args)) 2605 error = EINVAL; 2606 else 2607 error = hammer_vfs_export(mp, ap->a_op, 2608 (const struct export_args *)ap->a_ctl); 2609 break; 2610 case MOUNTCTL_MOUNTFLAGS: 2611 { 2612 /* 2613 * Call standard mountctl VOP function 2614 * so we get user mount flags. 2615 */ 2616 error = vop_stdmountctl(ap); 2617 if (error) 2618 break; 2619 2620 usedbytes = *ap->a_res; 2621 2622 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2623 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2624 ap->a_buf, 2625 ap->a_buflen - usedbytes, 2626 &error); 2627 } 2628 2629 *ap->a_res += usedbytes; 2630 break; 2631 } 2632 default: 2633 error = vop_stdmountctl(ap); 2634 break; 2635 } 2636 lwkt_reltoken(&hmp->fs_token); 2637 return(error); 2638 } 2639 2640 /* 2641 * hammer_vop_strategy { vp, bio } 2642 * 2643 * Strategy call, used for regular file read & write only. Note that the 2644 * bp may represent a cluster. 2645 * 2646 * To simplify operation and allow better optimizations in the future, 2647 * this code does not make any assumptions with regards to buffer alignment 2648 * or size. 2649 */ 2650 static 2651 int 2652 hammer_vop_strategy(struct vop_strategy_args *ap) 2653 { 2654 struct buf *bp; 2655 int error; 2656 2657 bp = ap->a_bio->bio_buf; 2658 2659 switch(bp->b_cmd) { 2660 case BUF_CMD_READ: 2661 error = hammer_vop_strategy_read(ap); 2662 break; 2663 case BUF_CMD_WRITE: 2664 error = hammer_vop_strategy_write(ap); 2665 break; 2666 default: 2667 bp->b_error = error = EINVAL; 2668 bp->b_flags |= B_ERROR; 2669 biodone(ap->a_bio); 2670 break; 2671 } 2672 2673 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2674 2675 return (error); 2676 } 2677 2678 /* 2679 * Read from a regular file. Iterate the related records and fill in the 2680 * BIO/BUF. Gaps are zero-filled. 2681 * 2682 * The support code in hammer_object.c should be used to deal with mixed 2683 * in-memory and on-disk records. 2684 * 2685 * NOTE: Can be called from the cluster code with an oversized buf. 2686 * 2687 * XXX atime update 2688 */ 2689 static 2690 int 2691 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2692 { 2693 struct hammer_transaction trans; 2694 struct hammer_inode *ip; 2695 struct hammer_inode *dip; 2696 hammer_mount_t hmp; 2697 struct hammer_cursor cursor; 2698 hammer_base_elm_t base; 2699 hammer_off_t disk_offset; 2700 struct bio *bio; 2701 struct bio *nbio; 2702 struct buf *bp; 2703 int64_t rec_offset; 2704 int64_t ran_end; 2705 int64_t tmp64; 2706 int error; 2707 int boff; 2708 int roff; 2709 int n; 2710 int isdedupable; 2711 2712 bio = ap->a_bio; 2713 bp = bio->bio_buf; 2714 ip = ap->a_vp->v_data; 2715 hmp = ip->hmp; 2716 2717 /* 2718 * The zone-2 disk offset may have been set by the cluster code via 2719 * a BMAP operation, or else should be NOOFFSET. 2720 * 2721 * Checking the high bits for a match against zone-2 should suffice. 2722 * 2723 * In cases where a lot of data duplication is present it may be 2724 * more beneficial to drop through and doubule-buffer through the 2725 * device. 2726 */ 2727 nbio = push_bio(bio); 2728 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2729 HAMMER_ZONE_LARGE_DATA) { 2730 if (hammer_double_buffer == 0) { 2731 lwkt_gettoken(&hmp->fs_token); 2732 error = hammer_io_direct_read(hmp, nbio, NULL); 2733 lwkt_reltoken(&hmp->fs_token); 2734 return (error); 2735 } 2736 2737 /* 2738 * Try to shortcut requests for double_buffer mode too. 2739 * Since this mode runs through the device buffer cache 2740 * only compatible buffer sizes (meaning those generated 2741 * by normal filesystem buffers) are legal. 2742 */ 2743 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) { 2744 lwkt_gettoken(&hmp->fs_token); 2745 error = hammer_io_indirect_read(hmp, nbio, NULL); 2746 lwkt_reltoken(&hmp->fs_token); 2747 return (error); 2748 } 2749 } 2750 2751 /* 2752 * Well, that sucked. Do it the hard way. If all the stars are 2753 * aligned we may still be able to issue a direct-read. 2754 */ 2755 lwkt_gettoken(&hmp->fs_token); 2756 hammer_simple_transaction(&trans, hmp); 2757 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2758 2759 /* 2760 * Key range (begin and end inclusive) to scan. Note that the key's 2761 * stored in the actual records represent BASE+LEN, not BASE. The 2762 * first record containing bio_offset will have a key > bio_offset. 2763 */ 2764 cursor.key_beg.localization = ip->obj_localization + 2765 HAMMER_LOCALIZE_MISC; 2766 cursor.key_beg.obj_id = ip->obj_id; 2767 cursor.key_beg.create_tid = 0; 2768 cursor.key_beg.delete_tid = 0; 2769 cursor.key_beg.obj_type = 0; 2770 cursor.key_beg.key = bio->bio_offset + 1; 2771 cursor.asof = ip->obj_asof; 2772 cursor.flags |= HAMMER_CURSOR_ASOF; 2773 2774 cursor.key_end = cursor.key_beg; 2775 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2776 #if 0 2777 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2778 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2779 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2780 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2781 } else 2782 #endif 2783 { 2784 ran_end = bio->bio_offset + bp->b_bufsize; 2785 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2786 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2787 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2788 if (tmp64 < ran_end) 2789 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2790 else 2791 cursor.key_end.key = ran_end + MAXPHYS + 1; 2792 } 2793 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2794 2795 /* 2796 * Set NOSWAPCACHE for cursor data extraction if double buffering 2797 * is disabled or (if the file is not marked cacheable via chflags 2798 * and vm.swapcache_use_chflags is enabled). 2799 */ 2800 if (hammer_double_buffer == 0 || 2801 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 && 2802 vm_swapcache_use_chflags)) { 2803 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE; 2804 } 2805 2806 error = hammer_ip_first(&cursor); 2807 boff = 0; 2808 2809 while (error == 0) { 2810 /* 2811 * Get the base file offset of the record. The key for 2812 * data records is (base + bytes) rather then (base). 2813 */ 2814 base = &cursor.leaf->base; 2815 rec_offset = base->key - cursor.leaf->data_len; 2816 2817 /* 2818 * Calculate the gap, if any, and zero-fill it. 2819 * 2820 * n is the offset of the start of the record verses our 2821 * current seek offset in the bio. 2822 */ 2823 n = (int)(rec_offset - (bio->bio_offset + boff)); 2824 if (n > 0) { 2825 if (n > bp->b_bufsize - boff) 2826 n = bp->b_bufsize - boff; 2827 bzero((char *)bp->b_data + boff, n); 2828 boff += n; 2829 n = 0; 2830 } 2831 2832 /* 2833 * Calculate the data offset in the record and the number 2834 * of bytes we can copy. 2835 * 2836 * There are two degenerate cases. First, boff may already 2837 * be at bp->b_bufsize. Secondly, the data offset within 2838 * the record may exceed the record's size. 2839 */ 2840 roff = -n; 2841 rec_offset += roff; 2842 n = cursor.leaf->data_len - roff; 2843 if (n <= 0) { 2844 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff); 2845 n = 0; 2846 } else if (n > bp->b_bufsize - boff) { 2847 n = bp->b_bufsize - boff; 2848 } 2849 2850 /* 2851 * Deal with cached truncations. This cool bit of code 2852 * allows truncate()/ftruncate() to avoid having to sync 2853 * the file. 2854 * 2855 * If the frontend is truncated then all backend records are 2856 * subject to the frontend's truncation. 2857 * 2858 * If the backend is truncated then backend records on-disk 2859 * (but not in-memory) are subject to the backend's 2860 * truncation. In-memory records owned by the backend 2861 * represent data written after the truncation point on the 2862 * backend and must not be truncated. 2863 * 2864 * Truncate operations deal with frontend buffer cache 2865 * buffers and frontend-owned in-memory records synchronously. 2866 */ 2867 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2868 if (hammer_cursor_ondisk(&cursor)/* || 2869 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2870 if (ip->trunc_off <= rec_offset) 2871 n = 0; 2872 else if (ip->trunc_off < rec_offset + n) 2873 n = (int)(ip->trunc_off - rec_offset); 2874 } 2875 } 2876 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2877 if (hammer_cursor_ondisk(&cursor)) { 2878 if (ip->sync_trunc_off <= rec_offset) 2879 n = 0; 2880 else if (ip->sync_trunc_off < rec_offset + n) 2881 n = (int)(ip->sync_trunc_off - rec_offset); 2882 } 2883 } 2884 2885 /* 2886 * Try to issue a direct read into our bio if possible, 2887 * otherwise resolve the element data into a hammer_buffer 2888 * and copy. 2889 * 2890 * The buffer on-disk should be zerod past any real 2891 * truncation point, but may not be for any synthesized 2892 * truncation point from above. 2893 * 2894 * NOTE: disk_offset is only valid if the cursor data is 2895 * on-disk. 2896 */ 2897 disk_offset = cursor.leaf->data_offset + roff; 2898 isdedupable = (boff == 0 && n == bp->b_bufsize && 2899 hammer_cursor_ondisk(&cursor) && 2900 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2901 2902 if (isdedupable && hammer_double_buffer == 0) { 2903 /* 2904 * Direct read case 2905 */ 2906 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2907 HAMMER_ZONE_LARGE_DATA); 2908 nbio->bio_offset = disk_offset; 2909 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2910 if (hammer_live_dedup && error == 0) 2911 hammer_dedup_cache_add(ip, cursor.leaf); 2912 goto done; 2913 } else if (isdedupable) { 2914 /* 2915 * Async I/O case for reading from backing store 2916 * and copying the data to the filesystem buffer. 2917 * live-dedup has to verify the data anyway if it 2918 * gets a hit later so we can just add the entry 2919 * now. 2920 */ 2921 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2922 HAMMER_ZONE_LARGE_DATA); 2923 nbio->bio_offset = disk_offset; 2924 if (hammer_live_dedup) 2925 hammer_dedup_cache_add(ip, cursor.leaf); 2926 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf); 2927 goto done; 2928 } else if (n) { 2929 error = hammer_ip_resolve_data(&cursor); 2930 if (error == 0) { 2931 if (hammer_live_dedup && isdedupable) 2932 hammer_dedup_cache_add(ip, cursor.leaf); 2933 bcopy((char *)cursor.data + roff, 2934 (char *)bp->b_data + boff, n); 2935 } 2936 } 2937 if (error) 2938 break; 2939 2940 /* 2941 * We have to be sure that the only elements added to the 2942 * dedup cache are those which are already on-media. 2943 */ 2944 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) 2945 hammer_dedup_cache_add(ip, cursor.leaf); 2946 2947 /* 2948 * Iterate until we have filled the request. 2949 */ 2950 boff += n; 2951 if (boff == bp->b_bufsize) 2952 break; 2953 error = hammer_ip_next(&cursor); 2954 } 2955 2956 /* 2957 * There may have been a gap after the last record 2958 */ 2959 if (error == ENOENT) 2960 error = 0; 2961 if (error == 0 && boff != bp->b_bufsize) { 2962 KKASSERT(boff < bp->b_bufsize); 2963 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2964 /* boff = bp->b_bufsize; */ 2965 } 2966 2967 /* 2968 * Disallow swapcache operation on the vnode buffer if double 2969 * buffering is enabled, the swapcache will get the data via 2970 * the block device buffer. 2971 */ 2972 if (hammer_double_buffer) 2973 bp->b_flags |= B_NOTMETA; 2974 2975 /* 2976 * Cleanup 2977 */ 2978 bp->b_resid = 0; 2979 bp->b_error = error; 2980 if (error) 2981 bp->b_flags |= B_ERROR; 2982 biodone(ap->a_bio); 2983 2984 done: 2985 /* 2986 * Cache the b-tree node for the last data read in cache[1]. 2987 * 2988 * If we hit the file EOF then also cache the node in the 2989 * governing director's cache[3], it will be used to initialize 2990 * the inode's cache[1] for any inodes looked up via the directory. 2991 * 2992 * This doesn't reduce disk accesses since the B-Tree chain is 2993 * likely cached, but it does reduce cpu overhead when looking 2994 * up file offsets for cpdup/tar/cpio style iterations. 2995 */ 2996 if (cursor.node) 2997 hammer_cache_node(&ip->cache[1], cursor.node); 2998 if (ran_end >= ip->ino_data.size) { 2999 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 3000 ip->obj_asof, ip->obj_localization); 3001 if (dip) { 3002 hammer_cache_node(&dip->cache[3], cursor.node); 3003 hammer_rel_inode(dip, 0); 3004 } 3005 } 3006 hammer_done_cursor(&cursor); 3007 hammer_done_transaction(&trans); 3008 lwkt_reltoken(&hmp->fs_token); 3009 return(error); 3010 } 3011 3012 /* 3013 * BMAP operation - used to support cluster_read() only. 3014 * 3015 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 3016 * 3017 * This routine may return EOPNOTSUPP if the opration is not supported for 3018 * the specified offset. The contents of the pointer arguments do not 3019 * need to be initialized in that case. 3020 * 3021 * If a disk address is available and properly aligned return 0 with 3022 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 3023 * to the run-length relative to that offset. Callers may assume that 3024 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 3025 * large, so return EOPNOTSUPP if it is not sufficiently large. 3026 */ 3027 static 3028 int 3029 hammer_vop_bmap(struct vop_bmap_args *ap) 3030 { 3031 struct hammer_transaction trans; 3032 struct hammer_inode *ip; 3033 hammer_mount_t hmp; 3034 struct hammer_cursor cursor; 3035 hammer_base_elm_t base; 3036 int64_t rec_offset; 3037 int64_t ran_end; 3038 int64_t tmp64; 3039 int64_t base_offset; 3040 int64_t base_disk_offset; 3041 int64_t last_offset; 3042 hammer_off_t last_disk_offset; 3043 hammer_off_t disk_offset; 3044 int rec_len; 3045 int error; 3046 int blksize; 3047 3048 ++hammer_stats_file_iopsr; 3049 ip = ap->a_vp->v_data; 3050 hmp = ip->hmp; 3051 3052 /* 3053 * We can only BMAP regular files. We can't BMAP database files, 3054 * directories, etc. 3055 */ 3056 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 3057 return(EOPNOTSUPP); 3058 3059 /* 3060 * bmap is typically called with runp/runb both NULL when used 3061 * for writing. We do not support BMAP for writing atm. 3062 */ 3063 if (ap->a_cmd != BUF_CMD_READ) 3064 return(EOPNOTSUPP); 3065 3066 /* 3067 * Scan the B-Tree to acquire blockmap addresses, then translate 3068 * to raw addresses. 3069 */ 3070 lwkt_gettoken(&hmp->fs_token); 3071 hammer_simple_transaction(&trans, hmp); 3072 #if 0 3073 kprintf("bmap_beg %016llx ip->cache %p\n", 3074 (long long)ap->a_loffset, ip->cache[1]); 3075 #endif 3076 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 3077 3078 /* 3079 * Key range (begin and end inclusive) to scan. Note that the key's 3080 * stored in the actual records represent BASE+LEN, not BASE. The 3081 * first record containing bio_offset will have a key > bio_offset. 3082 */ 3083 cursor.key_beg.localization = ip->obj_localization + 3084 HAMMER_LOCALIZE_MISC; 3085 cursor.key_beg.obj_id = ip->obj_id; 3086 cursor.key_beg.create_tid = 0; 3087 cursor.key_beg.delete_tid = 0; 3088 cursor.key_beg.obj_type = 0; 3089 if (ap->a_runb) 3090 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 3091 else 3092 cursor.key_beg.key = ap->a_loffset + 1; 3093 if (cursor.key_beg.key < 0) 3094 cursor.key_beg.key = 0; 3095 cursor.asof = ip->obj_asof; 3096 cursor.flags |= HAMMER_CURSOR_ASOF; 3097 3098 cursor.key_end = cursor.key_beg; 3099 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 3100 3101 ran_end = ap->a_loffset + MAXPHYS; 3102 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 3103 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 3104 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 3105 if (tmp64 < ran_end) 3106 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 3107 else 3108 cursor.key_end.key = ran_end + MAXPHYS + 1; 3109 3110 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 3111 3112 error = hammer_ip_first(&cursor); 3113 base_offset = last_offset = 0; 3114 base_disk_offset = last_disk_offset = 0; 3115 3116 while (error == 0) { 3117 /* 3118 * Get the base file offset of the record. The key for 3119 * data records is (base + bytes) rather then (base). 3120 * 3121 * NOTE: rec_offset + rec_len may exceed the end-of-file. 3122 * The extra bytes should be zero on-disk and the BMAP op 3123 * should still be ok. 3124 */ 3125 base = &cursor.leaf->base; 3126 rec_offset = base->key - cursor.leaf->data_len; 3127 rec_len = cursor.leaf->data_len; 3128 3129 /* 3130 * Incorporate any cached truncation. 3131 * 3132 * NOTE: Modifications to rec_len based on synthesized 3133 * truncation points remove the guarantee that any extended 3134 * data on disk is zero (since the truncations may not have 3135 * taken place on-media yet). 3136 */ 3137 if (ip->flags & HAMMER_INODE_TRUNCATED) { 3138 if (hammer_cursor_ondisk(&cursor) || 3139 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 3140 if (ip->trunc_off <= rec_offset) 3141 rec_len = 0; 3142 else if (ip->trunc_off < rec_offset + rec_len) 3143 rec_len = (int)(ip->trunc_off - rec_offset); 3144 } 3145 } 3146 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 3147 if (hammer_cursor_ondisk(&cursor)) { 3148 if (ip->sync_trunc_off <= rec_offset) 3149 rec_len = 0; 3150 else if (ip->sync_trunc_off < rec_offset + rec_len) 3151 rec_len = (int)(ip->sync_trunc_off - rec_offset); 3152 } 3153 } 3154 3155 /* 3156 * Accumulate information. If we have hit a discontiguous 3157 * block reset base_offset unless we are already beyond the 3158 * requested offset. If we are, that's it, we stop. 3159 */ 3160 if (error) 3161 break; 3162 if (hammer_cursor_ondisk(&cursor)) { 3163 disk_offset = cursor.leaf->data_offset; 3164 if (rec_offset != last_offset || 3165 disk_offset != last_disk_offset) { 3166 if (rec_offset > ap->a_loffset) 3167 break; 3168 base_offset = rec_offset; 3169 base_disk_offset = disk_offset; 3170 } 3171 last_offset = rec_offset + rec_len; 3172 last_disk_offset = disk_offset + rec_len; 3173 3174 if (hammer_live_dedup) 3175 hammer_dedup_cache_add(ip, cursor.leaf); 3176 } 3177 3178 error = hammer_ip_next(&cursor); 3179 } 3180 3181 #if 0 3182 kprintf("BMAP %016llx: %016llx - %016llx\n", 3183 (long long)ap->a_loffset, 3184 (long long)base_offset, 3185 (long long)last_offset); 3186 kprintf("BMAP %16s: %016llx - %016llx\n", "", 3187 (long long)base_disk_offset, 3188 (long long)last_disk_offset); 3189 #endif 3190 3191 if (cursor.node) { 3192 hammer_cache_node(&ip->cache[1], cursor.node); 3193 #if 0 3194 kprintf("bmap_end2 %016llx ip->cache %p\n", 3195 (long long)ap->a_loffset, ip->cache[1]); 3196 #endif 3197 } 3198 hammer_done_cursor(&cursor); 3199 hammer_done_transaction(&trans); 3200 lwkt_reltoken(&hmp->fs_token); 3201 3202 /* 3203 * If we couldn't find any records or the records we did find were 3204 * all behind the requested offset, return failure. A forward 3205 * truncation can leave a hole w/ no on-disk records. 3206 */ 3207 if (last_offset == 0 || last_offset < ap->a_loffset) 3208 return (EOPNOTSUPP); 3209 3210 /* 3211 * Figure out the block size at the requested offset and adjust 3212 * our limits so the cluster_read() does not create inappropriately 3213 * sized buffer cache buffers. 3214 */ 3215 blksize = hammer_blocksize(ap->a_loffset); 3216 if (hammer_blocksize(base_offset) != blksize) { 3217 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 3218 } 3219 if (last_offset != ap->a_loffset && 3220 hammer_blocksize(last_offset - 1) != blksize) { 3221 last_offset = hammer_blockdemarc(ap->a_loffset, 3222 last_offset - 1); 3223 } 3224 3225 /* 3226 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 3227 * from occuring. 3228 */ 3229 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 3230 3231 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 3232 /* 3233 * Only large-data zones can be direct-IOd 3234 */ 3235 error = EOPNOTSUPP; 3236 } else if ((disk_offset & HAMMER_BUFMASK) || 3237 (last_offset - ap->a_loffset) < blksize) { 3238 /* 3239 * doffsetp is not aligned or the forward run size does 3240 * not cover a whole buffer, disallow the direct I/O. 3241 */ 3242 error = EOPNOTSUPP; 3243 } else { 3244 /* 3245 * We're good. 3246 */ 3247 *ap->a_doffsetp = disk_offset; 3248 if (ap->a_runb) { 3249 *ap->a_runb = ap->a_loffset - base_offset; 3250 KKASSERT(*ap->a_runb >= 0); 3251 } 3252 if (ap->a_runp) { 3253 *ap->a_runp = last_offset - ap->a_loffset; 3254 KKASSERT(*ap->a_runp >= 0); 3255 } 3256 error = 0; 3257 } 3258 return(error); 3259 } 3260 3261 /* 3262 * Write to a regular file. Because this is a strategy call the OS is 3263 * trying to actually get data onto the media. 3264 */ 3265 static 3266 int 3267 hammer_vop_strategy_write(struct vop_strategy_args *ap) 3268 { 3269 hammer_record_t record; 3270 hammer_mount_t hmp; 3271 hammer_inode_t ip; 3272 struct bio *bio; 3273 struct buf *bp; 3274 int blksize __debugvar; 3275 int bytes; 3276 int error; 3277 3278 bio = ap->a_bio; 3279 bp = bio->bio_buf; 3280 ip = ap->a_vp->v_data; 3281 hmp = ip->hmp; 3282 3283 blksize = hammer_blocksize(bio->bio_offset); 3284 KKASSERT(bp->b_bufsize == blksize); 3285 3286 if (ip->flags & HAMMER_INODE_RO) { 3287 bp->b_error = EROFS; 3288 bp->b_flags |= B_ERROR; 3289 biodone(ap->a_bio); 3290 return(EROFS); 3291 } 3292 3293 lwkt_gettoken(&hmp->fs_token); 3294 3295 /* 3296 * Disallow swapcache operation on the vnode buffer if double 3297 * buffering is enabled, the swapcache will get the data via 3298 * the block device buffer. 3299 */ 3300 if (hammer_double_buffer) 3301 bp->b_flags |= B_NOTMETA; 3302 3303 /* 3304 * Interlock with inode destruction (no in-kernel or directory 3305 * topology visibility). If we queue new IO while trying to 3306 * destroy the inode we can deadlock the vtrunc call in 3307 * hammer_inode_unloadable_check(). 3308 * 3309 * Besides, there's no point flushing a bp associated with an 3310 * inode that is being destroyed on-media and has no kernel 3311 * references. 3312 */ 3313 if ((ip->flags | ip->sync_flags) & 3314 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 3315 bp->b_resid = 0; 3316 biodone(ap->a_bio); 3317 lwkt_reltoken(&hmp->fs_token); 3318 return(0); 3319 } 3320 3321 /* 3322 * Reserve space and issue a direct-write from the front-end. 3323 * NOTE: The direct_io code will hammer_bread/bcopy smaller 3324 * allocations. 3325 * 3326 * An in-memory record will be installed to reference the storage 3327 * until the flusher can get to it. 3328 * 3329 * Since we own the high level bio the front-end will not try to 3330 * do a direct-read until the write completes. 3331 * 3332 * NOTE: The only time we do not reserve a full-sized buffers 3333 * worth of data is if the file is small. We do not try to 3334 * allocate a fragment (from the small-data zone) at the end of 3335 * an otherwise large file as this can lead to wildly separated 3336 * data. 3337 */ 3338 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 3339 KKASSERT(bio->bio_offset < ip->ino_data.size); 3340 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2) 3341 bytes = bp->b_bufsize; 3342 else 3343 bytes = ((int)ip->ino_data.size + 15) & ~15; 3344 3345 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 3346 bytes, &error); 3347 3348 /* 3349 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated 3350 * in hammer_vop_write(). We must flag the record so the proper 3351 * REDO_TERM_WRITE entry is generated during the flush. 3352 */ 3353 if (record) { 3354 if (bp->b_flags & B_VFSFLAG1) { 3355 record->flags |= HAMMER_RECF_REDO; 3356 bp->b_flags &= ~B_VFSFLAG1; 3357 } 3358 if (record->flags & HAMMER_RECF_DEDUPED) { 3359 bp->b_resid = 0; 3360 hammer_ip_replace_bulk(hmp, record); 3361 biodone(ap->a_bio); 3362 } else { 3363 hammer_io_direct_write(hmp, bio, record); 3364 } 3365 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 3366 hammer_flush_inode(ip, 0); 3367 } else { 3368 bp->b_bio2.bio_offset = NOOFFSET; 3369 bp->b_error = error; 3370 bp->b_flags |= B_ERROR; 3371 biodone(ap->a_bio); 3372 } 3373 lwkt_reltoken(&hmp->fs_token); 3374 return(error); 3375 } 3376 3377 /* 3378 * dounlink - disconnect a directory entry 3379 * 3380 * XXX whiteout support not really in yet 3381 */ 3382 static int 3383 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 3384 struct vnode *dvp, struct ucred *cred, 3385 int flags, int isdir) 3386 { 3387 struct namecache *ncp; 3388 hammer_inode_t dip; 3389 hammer_inode_t ip; 3390 hammer_mount_t hmp; 3391 struct hammer_cursor cursor; 3392 int64_t namekey; 3393 u_int32_t max_iterations; 3394 int nlen, error; 3395 3396 /* 3397 * Calculate the namekey and setup the key range for the scan. This 3398 * works kinda like a chained hash table where the lower 32 bits 3399 * of the namekey synthesize the chain. 3400 * 3401 * The key range is inclusive of both key_beg and key_end. 3402 */ 3403 dip = VTOI(dvp); 3404 ncp = nch->ncp; 3405 hmp = dip->hmp; 3406 3407 if (dip->flags & HAMMER_INODE_RO) 3408 return (EROFS); 3409 3410 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3411 &max_iterations); 3412 retry: 3413 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3414 cursor.key_beg.localization = dip->obj_localization + 3415 hammer_dir_localization(dip); 3416 cursor.key_beg.obj_id = dip->obj_id; 3417 cursor.key_beg.key = namekey; 3418 cursor.key_beg.create_tid = 0; 3419 cursor.key_beg.delete_tid = 0; 3420 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3421 cursor.key_beg.obj_type = 0; 3422 3423 cursor.key_end = cursor.key_beg; 3424 cursor.key_end.key += max_iterations; 3425 cursor.asof = dip->obj_asof; 3426 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3427 3428 /* 3429 * Scan all matching records (the chain), locate the one matching 3430 * the requested path component. info->last_error contains the 3431 * error code on search termination and could be 0, ENOENT, or 3432 * something else. 3433 * 3434 * The hammer_ip_*() functions merge in-memory records with on-disk 3435 * records for the purposes of the search. 3436 */ 3437 error = hammer_ip_first(&cursor); 3438 3439 while (error == 0) { 3440 error = hammer_ip_resolve_data(&cursor); 3441 if (error) 3442 break; 3443 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3444 KKASSERT(nlen > 0); 3445 if (ncp->nc_nlen == nlen && 3446 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3447 break; 3448 } 3449 error = hammer_ip_next(&cursor); 3450 } 3451 3452 /* 3453 * If all is ok we have to get the inode so we can adjust nlinks. 3454 * To avoid a deadlock with the flusher we must release the inode 3455 * lock on the directory when acquiring the inode for the entry. 3456 * 3457 * If the target is a directory, it must be empty. 3458 */ 3459 if (error == 0) { 3460 hammer_unlock(&cursor.ip->lock); 3461 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3462 hmp->asof, 3463 cursor.data->entry.localization, 3464 0, &error); 3465 hammer_lock_sh(&cursor.ip->lock); 3466 if (error == ENOENT) { 3467 kprintf("HAMMER: WARNING: Removing " 3468 "dirent w/missing inode \"%s\"\n" 3469 "\tobj_id = %016llx\n", 3470 ncp->nc_name, 3471 (long long)cursor.data->entry.obj_id); 3472 error = 0; 3473 } 3474 3475 /* 3476 * If isdir >= 0 we validate that the entry is or is not a 3477 * directory. If isdir < 0 we don't care. 3478 */ 3479 if (error == 0 && isdir >= 0 && ip) { 3480 if (isdir && 3481 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3482 error = ENOTDIR; 3483 } else if (isdir == 0 && 3484 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3485 error = EISDIR; 3486 } 3487 } 3488 3489 /* 3490 * If we are trying to remove a directory the directory must 3491 * be empty. 3492 * 3493 * The check directory code can loop and deadlock/retry. Our 3494 * own cursor's node locks must be released to avoid a 3-way 3495 * deadlock with the flusher if the check directory code 3496 * blocks. 3497 * 3498 * If any changes whatsoever have been made to the cursor 3499 * set EDEADLK and retry. 3500 * 3501 * WARNING: See warnings in hammer_unlock_cursor() 3502 * function. 3503 */ 3504 if (error == 0 && ip && ip->ino_data.obj_type == 3505 HAMMER_OBJTYPE_DIRECTORY) { 3506 hammer_unlock_cursor(&cursor); 3507 error = hammer_ip_check_directory_empty(trans, ip); 3508 hammer_lock_cursor(&cursor); 3509 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3510 kprintf("HAMMER: Warning: avoided deadlock " 3511 "on rmdir '%s'\n", 3512 ncp->nc_name); 3513 error = EDEADLK; 3514 } 3515 } 3516 3517 /* 3518 * Delete the directory entry. 3519 * 3520 * WARNING: hammer_ip_del_directory() may have to terminate 3521 * the cursor to avoid a deadlock. It is ok to call 3522 * hammer_done_cursor() twice. 3523 */ 3524 if (error == 0) { 3525 error = hammer_ip_del_directory(trans, &cursor, 3526 dip, ip); 3527 } 3528 hammer_done_cursor(&cursor); 3529 if (error == 0) { 3530 /* 3531 * Tell the namecache that we are now unlinked. 3532 */ 3533 cache_unlink(nch); 3534 3535 /* 3536 * NOTE: ip->vp, if non-NULL, cannot be directly 3537 * referenced without formally acquiring the 3538 * vp since the vp might have zero refs on it, 3539 * or in the middle of a reclaim, etc. 3540 * 3541 * NOTE: The cache_setunresolved() can rip the vp 3542 * out from under us since the vp may not have 3543 * any refs, in which case ip->vp will be NULL 3544 * from the outset. 3545 */ 3546 while (ip && ip->vp) { 3547 struct vnode *vp; 3548 3549 error = hammer_get_vnode(ip, &vp); 3550 if (error == 0 && vp) { 3551 vn_unlock(vp); 3552 hammer_knote(ip->vp, NOTE_DELETE); 3553 #if 0 3554 /* 3555 * Don't do this, it can deadlock 3556 * on concurrent rm's of hardlinks. 3557 * Shouldn't be needed any more. 3558 */ 3559 cache_inval_vp(ip->vp, CINV_DESTROY); 3560 #endif 3561 vrele(vp); 3562 break; 3563 } 3564 kprintf("Debug: HAMMER ip/vp race1 avoided\n"); 3565 } 3566 } 3567 if (ip) 3568 hammer_rel_inode(ip, 0); 3569 } else { 3570 hammer_done_cursor(&cursor); 3571 } 3572 if (error == EDEADLK) 3573 goto retry; 3574 3575 return (error); 3576 } 3577 3578 /************************************************************************ 3579 * FIFO AND SPECFS OPS * 3580 ************************************************************************ 3581 * 3582 */ 3583 static int 3584 hammer_vop_fifoclose (struct vop_close_args *ap) 3585 { 3586 /* XXX update itimes */ 3587 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3588 } 3589 3590 static int 3591 hammer_vop_fiforead (struct vop_read_args *ap) 3592 { 3593 int error; 3594 3595 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3596 /* XXX update access time */ 3597 return (error); 3598 } 3599 3600 static int 3601 hammer_vop_fifowrite (struct vop_write_args *ap) 3602 { 3603 int error; 3604 3605 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3606 /* XXX update access time */ 3607 return (error); 3608 } 3609 3610 static 3611 int 3612 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3613 { 3614 int error; 3615 3616 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3617 if (error) 3618 error = hammer_vop_kqfilter(ap); 3619 return(error); 3620 } 3621 3622 /************************************************************************ 3623 * KQFILTER OPS * 3624 ************************************************************************ 3625 * 3626 */ 3627 static void filt_hammerdetach(struct knote *kn); 3628 static int filt_hammerread(struct knote *kn, long hint); 3629 static int filt_hammerwrite(struct knote *kn, long hint); 3630 static int filt_hammervnode(struct knote *kn, long hint); 3631 3632 static struct filterops hammerread_filtops = 3633 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3634 NULL, filt_hammerdetach, filt_hammerread }; 3635 static struct filterops hammerwrite_filtops = 3636 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3637 NULL, filt_hammerdetach, filt_hammerwrite }; 3638 static struct filterops hammervnode_filtops = 3639 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3640 NULL, filt_hammerdetach, filt_hammervnode }; 3641 3642 static 3643 int 3644 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3645 { 3646 struct vnode *vp = ap->a_vp; 3647 struct knote *kn = ap->a_kn; 3648 3649 switch (kn->kn_filter) { 3650 case EVFILT_READ: 3651 kn->kn_fop = &hammerread_filtops; 3652 break; 3653 case EVFILT_WRITE: 3654 kn->kn_fop = &hammerwrite_filtops; 3655 break; 3656 case EVFILT_VNODE: 3657 kn->kn_fop = &hammervnode_filtops; 3658 break; 3659 default: 3660 return (EOPNOTSUPP); 3661 } 3662 3663 kn->kn_hook = (caddr_t)vp; 3664 3665 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3666 3667 return(0); 3668 } 3669 3670 static void 3671 filt_hammerdetach(struct knote *kn) 3672 { 3673 struct vnode *vp = (void *)kn->kn_hook; 3674 3675 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3676 } 3677 3678 static int 3679 filt_hammerread(struct knote *kn, long hint) 3680 { 3681 struct vnode *vp = (void *)kn->kn_hook; 3682 hammer_inode_t ip = VTOI(vp); 3683 hammer_mount_t hmp = ip->hmp; 3684 off_t off; 3685 3686 if (hint == NOTE_REVOKE) { 3687 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3688 return(1); 3689 } 3690 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3691 off = ip->ino_data.size - kn->kn_fp->f_offset; 3692 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3693 lwkt_reltoken(&hmp->fs_token); 3694 if (kn->kn_sfflags & NOTE_OLDAPI) 3695 return(1); 3696 return (kn->kn_data != 0); 3697 } 3698 3699 static int 3700 filt_hammerwrite(struct knote *kn, long hint) 3701 { 3702 if (hint == NOTE_REVOKE) 3703 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3704 kn->kn_data = 0; 3705 return (1); 3706 } 3707 3708 static int 3709 filt_hammervnode(struct knote *kn, long hint) 3710 { 3711 if (kn->kn_sfflags & hint) 3712 kn->kn_fflags |= hint; 3713 if (hint == NOTE_REVOKE) { 3714 kn->kn_flags |= (EV_EOF | EV_NODATA); 3715 return (1); 3716 } 3717 return (kn->kn_fflags != 0); 3718 } 3719 3720