1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/kernel.h> 38 #include <sys/fcntl.h> 39 #include <sys/namecache.h> 40 #include <sys/vnode.h> 41 #include <sys/lockf.h> 42 #include <sys/event.h> 43 #include <sys/stat.h> 44 #include <sys/dirent.h> 45 #include <sys/file.h> 46 #include <vm/vm_extern.h> 47 #include <vm/swap_pager.h> 48 #include <vfs/fifofs/fifo.h> 49 50 #include "hammer.h" 51 52 /* 53 * USERFS VNOPS 54 */ 55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 56 static int hammer_vop_fsync(struct vop_fsync_args *); 57 static int hammer_vop_read(struct vop_read_args *); 58 static int hammer_vop_write(struct vop_write_args *); 59 static int hammer_vop_access(struct vop_access_args *); 60 static int hammer_vop_advlock(struct vop_advlock_args *); 61 static int hammer_vop_close(struct vop_close_args *); 62 static int hammer_vop_ncreate(struct vop_ncreate_args *); 63 static int hammer_vop_getattr(struct vop_getattr_args *); 64 static int hammer_vop_nresolve(struct vop_nresolve_args *); 65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 66 static int hammer_vop_nlink(struct vop_nlink_args *); 67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 68 static int hammer_vop_nmknod(struct vop_nmknod_args *); 69 static int hammer_vop_open(struct vop_open_args *); 70 static int hammer_vop_print(struct vop_print_args *); 71 static int hammer_vop_readdir(struct vop_readdir_args *); 72 static int hammer_vop_readlink(struct vop_readlink_args *); 73 static int hammer_vop_nremove(struct vop_nremove_args *); 74 static int hammer_vop_nrename(struct vop_nrename_args *); 75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 76 static int hammer_vop_markatime(struct vop_markatime_args *); 77 static int hammer_vop_setattr(struct vop_setattr_args *); 78 static int hammer_vop_strategy(struct vop_strategy_args *); 79 static int hammer_vop_bmap(struct vop_bmap_args *ap); 80 static int hammer_vop_nsymlink(struct 
vop_nsymlink_args *); 81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 82 static int hammer_vop_ioctl(struct vop_ioctl_args *); 83 static int hammer_vop_mountctl(struct vop_mountctl_args *); 84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 85 86 static int hammer_vop_fifoclose (struct vop_close_args *); 87 static int hammer_vop_fiforead (struct vop_read_args *); 88 static int hammer_vop_fifowrite (struct vop_write_args *); 89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 90 91 struct vop_ops hammer_vnode_vops = { 92 .vop_default = vop_defaultop, 93 .vop_fsync = hammer_vop_fsync, 94 .vop_getpages = vop_stdgetpages, 95 .vop_putpages = vop_stdputpages, 96 .vop_read = hammer_vop_read, 97 .vop_write = hammer_vop_write, 98 .vop_access = hammer_vop_access, 99 .vop_advlock = hammer_vop_advlock, 100 .vop_close = hammer_vop_close, 101 .vop_ncreate = hammer_vop_ncreate, 102 .vop_getattr = hammer_vop_getattr, 103 .vop_inactive = hammer_vop_inactive, 104 .vop_reclaim = hammer_vop_reclaim, 105 .vop_nresolve = hammer_vop_nresolve, 106 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 107 .vop_nlink = hammer_vop_nlink, 108 .vop_nmkdir = hammer_vop_nmkdir, 109 .vop_nmknod = hammer_vop_nmknod, 110 .vop_open = hammer_vop_open, 111 .vop_pathconf = vop_stdpathconf, 112 .vop_print = hammer_vop_print, 113 .vop_readdir = hammer_vop_readdir, 114 .vop_readlink = hammer_vop_readlink, 115 .vop_nremove = hammer_vop_nremove, 116 .vop_nrename = hammer_vop_nrename, 117 .vop_nrmdir = hammer_vop_nrmdir, 118 .vop_markatime = hammer_vop_markatime, 119 .vop_setattr = hammer_vop_setattr, 120 .vop_bmap = hammer_vop_bmap, 121 .vop_strategy = hammer_vop_strategy, 122 .vop_nsymlink = hammer_vop_nsymlink, 123 .vop_nwhiteout = hammer_vop_nwhiteout, 124 .vop_ioctl = hammer_vop_ioctl, 125 .vop_mountctl = hammer_vop_mountctl, 126 .vop_kqfilter = hammer_vop_kqfilter 127 }; 128 129 struct vop_ops hammer_spec_vops = { 130 .vop_default = vop_defaultop, 131 .vop_fsync = 
hammer_vop_fsync, 132 .vop_read = vop_stdnoread, 133 .vop_write = vop_stdnowrite, 134 .vop_access = hammer_vop_access, 135 .vop_close = hammer_vop_close, 136 .vop_markatime = hammer_vop_markatime, 137 .vop_getattr = hammer_vop_getattr, 138 .vop_inactive = hammer_vop_inactive, 139 .vop_reclaim = hammer_vop_reclaim, 140 .vop_setattr = hammer_vop_setattr 141 }; 142 143 struct vop_ops hammer_fifo_vops = { 144 .vop_default = fifo_vnoperate, 145 .vop_fsync = hammer_vop_fsync, 146 .vop_read = hammer_vop_fiforead, 147 .vop_write = hammer_vop_fifowrite, 148 .vop_access = hammer_vop_access, 149 .vop_close = hammer_vop_fifoclose, 150 .vop_markatime = hammer_vop_markatime, 151 .vop_getattr = hammer_vop_getattr, 152 .vop_inactive = hammer_vop_inactive, 153 .vop_reclaim = hammer_vop_reclaim, 154 .vop_setattr = hammer_vop_setattr, 155 .vop_kqfilter = hammer_vop_fifokqfilter 156 }; 157 158 static __inline 159 void 160 hammer_knote(struct vnode *vp, int flags) 161 { 162 if (flags) 163 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 164 } 165 166 #ifdef DEBUG_TRUNCATE 167 struct hammer_inode *HammerTruncIp; 168 #endif 169 170 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 171 struct vnode *dvp, struct ucred *cred, 172 int flags, int isdir); 173 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 174 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 175 176 #if 0 177 static 178 int 179 hammer_vop_vnoperate(struct vop_generic_args *) 180 { 181 return (VOCALL(&hammer_vnode_vops, ap)); 182 } 183 #endif 184 185 /* 186 * hammer_vop_fsync { vp, waitfor } 187 * 188 * fsync() an inode to disk and wait for it to be completely committed 189 * such that the information would not be undone if a crash occured after 190 * return. 191 * 192 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 193 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 194 * operation. 
195 * 196 * Ultimately the combination of a REDO log and use of fast storage 197 * to front-end cluster caches will make fsync fast, but it aint 198 * here yet. And, in anycase, we need real transactional 199 * all-or-nothing features which are not restricted to a single file. 200 */ 201 static 202 int 203 hammer_vop_fsync(struct vop_fsync_args *ap) 204 { 205 hammer_inode_t ip = VTOI(ap->a_vp); 206 hammer_mount_t hmp = ip->hmp; 207 int waitfor = ap->a_waitfor; 208 int mode; 209 210 lwkt_gettoken(&hmp->fs_token); 211 212 /* 213 * Fsync rule relaxation (default is either full synchronous flush 214 * or REDO semantics with synchronous flush). 215 */ 216 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 217 switch(hammer_fsync_mode) { 218 case 0: 219 mode0: 220 /* no REDO, full synchronous flush */ 221 goto skip; 222 case 1: 223 mode1: 224 /* no REDO, full asynchronous flush */ 225 if (waitfor == MNT_WAIT) 226 waitfor = MNT_NOWAIT; 227 goto skip; 228 case 2: 229 /* REDO semantics, synchronous flush */ 230 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 231 goto mode0; 232 mode = HAMMER_FLUSH_UNDOS_AUTO; 233 break; 234 case 3: 235 /* REDO semantics, relaxed asynchronous flush */ 236 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 237 goto mode1; 238 mode = HAMMER_FLUSH_UNDOS_RELAXED; 239 if (waitfor == MNT_WAIT) 240 waitfor = MNT_NOWAIT; 241 break; 242 case 4: 243 /* ignore the fsync() system call */ 244 lwkt_reltoken(&hmp->fs_token); 245 return(0); 246 default: 247 /* we have to do something */ 248 mode = HAMMER_FLUSH_UNDOS_RELAXED; 249 if (waitfor == MNT_WAIT) 250 waitfor = MNT_NOWAIT; 251 break; 252 } 253 254 /* 255 * Fast fsync only needs to flush the UNDO/REDO fifo if 256 * HAMMER_INODE_REDO is non-zero and the only modifications 257 * made to the file are write or write-extends. 
258 */ 259 if ((ip->flags & HAMMER_INODE_REDO) && 260 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0 261 ) { 262 ++hammer_count_fsyncs; 263 hammer_flusher_flush_undos(hmp, mode); 264 ip->redo_count = 0; 265 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 266 vclrisdirty(ip->vp); 267 lwkt_reltoken(&hmp->fs_token); 268 return(0); 269 } 270 271 /* 272 * REDO is enabled by fsync(), the idea being we really only 273 * want to lay down REDO records when programs are using 274 * fsync() heavily. The first fsync() on the file starts 275 * the gravy train going and later fsync()s keep it hot by 276 * resetting the redo_count. 277 * 278 * We weren't running REDOs before now so we have to fall 279 * through and do a full fsync of what we have. 280 */ 281 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 282 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 283 ip->flags |= HAMMER_INODE_REDO; 284 ip->redo_count = 0; 285 } 286 } 287 skip: 288 289 /* 290 * Do a full flush sequence. 291 * 292 * Attempt to release the vnode while waiting for the inode to 293 * finish flushing. This can really mess up inactive->reclaim 294 * sequences so only do it if the vnode is active. 295 * 296 * WARNING! The VX lock functions must be used. vn_lock() will 297 * fail when this is part of a VOP_RECLAIM sequence. 
298 */ 299 ++hammer_count_fsyncs; 300 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 301 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 302 if (waitfor == MNT_WAIT) { 303 int dorelock; 304 305 if ((ap->a_vp->v_flag & VRECLAIMED) == 0) { 306 vx_unlock(ap->a_vp); 307 dorelock = 1; 308 } else { 309 dorelock = 0; 310 } 311 hammer_wait_inode(ip); 312 if (dorelock) 313 vx_lock(ap->a_vp); 314 } 315 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 316 vclrisdirty(ip->vp); 317 lwkt_reltoken(&hmp->fs_token); 318 return (ip->error); 319 } 320 321 /* 322 * hammer_vop_read { vp, uio, ioflag, cred } 323 * 324 * MPSAFE (for the cache safe does not require fs_token) 325 */ 326 static 327 int 328 hammer_vop_read(struct vop_read_args *ap) 329 { 330 struct hammer_transaction trans; 331 hammer_inode_t ip; 332 hammer_mount_t hmp; 333 off_t offset; 334 struct buf *bp; 335 struct uio *uio; 336 int error; 337 int n; 338 int seqcount; 339 int ioseqcount; 340 int blksize; 341 int bigread; 342 int got_trans; 343 size_t resid; 344 345 if (ap->a_vp->v_type != VREG) 346 return (EINVAL); 347 ip = VTOI(ap->a_vp); 348 hmp = ip->hmp; 349 error = 0; 350 got_trans = 0; 351 uio = ap->a_uio; 352 353 /* 354 * Attempt to shortcut directly to the VM object using lwbufs. 355 * This is much faster than instantiating buffer cache buffers. 356 */ 357 resid = uio->uio_resid; 358 error = vop_helper_read_shortcut(ap); 359 hammer_stats_file_read += resid - uio->uio_resid; 360 if (error) 361 return (error); 362 if (uio->uio_resid == 0) 363 goto finished; 364 365 /* 366 * Allow the UIO's size to override the sequential heuristic. 367 */ 368 blksize = hammer_blocksize(uio->uio_offset); 369 seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; 370 ioseqcount = (ap->a_ioflag >> 16); 371 if (seqcount < ioseqcount) 372 seqcount = ioseqcount; 373 374 /* 375 * If reading or writing a huge amount of data we have to break 376 * atomicy and allow the operation to be interrupted by a signal 377 * or it can DOS the machine. 
378 */ 379 bigread = (uio->uio_resid > 100 * 1024 * 1024); 380 381 /* 382 * Access the data typically in HAMMER_BUFSIZE blocks via the 383 * buffer cache, but HAMMER may use a variable block size based 384 * on the offset. 385 * 386 * XXX Temporary hack, delay the start transaction while we remain 387 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 388 * locked-shared. 389 */ 390 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 391 int64_t base_offset; 392 int64_t file_limit; 393 394 blksize = hammer_blocksize(uio->uio_offset); 395 offset = (int)uio->uio_offset & (blksize - 1); 396 base_offset = uio->uio_offset - offset; 397 398 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 399 break; 400 401 /* 402 * MPSAFE 403 */ 404 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 405 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 406 bp->b_flags &= ~B_AGE; 407 error = 0; 408 goto skip; 409 } 410 if (ap->a_ioflag & IO_NRDELAY) { 411 bqrelse(bp); 412 return (EWOULDBLOCK); 413 } 414 415 /* 416 * MPUNSAFE 417 */ 418 if (got_trans == 0) { 419 hammer_start_transaction(&trans, ip->hmp); 420 got_trans = 1; 421 } 422 423 /* 424 * NOTE: A valid bp has already been acquired, but was not 425 * B_CACHE. 426 */ 427 if (hammer_cluster_enable) { 428 /* 429 * Use file_limit to prevent cluster_read() from 430 * creating buffers of the wrong block size past 431 * the demarc. 
432 */ 433 file_limit = ip->ino_data.size; 434 if (base_offset < HAMMER_XDEMARC && 435 file_limit > HAMMER_XDEMARC) { 436 file_limit = HAMMER_XDEMARC; 437 } 438 error = cluster_readx(ap->a_vp, 439 file_limit, base_offset, 440 blksize, uio->uio_resid, 441 seqcount * BKVASIZE, &bp); 442 } else { 443 error = breadnx(ap->a_vp, base_offset, blksize, 444 NULL, NULL, 0, &bp); 445 } 446 if (error) { 447 brelse(bp); 448 break; 449 } 450 skip: 451 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { 452 kprintf("doff %016jx read file %016jx@%016jx\n", 453 (intmax_t)bp->b_bio2.bio_offset, 454 (intmax_t)ip->obj_id, 455 (intmax_t)bp->b_loffset); 456 } 457 bp->b_flags &= ~B_IODEBUG; 458 if (blksize == HAMMER_XBUFSIZE) 459 bp->b_flags |= B_CLUSTEROK; 460 461 n = blksize - offset; 462 if (n > uio->uio_resid) 463 n = uio->uio_resid; 464 if (n > ip->ino_data.size - uio->uio_offset) 465 n = (int)(ip->ino_data.size - uio->uio_offset); 466 467 /* 468 * Set B_AGE, data has a lower priority than meta-data. 469 * 470 * Use a hold/unlock/drop sequence to run the uiomove 471 * with the buffer unlocked, avoiding deadlocks against 472 * read()s on mmap()'d spaces. 473 */ 474 bp->b_flags |= B_AGE; 475 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio); 476 bqrelse(bp); 477 478 if (error) 479 break; 480 hammer_stats_file_read += n; 481 } 482 483 finished: 484 485 /* 486 * Try to update the atime with just the inode lock for maximum 487 * concurrency. If we can't shortcut it we have to get the full 488 * blown transaction. 
489 */ 490 if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) { 491 hammer_start_transaction(&trans, ip->hmp); 492 got_trans = 1; 493 } 494 495 if (got_trans) { 496 if ((ip->flags & HAMMER_INODE_RO) == 0 && 497 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 498 lwkt_gettoken(&hmp->fs_token); 499 ip->ino_data.atime = trans.time; 500 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 501 hammer_done_transaction(&trans); 502 lwkt_reltoken(&hmp->fs_token); 503 } else { 504 hammer_done_transaction(&trans); 505 } 506 } 507 return (error); 508 } 509 510 /* 511 * hammer_vop_write { vp, uio, ioflag, cred } 512 */ 513 static 514 int 515 hammer_vop_write(struct vop_write_args *ap) 516 { 517 struct hammer_transaction trans; 518 struct hammer_inode *ip; 519 hammer_mount_t hmp; 520 thread_t td; 521 struct uio *uio; 522 int offset; 523 off_t base_offset; 524 int64_t cluster_eof; 525 struct buf *bp; 526 int kflags; 527 int error; 528 int n; 529 int flags; 530 int seqcount; 531 int bigwrite; 532 533 if (ap->a_vp->v_type != VREG) 534 return (EINVAL); 535 ip = VTOI(ap->a_vp); 536 hmp = ip->hmp; 537 error = 0; 538 kflags = 0; 539 seqcount = ap->a_ioflag >> 16; 540 541 if (ip->flags & HAMMER_INODE_RO) 542 return (EROFS); 543 544 /* 545 * Create a transaction to cover the operations we perform. 546 */ 547 hammer_start_transaction(&trans, hmp); 548 uio = ap->a_uio; 549 550 /* 551 * Check append mode 552 */ 553 if (ap->a_ioflag & IO_APPEND) 554 uio->uio_offset = ip->ino_data.size; 555 556 /* 557 * Check for illegal write offsets. Valid range is 0...2^63-1. 558 * 559 * NOTE: the base_off assignment is required to work around what 560 * I consider to be a GCC-4 optimization bug. 
561 */ 562 if (uio->uio_offset < 0) { 563 hammer_done_transaction(&trans); 564 return (EFBIG); 565 } 566 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 567 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 568 hammer_done_transaction(&trans); 569 return (EFBIG); 570 } 571 572 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 573 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 574 hammer_done_transaction(&trans); 575 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 576 return (EFBIG); 577 } 578 579 /* 580 * If reading or writing a huge amount of data we have to break 581 * atomicy and allow the operation to be interrupted by a signal 582 * or it can DOS the machine. 583 * 584 * Preset redo_count so we stop generating REDOs earlier if the 585 * limit is exceeded. 586 * 587 * redo_count is heuristical, SMP races are ok 588 */ 589 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 590 if ((ip->flags & HAMMER_INODE_REDO) && 591 ip->redo_count < hammer_limit_redo) { 592 ip->redo_count += uio->uio_resid; 593 } 594 595 /* 596 * Access the data typically in HAMMER_BUFSIZE blocks via the 597 * buffer cache, but HAMMER may use a variable block size based 598 * on the offset. 599 */ 600 while (uio->uio_resid > 0) { 601 int fixsize = 0; 602 int blksize; 603 int blkmask; 604 int trivial; 605 int endofblk; 606 off_t nsize; 607 608 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 609 break; 610 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 611 break; 612 613 blksize = hammer_blocksize(uio->uio_offset); 614 615 /* 616 * Control the number of pending records associated with 617 * this inode. If too many have accumulated start a 618 * flush. Try to maintain a pipeline with the flusher. 619 * 620 * NOTE: It is possible for other sources to grow the 621 * records but not necessarily issue another flush, 622 * so use a timeout and ensure that a re-flush occurs. 
623 */ 624 if (ip->rsv_recs >= hammer_limit_inode_recs) { 625 lwkt_gettoken(&hmp->fs_token); 626 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 627 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 628 ip->flags |= HAMMER_INODE_RECSW; 629 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 630 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 631 } 632 lwkt_reltoken(&hmp->fs_token); 633 } 634 635 /* 636 * Do not allow HAMMER to blow out the buffer cache. Very 637 * large UIOs can lockout other processes due to bwillwrite() 638 * mechanics. 639 * 640 * The hammer inode is not locked during these operations. 641 * The vnode is locked which can interfere with the pageout 642 * daemon for non-UIO_NOCOPY writes but should not interfere 643 * with the buffer cache. Even so, we cannot afford to 644 * allow the pageout daemon to build up too many dirty buffer 645 * cache buffers. 646 * 647 * Only call this if we aren't being recursively called from 648 * a virtual disk device (vn), else we may deadlock. 649 */ 650 if ((ap->a_ioflag & IO_RECURSE) == 0) 651 bwillwrite(blksize); 652 653 /* 654 * Calculate the blocksize at the current offset and figure 655 * out how much we can actually write. 656 */ 657 blkmask = blksize - 1; 658 offset = (int)uio->uio_offset & blkmask; 659 base_offset = uio->uio_offset & ~(int64_t)blkmask; 660 n = blksize - offset; 661 if (n > uio->uio_resid) { 662 n = uio->uio_resid; 663 endofblk = 0; 664 } else { 665 endofblk = 1; 666 } 667 nsize = uio->uio_offset + n; 668 if (nsize > ip->ino_data.size) { 669 if (uio->uio_offset > ip->ino_data.size) 670 trivial = 0; 671 else 672 trivial = 1; 673 nvextendbuf(ap->a_vp, 674 ip->ino_data.size, 675 nsize, 676 hammer_blocksize(ip->ino_data.size), 677 hammer_blocksize(nsize), 678 hammer_blockoff(ip->ino_data.size), 679 hammer_blockoff(nsize), 680 trivial); 681 fixsize = 1; 682 kflags |= NOTE_EXTEND; 683 } 684 685 if (uio->uio_segflg == UIO_NOCOPY) { 686 /* 687 * Issuing a write with the same data backing the 688 * buffer. 
Instantiate the buffer to collect the 689 * backing vm pages, then read-in any missing bits. 690 * 691 * This case is used by vop_stdputpages(). 692 */ 693 bp = getblk(ap->a_vp, base_offset, 694 blksize, GETBLK_BHEAVY, 0); 695 if ((bp->b_flags & B_CACHE) == 0) { 696 bqrelse(bp); 697 error = bread(ap->a_vp, base_offset, 698 blksize, &bp); 699 } 700 } else if (offset == 0 && uio->uio_resid >= blksize) { 701 /* 702 * Even though we are entirely overwriting the buffer 703 * we may still have to zero it out to avoid a 704 * mmap/write visibility issue. 705 */ 706 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 707 if ((bp->b_flags & B_CACHE) == 0) 708 vfs_bio_clrbuf(bp); 709 } else if (base_offset >= ip->ino_data.size) { 710 /* 711 * If the base offset of the buffer is beyond the 712 * file EOF, we don't have to issue a read. 713 */ 714 bp = getblk(ap->a_vp, base_offset, 715 blksize, GETBLK_BHEAVY, 0); 716 vfs_bio_clrbuf(bp); 717 } else { 718 /* 719 * Partial overwrite, read in any missing bits then 720 * replace the portion being written. 721 */ 722 error = bread(ap->a_vp, base_offset, blksize, &bp); 723 if (error == 0) 724 bheavy(bp); 725 } 726 if (error == 0) 727 error = uiomovebp(bp, bp->b_data + offset, n, uio); 728 729 lwkt_gettoken(&hmp->fs_token); 730 731 /* 732 * Generate REDO records if enabled and redo_count will not 733 * exceeded the limit. 734 * 735 * If redo_count exceeds the limit we stop generating records 736 * and clear HAMMER_INODE_REDO. This will cause the next 737 * fsync() to do a full meta-data sync instead of just an 738 * UNDO/REDO fifo update. 739 * 740 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 741 * will still be tracked. The tracks will be terminated 742 * when the related meta-data (including possible data 743 * modifications which are not tracked via REDO) is 744 * flushed. 
745 */ 746 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 747 if (ip->redo_count < hammer_limit_redo) { 748 bp->b_flags |= B_VFSFLAG1; 749 error = hammer_generate_redo(&trans, ip, 750 base_offset + offset, 751 HAMMER_REDO_WRITE, 752 bp->b_data + offset, 753 (size_t)n); 754 } else { 755 ip->flags &= ~HAMMER_INODE_REDO; 756 } 757 } 758 759 /* 760 * If we screwed up we have to undo any VM size changes we 761 * made. 762 */ 763 if (error) { 764 brelse(bp); 765 if (fixsize) { 766 nvtruncbuf(ap->a_vp, ip->ino_data.size, 767 hammer_blocksize(ip->ino_data.size), 768 hammer_blockoff(ip->ino_data.size), 769 0); 770 } 771 lwkt_reltoken(&hmp->fs_token); 772 break; 773 } 774 kflags |= NOTE_WRITE; 775 hammer_stats_file_write += n; 776 if (blksize == HAMMER_XBUFSIZE) 777 bp->b_flags |= B_CLUSTEROK; 778 if (ip->ino_data.size < uio->uio_offset) { 779 ip->ino_data.size = uio->uio_offset; 780 flags = HAMMER_INODE_SDIRTY; 781 } else { 782 flags = 0; 783 } 784 ip->ino_data.mtime = trans.time; 785 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 786 hammer_modify_inode(&trans, ip, flags); 787 788 /* 789 * Once we dirty the buffer any cached zone-X offset 790 * becomes invalid. HAMMER NOTE: no-history mode cannot 791 * allow overwriting over the same data sector unless 792 * we provide UNDOs for the old data, which we don't. 793 */ 794 bp->b_bio2.bio_offset = NOOFFSET; 795 796 lwkt_reltoken(&hmp->fs_token); 797 798 /* 799 * Final buffer disposition. 800 * 801 * Because meta-data updates are deferred, HAMMER is 802 * especially sensitive to excessive bdwrite()s because 803 * the I/O stream is not broken up by disk reads. So the 804 * buffer cache simply cannot keep up. 805 * 806 * WARNING! blksize is variable. cluster_write() is 807 * expected to not blow up if it encounters 808 * buffers that do not match the passed blksize. 809 * 810 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 811 * The ip->rsv_recs check should burst-flush the data. 
812 * If we queue it immediately the buf could be left 813 * locked on the device queue for a very long time. 814 * 815 * However, failing to flush a dirty buffer out when 816 * issued from the pageout daemon can result in a low 817 * memory deadlock against bio_page_alloc(), so we 818 * have to bawrite() on IO_ASYNC as well. 819 * 820 * NOTE! To avoid degenerate stalls due to mismatched block 821 * sizes we only honor IO_DIRECT on the write which 822 * abuts the end of the buffer. However, we must 823 * honor IO_SYNC in case someone is silly enough to 824 * configure a HAMMER file as swap, or when HAMMER 825 * is serving NFS (for commits). Ick ick. 826 */ 827 bp->b_flags |= B_AGE; 828 if (blksize == HAMMER_XBUFSIZE) 829 bp->b_flags |= B_CLUSTEROK; 830 831 if (ap->a_ioflag & IO_SYNC) { 832 bwrite(bp); 833 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 834 bawrite(bp); 835 } else if (ap->a_ioflag & IO_ASYNC) { 836 bawrite(bp); 837 } else if (hammer_cluster_enable && 838 !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 839 if (base_offset < HAMMER_XDEMARC) 840 cluster_eof = hammer_blockdemarc(base_offset, 841 ip->ino_data.size); 842 else 843 cluster_eof = ip->ino_data.size; 844 cluster_write(bp, cluster_eof, blksize, seqcount); 845 } else { 846 bdwrite(bp); 847 } 848 } 849 hammer_done_transaction(&trans); 850 hammer_knote(ap->a_vp, kflags); 851 852 return (error); 853 } 854 855 /* 856 * hammer_vop_access { vp, mode, cred } 857 * 858 * MPSAFE - does not require fs_token 859 */ 860 static 861 int 862 hammer_vop_access(struct vop_access_args *ap) 863 { 864 struct hammer_inode *ip = VTOI(ap->a_vp); 865 uid_t uid; 866 gid_t gid; 867 int error; 868 869 ++hammer_stats_file_iopsr; 870 uid = hammer_to_unix_xid(&ip->ino_data.uid); 871 gid = hammer_to_unix_xid(&ip->ino_data.gid); 872 873 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 874 ip->ino_data.uflags); 875 return (error); 876 } 877 878 /* 879 * hammer_vop_advlock { vp, id, op, fl, flags } 880 * 881 * 
MPSAFE - does not require fs_token 882 */ 883 static 884 int 885 hammer_vop_advlock(struct vop_advlock_args *ap) 886 { 887 hammer_inode_t ip = VTOI(ap->a_vp); 888 889 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 890 } 891 892 /* 893 * hammer_vop_close { vp, fflag } 894 * 895 * We can only sync-on-close for normal closes. XXX disabled for now. 896 */ 897 static 898 int 899 hammer_vop_close(struct vop_close_args *ap) 900 { 901 #if 0 902 struct vnode *vp = ap->a_vp; 903 hammer_inode_t ip = VTOI(vp); 904 int waitfor; 905 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 906 if (vn_islocked(vp) == LK_EXCLUSIVE && 907 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 908 if (ip->flags & HAMMER_INODE_CLOSESYNC) 909 waitfor = MNT_WAIT; 910 else 911 waitfor = MNT_NOWAIT; 912 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 913 HAMMER_INODE_CLOSEASYNC); 914 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 915 } 916 } 917 #endif 918 return (vop_stdclose(ap)); 919 } 920 921 /* 922 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 923 * 924 * The operating system has already ensured that the directory entry 925 * does not exist and done all appropriate namespace locking. 926 */ 927 static 928 int 929 hammer_vop_ncreate(struct vop_ncreate_args *ap) 930 { 931 struct hammer_transaction trans; 932 struct hammer_inode *dip; 933 struct hammer_inode *nip; 934 struct nchandle *nch; 935 hammer_mount_t hmp; 936 int error; 937 938 nch = ap->a_nch; 939 dip = VTOI(ap->a_dvp); 940 hmp = dip->hmp; 941 942 if (dip->flags & HAMMER_INODE_RO) 943 return (EROFS); 944 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 945 return (error); 946 947 /* 948 * Create a transaction to cover the operations we perform. 949 */ 950 lwkt_gettoken(&hmp->fs_token); 951 hammer_start_transaction(&trans, hmp); 952 ++hammer_stats_file_iopsw; 953 954 /* 955 * Create a new filesystem object of the requested type. 
The 956 * returned inode will be referenced and shared-locked to prevent 957 * it from being moved to the flusher. 958 */ 959 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 960 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 961 NULL, &nip); 962 if (error) { 963 hkprintf("hammer_create_inode error %d\n", error); 964 hammer_done_transaction(&trans); 965 *ap->a_vpp = NULL; 966 lwkt_reltoken(&hmp->fs_token); 967 return (error); 968 } 969 970 /* 971 * Add the new filesystem object to the directory. This will also 972 * bump the inode's link count. 973 */ 974 error = hammer_ip_add_directory(&trans, dip, 975 nch->ncp->nc_name, nch->ncp->nc_nlen, 976 nip); 977 if (error) 978 hkprintf("hammer_ip_add_directory error %d\n", error); 979 980 /* 981 * Finish up. 982 */ 983 if (error) { 984 hammer_rel_inode(nip, 0); 985 hammer_done_transaction(&trans); 986 *ap->a_vpp = NULL; 987 } else { 988 error = hammer_get_vnode(nip, ap->a_vpp); 989 hammer_done_transaction(&trans); 990 hammer_rel_inode(nip, 0); 991 if (error == 0) { 992 cache_setunresolved(ap->a_nch); 993 cache_setvp(ap->a_nch, *ap->a_vpp); 994 } 995 hammer_knote(ap->a_dvp, NOTE_WRITE); 996 } 997 lwkt_reltoken(&hmp->fs_token); 998 return (error); 999 } 1000 1001 /* 1002 * hammer_vop_getattr { vp, vap } 1003 * 1004 * Retrieve an inode's attribute information. When accessing inodes 1005 * historically we fake the atime field to ensure consistent results. 1006 * The atime field is stored in the B-Tree element and allowed to be 1007 * updated without cycling the element. 1008 * 1009 * MPSAFE - does not require fs_token 1010 */ 1011 static 1012 int 1013 hammer_vop_getattr(struct vop_getattr_args *ap) 1014 { 1015 struct hammer_inode *ip = VTOI(ap->a_vp); 1016 struct vattr *vap = ap->a_vap; 1017 1018 /* 1019 * We want the fsid to be different when accessing a filesystem 1020 * with different as-of's so programs like diff don't think 1021 * the files are the same. 
1022 * 1023 * We also want the fsid to be the same when comparing snapshots, 1024 * or when comparing mirrors (which might be backed by different 1025 * physical devices). HAMMER fsids are based on the PFS's 1026 * shared_uuid field. 1027 * 1028 * XXX there is a chance of collision here. The va_fsid reported 1029 * by stat is different from the more involved fsid used in the 1030 * mount structure. 1031 */ 1032 ++hammer_stats_file_iopsr; 1033 hammer_lock_sh(&ip->lock); 1034 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 1035 (u_int32_t)(ip->obj_asof >> 32); 1036 1037 vap->va_fileid = ip->ino_leaf.base.obj_id; 1038 vap->va_mode = ip->ino_data.mode; 1039 vap->va_nlink = ip->ino_data.nlinks; 1040 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1041 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1042 vap->va_rmajor = 0; 1043 vap->va_rminor = 0; 1044 vap->va_size = ip->ino_data.size; 1045 1046 /* 1047 * Special case for @@PFS softlinks. The actual size of the 1048 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1049 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1050 */ 1051 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1052 ip->ino_data.size == 10 && 1053 ip->obj_asof == HAMMER_MAX_TID && 1054 ip->obj_localization == 0 && 1055 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1056 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1057 vap->va_size = 26; 1058 else 1059 vap->va_size = 10; 1060 } 1061 1062 /* 1063 * We must provide a consistent atime and mtime for snapshots 1064 * so people can do a 'tar cf - ... | md5' on them and get 1065 * consistent results. 
1066 */ 1067 if (ip->flags & HAMMER_INODE_RO) { 1068 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1069 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1070 } else { 1071 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1072 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1073 } 1074 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1075 vap->va_flags = ip->ino_data.uflags; 1076 vap->va_gen = 1; /* hammer inums are unique for all time */ 1077 vap->va_blocksize = HAMMER_BUFSIZE; 1078 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1079 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1080 ~HAMMER_XBUFMASK64; 1081 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 1082 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1083 ~HAMMER_BUFMASK64; 1084 } else { 1085 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1086 } 1087 1088 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1089 vap->va_filerev = 0; /* XXX */ 1090 vap->va_uid_uuid = ip->ino_data.uid; 1091 vap->va_gid_uuid = ip->ino_data.gid; 1092 vap->va_fsid_uuid = ip->hmp->fsid; 1093 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1094 VA_FSID_UUID_VALID; 1095 1096 switch (ip->ino_data.obj_type) { 1097 case HAMMER_OBJTYPE_CDEV: 1098 case HAMMER_OBJTYPE_BDEV: 1099 vap->va_rmajor = ip->ino_data.rmajor; 1100 vap->va_rminor = ip->ino_data.rminor; 1101 break; 1102 default: 1103 break; 1104 } 1105 hammer_unlock(&ip->lock); 1106 return(0); 1107 } 1108 1109 /* 1110 * hammer_vop_nresolve { nch, dvp, cred } 1111 * 1112 * Locate the requested directory entry. 
1113 */ 1114 static 1115 int 1116 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1117 { 1118 struct hammer_transaction trans; 1119 struct namecache *ncp; 1120 hammer_mount_t hmp; 1121 hammer_inode_t dip; 1122 hammer_inode_t ip; 1123 hammer_tid_t asof; 1124 struct hammer_cursor cursor; 1125 struct vnode *vp; 1126 int64_t namekey; 1127 int error; 1128 int i; 1129 int nlen; 1130 int flags; 1131 int ispfs; 1132 int64_t obj_id; 1133 u_int32_t localization; 1134 u_int32_t max_iterations; 1135 1136 /* 1137 * Misc initialization, plus handle as-of name extensions. Look for 1138 * the '@@' extension. Note that as-of files and directories cannot 1139 * be modified. 1140 */ 1141 dip = VTOI(ap->a_dvp); 1142 ncp = ap->a_nch->ncp; 1143 asof = dip->obj_asof; 1144 localization = dip->obj_localization; /* for code consistency */ 1145 nlen = ncp->nc_nlen; 1146 flags = dip->flags & HAMMER_INODE_RO; 1147 ispfs = 0; 1148 hmp = dip->hmp; 1149 1150 lwkt_gettoken(&hmp->fs_token); 1151 hammer_simple_transaction(&trans, hmp); 1152 ++hammer_stats_file_iopsr; 1153 1154 for (i = 0; i < nlen; ++i) { 1155 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1156 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1157 &ispfs, &asof, &localization); 1158 if (error != 0) { 1159 i = nlen; 1160 break; 1161 } 1162 if (asof != HAMMER_MAX_TID) 1163 flags |= HAMMER_INODE_RO; 1164 break; 1165 } 1166 } 1167 nlen = i; 1168 1169 /* 1170 * If this is a PFS softlink we dive into the PFS 1171 */ 1172 if (ispfs && nlen == 0) { 1173 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1174 asof, localization, 1175 flags, &error); 1176 if (error == 0) { 1177 error = hammer_get_vnode(ip, &vp); 1178 hammer_rel_inode(ip, 0); 1179 } else { 1180 vp = NULL; 1181 } 1182 if (error == 0) { 1183 vn_unlock(vp); 1184 cache_setvp(ap->a_nch, vp); 1185 vrele(vp); 1186 } 1187 goto done; 1188 } 1189 1190 /* 1191 * If there is no path component the time extension is relative to dip. 1192 * e.g. 
"fubar/@@<snapshot>" 1193 * 1194 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1195 * e.g. "fubar/.@@<snapshot>" 1196 * 1197 * ".." is handled by the kernel. We do not currently handle 1198 * "..@<snapshot>". 1199 */ 1200 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1201 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1202 asof, dip->obj_localization, 1203 flags, &error); 1204 if (error == 0) { 1205 error = hammer_get_vnode(ip, &vp); 1206 hammer_rel_inode(ip, 0); 1207 } else { 1208 vp = NULL; 1209 } 1210 if (error == 0) { 1211 vn_unlock(vp); 1212 cache_setvp(ap->a_nch, vp); 1213 vrele(vp); 1214 } 1215 goto done; 1216 } 1217 1218 /* 1219 * Calculate the namekey and setup the key range for the scan. This 1220 * works kinda like a chained hash table where the lower 32 bits 1221 * of the namekey synthesize the chain. 1222 * 1223 * The key range is inclusive of both key_beg and key_end. 1224 */ 1225 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 1226 &max_iterations); 1227 1228 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1229 cursor.key_beg.localization = dip->obj_localization + 1230 hammer_dir_localization(dip); 1231 cursor.key_beg.obj_id = dip->obj_id; 1232 cursor.key_beg.key = namekey; 1233 cursor.key_beg.create_tid = 0; 1234 cursor.key_beg.delete_tid = 0; 1235 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1236 cursor.key_beg.obj_type = 0; 1237 1238 cursor.key_end = cursor.key_beg; 1239 cursor.key_end.key += max_iterations; 1240 cursor.asof = asof; 1241 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1242 1243 /* 1244 * Scan all matching records (the chain), locate the one matching 1245 * the requested path component. 1246 * 1247 * The hammer_ip_*() functions merge in-memory records with on-disk 1248 * records for the purposes of the search. 
1249 */ 1250 obj_id = 0; 1251 localization = HAMMER_DEF_LOCALIZATION; 1252 1253 if (error == 0) { 1254 error = hammer_ip_first(&cursor); 1255 while (error == 0) { 1256 error = hammer_ip_resolve_data(&cursor); 1257 if (error) 1258 break; 1259 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1260 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1261 obj_id = cursor.data->entry.obj_id; 1262 localization = cursor.data->entry.localization; 1263 break; 1264 } 1265 error = hammer_ip_next(&cursor); 1266 } 1267 } 1268 hammer_done_cursor(&cursor); 1269 1270 /* 1271 * Lookup the obj_id. This should always succeed. If it does not 1272 * the filesystem may be damaged and we return a dummy inode. 1273 */ 1274 if (error == 0) { 1275 ip = hammer_get_inode(&trans, dip, obj_id, 1276 asof, localization, 1277 flags, &error); 1278 if (error == ENOENT) { 1279 kprintf("HAMMER: WARNING: Missing " 1280 "inode for dirent \"%s\"\n" 1281 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n", 1282 ncp->nc_name, 1283 (long long)obj_id, (long long)asof, 1284 localization); 1285 error = 0; 1286 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1287 asof, localization, 1288 flags, &error); 1289 } 1290 if (error == 0) { 1291 error = hammer_get_vnode(ip, &vp); 1292 hammer_rel_inode(ip, 0); 1293 } else { 1294 vp = NULL; 1295 } 1296 if (error == 0) { 1297 vn_unlock(vp); 1298 cache_setvp(ap->a_nch, vp); 1299 vrele(vp); 1300 } 1301 } else if (error == ENOENT) { 1302 cache_setvp(ap->a_nch, NULL); 1303 } 1304 done: 1305 hammer_done_transaction(&trans); 1306 lwkt_reltoken(&hmp->fs_token); 1307 return (error); 1308 } 1309 1310 /* 1311 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1312 * 1313 * Locate the parent directory of a directory vnode. 1314 * 1315 * dvp is referenced but not locked. *vpp must be returned referenced and 1316 * locked. 
A parent_obj_id of 0 does not necessarily indicate that we are 1317 * at the root, instead it could indicate that the directory we were in was 1318 * removed. 1319 * 1320 * NOTE: as-of sequences are not linked into the directory structure. If 1321 * we are at the root with a different asof then the mount point, reload 1322 * the same directory with the mount point's asof. I'm not sure what this 1323 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1324 * get confused, but it hasn't been tested. 1325 */ 1326 static 1327 int 1328 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1329 { 1330 struct hammer_transaction trans; 1331 struct hammer_inode *dip; 1332 struct hammer_inode *ip; 1333 hammer_mount_t hmp; 1334 int64_t parent_obj_id; 1335 u_int32_t parent_obj_localization; 1336 hammer_tid_t asof; 1337 int error; 1338 1339 dip = VTOI(ap->a_dvp); 1340 asof = dip->obj_asof; 1341 hmp = dip->hmp; 1342 1343 /* 1344 * Whos are parent? This could be the root of a pseudo-filesystem 1345 * whos parent is in another localization domain. 
1346 */ 1347 lwkt_gettoken(&hmp->fs_token); 1348 parent_obj_id = dip->ino_data.parent_obj_id; 1349 if (dip->obj_id == HAMMER_OBJID_ROOT) 1350 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1351 else 1352 parent_obj_localization = dip->obj_localization; 1353 1354 if (parent_obj_id == 0) { 1355 if (dip->obj_id == HAMMER_OBJID_ROOT && 1356 asof != hmp->asof) { 1357 parent_obj_id = dip->obj_id; 1358 asof = hmp->asof; 1359 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1360 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1361 (long long)dip->obj_asof); 1362 } else { 1363 *ap->a_vpp = NULL; 1364 lwkt_reltoken(&hmp->fs_token); 1365 return ENOENT; 1366 } 1367 } 1368 1369 hammer_simple_transaction(&trans, hmp); 1370 ++hammer_stats_file_iopsr; 1371 1372 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1373 asof, parent_obj_localization, 1374 dip->flags, &error); 1375 if (ip) { 1376 error = hammer_get_vnode(ip, ap->a_vpp); 1377 hammer_rel_inode(ip, 0); 1378 } else { 1379 *ap->a_vpp = NULL; 1380 } 1381 hammer_done_transaction(&trans); 1382 lwkt_reltoken(&hmp->fs_token); 1383 return (error); 1384 } 1385 1386 /* 1387 * hammer_vop_nlink { nch, dvp, vp, cred } 1388 */ 1389 static 1390 int 1391 hammer_vop_nlink(struct vop_nlink_args *ap) 1392 { 1393 struct hammer_transaction trans; 1394 struct hammer_inode *dip; 1395 struct hammer_inode *ip; 1396 struct nchandle *nch; 1397 hammer_mount_t hmp; 1398 int error; 1399 1400 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1401 return(EXDEV); 1402 1403 nch = ap->a_nch; 1404 dip = VTOI(ap->a_dvp); 1405 ip = VTOI(ap->a_vp); 1406 hmp = dip->hmp; 1407 1408 if (dip->obj_localization != ip->obj_localization) 1409 return(EXDEV); 1410 1411 if (dip->flags & HAMMER_INODE_RO) 1412 return (EROFS); 1413 if (ip->flags & HAMMER_INODE_RO) 1414 return (EROFS); 1415 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1416 return (error); 1417 1418 /* 1419 * Create a transaction to cover the operations we perform. 
1420 */ 1421 lwkt_gettoken(&hmp->fs_token); 1422 hammer_start_transaction(&trans, hmp); 1423 ++hammer_stats_file_iopsw; 1424 1425 /* 1426 * Add the filesystem object to the directory. Note that neither 1427 * dip nor ip are referenced or locked, but their vnodes are 1428 * referenced. This function will bump the inode's link count. 1429 */ 1430 error = hammer_ip_add_directory(&trans, dip, 1431 nch->ncp->nc_name, nch->ncp->nc_nlen, 1432 ip); 1433 1434 /* 1435 * Finish up. 1436 */ 1437 if (error == 0) { 1438 cache_setunresolved(nch); 1439 cache_setvp(nch, ap->a_vp); 1440 } 1441 hammer_done_transaction(&trans); 1442 hammer_knote(ap->a_vp, NOTE_LINK); 1443 hammer_knote(ap->a_dvp, NOTE_WRITE); 1444 lwkt_reltoken(&hmp->fs_token); 1445 return (error); 1446 } 1447 1448 /* 1449 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1450 * 1451 * The operating system has already ensured that the directory entry 1452 * does not exist and done all appropriate namespace locking. 1453 */ 1454 static 1455 int 1456 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1457 { 1458 struct hammer_transaction trans; 1459 struct hammer_inode *dip; 1460 struct hammer_inode *nip; 1461 struct nchandle *nch; 1462 hammer_mount_t hmp; 1463 int error; 1464 1465 nch = ap->a_nch; 1466 dip = VTOI(ap->a_dvp); 1467 hmp = dip->hmp; 1468 1469 if (dip->flags & HAMMER_INODE_RO) 1470 return (EROFS); 1471 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1472 return (error); 1473 1474 /* 1475 * Create a transaction to cover the operations we perform. 1476 */ 1477 lwkt_gettoken(&hmp->fs_token); 1478 hammer_start_transaction(&trans, hmp); 1479 ++hammer_stats_file_iopsw; 1480 1481 /* 1482 * Create a new filesystem object of the requested type. The 1483 * returned inode will be referenced but not locked. 
1484 */ 1485 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1486 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1487 NULL, &nip); 1488 if (error) { 1489 hkprintf("hammer_mkdir error %d\n", error); 1490 hammer_done_transaction(&trans); 1491 *ap->a_vpp = NULL; 1492 lwkt_reltoken(&hmp->fs_token); 1493 return (error); 1494 } 1495 /* 1496 * Add the new filesystem object to the directory. This will also 1497 * bump the inode's link count. 1498 */ 1499 error = hammer_ip_add_directory(&trans, dip, 1500 nch->ncp->nc_name, nch->ncp->nc_nlen, 1501 nip); 1502 if (error) 1503 hkprintf("hammer_mkdir (add) error %d\n", error); 1504 1505 /* 1506 * Finish up. 1507 */ 1508 if (error) { 1509 hammer_rel_inode(nip, 0); 1510 *ap->a_vpp = NULL; 1511 } else { 1512 error = hammer_get_vnode(nip, ap->a_vpp); 1513 hammer_rel_inode(nip, 0); 1514 if (error == 0) { 1515 cache_setunresolved(ap->a_nch); 1516 cache_setvp(ap->a_nch, *ap->a_vpp); 1517 } 1518 } 1519 hammer_done_transaction(&trans); 1520 if (error == 0) 1521 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1522 lwkt_reltoken(&hmp->fs_token); 1523 return (error); 1524 } 1525 1526 /* 1527 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1528 * 1529 * The operating system has already ensured that the directory entry 1530 * does not exist and done all appropriate namespace locking. 1531 */ 1532 static 1533 int 1534 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1535 { 1536 struct hammer_transaction trans; 1537 struct hammer_inode *dip; 1538 struct hammer_inode *nip; 1539 struct nchandle *nch; 1540 hammer_mount_t hmp; 1541 int error; 1542 1543 nch = ap->a_nch; 1544 dip = VTOI(ap->a_dvp); 1545 hmp = dip->hmp; 1546 1547 if (dip->flags & HAMMER_INODE_RO) 1548 return (EROFS); 1549 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1550 return (error); 1551 1552 /* 1553 * Create a transaction to cover the operations we perform. 
1554 */ 1555 lwkt_gettoken(&hmp->fs_token); 1556 hammer_start_transaction(&trans, hmp); 1557 ++hammer_stats_file_iopsw; 1558 1559 /* 1560 * Create a new filesystem object of the requested type. The 1561 * returned inode will be referenced but not locked. 1562 * 1563 * If mknod specifies a directory a pseudo-fs is created. 1564 */ 1565 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1566 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1567 NULL, &nip); 1568 if (error) { 1569 hammer_done_transaction(&trans); 1570 *ap->a_vpp = NULL; 1571 lwkt_reltoken(&hmp->fs_token); 1572 return (error); 1573 } 1574 1575 /* 1576 * Add the new filesystem object to the directory. This will also 1577 * bump the inode's link count. 1578 */ 1579 error = hammer_ip_add_directory(&trans, dip, 1580 nch->ncp->nc_name, nch->ncp->nc_nlen, 1581 nip); 1582 1583 /* 1584 * Finish up. 1585 */ 1586 if (error) { 1587 hammer_rel_inode(nip, 0); 1588 *ap->a_vpp = NULL; 1589 } else { 1590 error = hammer_get_vnode(nip, ap->a_vpp); 1591 hammer_rel_inode(nip, 0); 1592 if (error == 0) { 1593 cache_setunresolved(ap->a_nch); 1594 cache_setvp(ap->a_nch, *ap->a_vpp); 1595 } 1596 } 1597 hammer_done_transaction(&trans); 1598 if (error == 0) 1599 hammer_knote(ap->a_dvp, NOTE_WRITE); 1600 lwkt_reltoken(&hmp->fs_token); 1601 return (error); 1602 } 1603 1604 /* 1605 * hammer_vop_open { vp, mode, cred, fp } 1606 * 1607 * MPSAFE (does not require fs_token) 1608 */ 1609 static 1610 int 1611 hammer_vop_open(struct vop_open_args *ap) 1612 { 1613 hammer_inode_t ip; 1614 1615 ++hammer_stats_file_iopsr; 1616 ip = VTOI(ap->a_vp); 1617 1618 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1619 return (EROFS); 1620 return(vop_stdopen(ap)); 1621 } 1622 1623 /* 1624 * hammer_vop_print { vp } 1625 */ 1626 static 1627 int 1628 hammer_vop_print(struct vop_print_args *ap) 1629 { 1630 return EOPNOTSUPP; 1631 } 1632 1633 /* 1634 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1635 */ 1636 
static 1637 int 1638 hammer_vop_readdir(struct vop_readdir_args *ap) 1639 { 1640 struct hammer_transaction trans; 1641 struct hammer_cursor cursor; 1642 struct hammer_inode *ip; 1643 hammer_mount_t hmp; 1644 struct uio *uio; 1645 hammer_base_elm_t base; 1646 int error; 1647 int cookie_index; 1648 int ncookies; 1649 off_t *cookies; 1650 off_t saveoff; 1651 int r; 1652 int dtype; 1653 1654 ++hammer_stats_file_iopsr; 1655 ip = VTOI(ap->a_vp); 1656 uio = ap->a_uio; 1657 saveoff = uio->uio_offset; 1658 hmp = ip->hmp; 1659 1660 if (ap->a_ncookies) { 1661 ncookies = uio->uio_resid / 16 + 1; 1662 if (ncookies > 1024) 1663 ncookies = 1024; 1664 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1665 cookie_index = 0; 1666 } else { 1667 ncookies = -1; 1668 cookies = NULL; 1669 cookie_index = 0; 1670 } 1671 1672 lwkt_gettoken(&hmp->fs_token); 1673 hammer_simple_transaction(&trans, hmp); 1674 1675 /* 1676 * Handle artificial entries 1677 * 1678 * It should be noted that the minimum value for a directory 1679 * hash key on-media is 0x0000000100000000, so we can use anything 1680 * less then that to represent our 'special' key space. 1681 */ 1682 error = 0; 1683 if (saveoff == 0) { 1684 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1685 if (r) 1686 goto done; 1687 if (cookies) 1688 cookies[cookie_index] = saveoff; 1689 ++saveoff; 1690 ++cookie_index; 1691 if (cookie_index == ncookies) 1692 goto done; 1693 } 1694 if (saveoff == 1) { 1695 if (ip->ino_data.parent_obj_id) { 1696 r = vop_write_dirent(&error, uio, 1697 ip->ino_data.parent_obj_id, 1698 DT_DIR, 2, ".."); 1699 } else { 1700 r = vop_write_dirent(&error, uio, 1701 ip->obj_id, DT_DIR, 2, ".."); 1702 } 1703 if (r) 1704 goto done; 1705 if (cookies) 1706 cookies[cookie_index] = saveoff; 1707 ++saveoff; 1708 ++cookie_index; 1709 if (cookie_index == ncookies) 1710 goto done; 1711 } 1712 1713 /* 1714 * Key range (begin and end inclusive) to scan. 
Directory keys 1715 * directly translate to a 64 bit 'seek' position. 1716 */ 1717 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1718 cursor.key_beg.localization = ip->obj_localization + 1719 hammer_dir_localization(ip); 1720 cursor.key_beg.obj_id = ip->obj_id; 1721 cursor.key_beg.create_tid = 0; 1722 cursor.key_beg.delete_tid = 0; 1723 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1724 cursor.key_beg.obj_type = 0; 1725 cursor.key_beg.key = saveoff; 1726 1727 cursor.key_end = cursor.key_beg; 1728 cursor.key_end.key = HAMMER_MAX_KEY; 1729 cursor.asof = ip->obj_asof; 1730 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1731 1732 error = hammer_ip_first(&cursor); 1733 1734 while (error == 0) { 1735 error = hammer_ip_resolve_data(&cursor); 1736 if (error) 1737 break; 1738 base = &cursor.leaf->base; 1739 saveoff = base->key; 1740 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1741 1742 if (base->obj_id != ip->obj_id) 1743 panic("readdir: bad record at %p", cursor.node); 1744 1745 /* 1746 * Convert pseudo-filesystems into softlinks 1747 */ 1748 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1749 r = vop_write_dirent( 1750 &error, uio, cursor.data->entry.obj_id, 1751 dtype, 1752 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1753 (void *)cursor.data->entry.name); 1754 if (r) 1755 break; 1756 ++saveoff; 1757 if (cookies) 1758 cookies[cookie_index] = base->key; 1759 ++cookie_index; 1760 if (cookie_index == ncookies) 1761 break; 1762 error = hammer_ip_next(&cursor); 1763 } 1764 hammer_done_cursor(&cursor); 1765 1766 done: 1767 hammer_done_transaction(&trans); 1768 1769 if (ap->a_eofflag) 1770 *ap->a_eofflag = (error == ENOENT); 1771 uio->uio_offset = saveoff; 1772 if (error && cookie_index == 0) { 1773 if (error == ENOENT) 1774 error = 0; 1775 if (cookies) { 1776 kfree(cookies, M_TEMP); 1777 *ap->a_ncookies = 0; 1778 *ap->a_cookies = NULL; 1779 } 1780 } else { 1781 if (error == ENOENT) 1782 error = 0; 1783 if (cookies) { 
1784 *ap->a_ncookies = cookie_index; 1785 *ap->a_cookies = cookies; 1786 } 1787 } 1788 lwkt_reltoken(&hmp->fs_token); 1789 return(error); 1790 } 1791 1792 /* 1793 * hammer_vop_readlink { vp, uio, cred } 1794 */ 1795 static 1796 int 1797 hammer_vop_readlink(struct vop_readlink_args *ap) 1798 { 1799 struct hammer_transaction trans; 1800 struct hammer_cursor cursor; 1801 struct hammer_inode *ip; 1802 hammer_mount_t hmp; 1803 char buf[32]; 1804 u_int32_t localization; 1805 hammer_pseudofs_inmem_t pfsm; 1806 int error; 1807 1808 ip = VTOI(ap->a_vp); 1809 hmp = ip->hmp; 1810 1811 lwkt_gettoken(&hmp->fs_token); 1812 1813 /* 1814 * Shortcut if the symlink data was stuffed into ino_data. 1815 * 1816 * Also expand special "@@PFS%05d" softlinks (expansion only 1817 * occurs for non-historical (current) accesses made from the 1818 * primary filesystem). 1819 */ 1820 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1821 char *ptr; 1822 int bytes; 1823 1824 ptr = ip->ino_data.ext.symlink; 1825 bytes = (int)ip->ino_data.size; 1826 if (bytes == 10 && 1827 ip->obj_asof == HAMMER_MAX_TID && 1828 ip->obj_localization == 0 && 1829 strncmp(ptr, "@@PFS", 5) == 0) { 1830 hammer_simple_transaction(&trans, hmp); 1831 bcopy(ptr + 5, buf, 5); 1832 buf[5] = 0; 1833 localization = strtoul(buf, NULL, 10) << 16; 1834 pfsm = hammer_load_pseudofs(&trans, localization, 1835 &error); 1836 if (error == 0) { 1837 if (pfsm->pfsd.mirror_flags & 1838 HAMMER_PFSD_SLAVE) { 1839 /* vap->va_size == 26 */ 1840 ksnprintf(buf, sizeof(buf), 1841 "@@0x%016llx:%05d", 1842 (long long)pfsm->pfsd.sync_end_tid, 1843 localization >> 16); 1844 } else { 1845 /* vap->va_size == 10 */ 1846 ksnprintf(buf, sizeof(buf), 1847 "@@-1:%05d", 1848 localization >> 16); 1849 #if 0 1850 ksnprintf(buf, sizeof(buf), 1851 "@@0x%016llx:%05d", 1852 (long long)HAMMER_MAX_TID, 1853 localization >> 16); 1854 #endif 1855 } 1856 ptr = buf; 1857 bytes = strlen(buf); 1858 } 1859 if (pfsm) 1860 hammer_rel_pseudofs(hmp, pfsm); 1861 
hammer_done_transaction(&trans); 1862 } 1863 error = uiomove(ptr, bytes, ap->a_uio); 1864 lwkt_reltoken(&hmp->fs_token); 1865 return(error); 1866 } 1867 1868 /* 1869 * Long version 1870 */ 1871 hammer_simple_transaction(&trans, hmp); 1872 ++hammer_stats_file_iopsr; 1873 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1874 1875 /* 1876 * Key range (begin and end inclusive) to scan. Directory keys 1877 * directly translate to a 64 bit 'seek' position. 1878 */ 1879 cursor.key_beg.localization = ip->obj_localization + 1880 HAMMER_LOCALIZE_MISC; 1881 cursor.key_beg.obj_id = ip->obj_id; 1882 cursor.key_beg.create_tid = 0; 1883 cursor.key_beg.delete_tid = 0; 1884 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1885 cursor.key_beg.obj_type = 0; 1886 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1887 cursor.asof = ip->obj_asof; 1888 cursor.flags |= HAMMER_CURSOR_ASOF; 1889 1890 error = hammer_ip_lookup(&cursor); 1891 if (error == 0) { 1892 error = hammer_ip_resolve_data(&cursor); 1893 if (error == 0) { 1894 KKASSERT(cursor.leaf->data_len >= 1895 HAMMER_SYMLINK_NAME_OFF); 1896 error = uiomove(cursor.data->symlink.name, 1897 cursor.leaf->data_len - 1898 HAMMER_SYMLINK_NAME_OFF, 1899 ap->a_uio); 1900 } 1901 } 1902 hammer_done_cursor(&cursor); 1903 hammer_done_transaction(&trans); 1904 lwkt_reltoken(&hmp->fs_token); 1905 return(error); 1906 } 1907 1908 /* 1909 * hammer_vop_nremove { nch, dvp, cred } 1910 */ 1911 static 1912 int 1913 hammer_vop_nremove(struct vop_nremove_args *ap) 1914 { 1915 struct hammer_transaction trans; 1916 struct hammer_inode *dip; 1917 hammer_mount_t hmp; 1918 int error; 1919 1920 dip = VTOI(ap->a_dvp); 1921 hmp = dip->hmp; 1922 1923 if (hammer_nohistory(dip) == 0 && 1924 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1925 return (error); 1926 } 1927 1928 lwkt_gettoken(&hmp->fs_token); 1929 hammer_start_transaction(&trans, hmp); 1930 ++hammer_stats_file_iopsw; 1931 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 
0); 1932 hammer_done_transaction(&trans); 1933 if (error == 0) 1934 hammer_knote(ap->a_dvp, NOTE_WRITE); 1935 lwkt_reltoken(&hmp->fs_token); 1936 return (error); 1937 } 1938 1939 /* 1940 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1941 */ 1942 static 1943 int 1944 hammer_vop_nrename(struct vop_nrename_args *ap) 1945 { 1946 struct hammer_transaction trans; 1947 struct namecache *fncp; 1948 struct namecache *tncp; 1949 struct hammer_inode *fdip; 1950 struct hammer_inode *tdip; 1951 struct hammer_inode *ip; 1952 hammer_mount_t hmp; 1953 struct hammer_cursor cursor; 1954 int64_t namekey; 1955 u_int32_t max_iterations; 1956 int nlen, error; 1957 1958 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1959 return(EXDEV); 1960 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1961 return(EXDEV); 1962 1963 fdip = VTOI(ap->a_fdvp); 1964 tdip = VTOI(ap->a_tdvp); 1965 fncp = ap->a_fnch->ncp; 1966 tncp = ap->a_tnch->ncp; 1967 ip = VTOI(fncp->nc_vp); 1968 KKASSERT(ip != NULL); 1969 1970 hmp = ip->hmp; 1971 1972 if (fdip->obj_localization != tdip->obj_localization) 1973 return(EXDEV); 1974 if (fdip->obj_localization != ip->obj_localization) 1975 return(EXDEV); 1976 1977 if (fdip->flags & HAMMER_INODE_RO) 1978 return (EROFS); 1979 if (tdip->flags & HAMMER_INODE_RO) 1980 return (EROFS); 1981 if (ip->flags & HAMMER_INODE_RO) 1982 return (EROFS); 1983 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1984 return (error); 1985 1986 lwkt_gettoken(&hmp->fs_token); 1987 hammer_start_transaction(&trans, hmp); 1988 ++hammer_stats_file_iopsw; 1989 1990 /* 1991 * Remove tncp from the target directory and then link ip as 1992 * tncp. XXX pass trans to dounlink 1993 * 1994 * Force the inode sync-time to match the transaction so it is 1995 * in-sync with the creation of the target directory entry. 
1996 */ 1997 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1998 ap->a_cred, 0, -1); 1999 if (error == 0 || error == ENOENT) { 2000 error = hammer_ip_add_directory(&trans, tdip, 2001 tncp->nc_name, tncp->nc_nlen, 2002 ip); 2003 if (error == 0) { 2004 ip->ino_data.parent_obj_id = tdip->obj_id; 2005 ip->ino_data.ctime = trans.time; 2006 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 2007 } 2008 } 2009 if (error) 2010 goto failed; /* XXX */ 2011 2012 /* 2013 * Locate the record in the originating directory and remove it. 2014 * 2015 * Calculate the namekey and setup the key range for the scan. This 2016 * works kinda like a chained hash table where the lower 32 bits 2017 * of the namekey synthesize the chain. 2018 * 2019 * The key range is inclusive of both key_beg and key_end. 2020 */ 2021 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 2022 &max_iterations); 2023 retry: 2024 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 2025 cursor.key_beg.localization = fdip->obj_localization + 2026 hammer_dir_localization(fdip); 2027 cursor.key_beg.obj_id = fdip->obj_id; 2028 cursor.key_beg.key = namekey; 2029 cursor.key_beg.create_tid = 0; 2030 cursor.key_beg.delete_tid = 0; 2031 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2032 cursor.key_beg.obj_type = 0; 2033 2034 cursor.key_end = cursor.key_beg; 2035 cursor.key_end.key += max_iterations; 2036 cursor.asof = fdip->obj_asof; 2037 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2038 2039 /* 2040 * Scan all matching records (the chain), locate the one matching 2041 * the requested path component. 2042 * 2043 * The hammer_ip_*() functions merge in-memory records with on-disk 2044 * records for the purposes of the search. 
2045 */ 2046 error = hammer_ip_first(&cursor); 2047 while (error == 0) { 2048 if (hammer_ip_resolve_data(&cursor) != 0) 2049 break; 2050 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2051 KKASSERT(nlen > 0); 2052 if (fncp->nc_nlen == nlen && 2053 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2054 break; 2055 } 2056 error = hammer_ip_next(&cursor); 2057 } 2058 2059 /* 2060 * If all is ok we have to get the inode so we can adjust nlinks. 2061 * 2062 * WARNING: hammer_ip_del_directory() may have to terminate the 2063 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2064 * twice. 2065 */ 2066 if (error == 0) 2067 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 2068 2069 /* 2070 * XXX A deadlock here will break rename's atomicy for the purposes 2071 * of crash recovery. 2072 */ 2073 if (error == EDEADLK) { 2074 hammer_done_cursor(&cursor); 2075 goto retry; 2076 } 2077 2078 /* 2079 * Cleanup and tell the kernel that the rename succeeded. 2080 * 2081 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2082 * without formally acquiring the vp since the vp might 2083 * have zero refs on it, or in the middle of a reclaim, 2084 * etc. 
2085 */ 2086 hammer_done_cursor(&cursor); 2087 if (error == 0) { 2088 cache_rename(ap->a_fnch, ap->a_tnch); 2089 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2090 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2091 while (ip->vp) { 2092 struct vnode *vp; 2093 2094 error = hammer_get_vnode(ip, &vp); 2095 if (error == 0 && vp) { 2096 vn_unlock(vp); 2097 hammer_knote(ip->vp, NOTE_RENAME); 2098 vrele(vp); 2099 break; 2100 } 2101 kprintf("Debug: HAMMER ip/vp race2 avoided\n"); 2102 } 2103 } 2104 2105 failed: 2106 hammer_done_transaction(&trans); 2107 lwkt_reltoken(&hmp->fs_token); 2108 return (error); 2109 } 2110 2111 /* 2112 * hammer_vop_nrmdir { nch, dvp, cred } 2113 */ 2114 static 2115 int 2116 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2117 { 2118 struct hammer_transaction trans; 2119 struct hammer_inode *dip; 2120 hammer_mount_t hmp; 2121 int error; 2122 2123 dip = VTOI(ap->a_dvp); 2124 hmp = dip->hmp; 2125 2126 if (hammer_nohistory(dip) == 0 && 2127 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2128 return (error); 2129 } 2130 2131 lwkt_gettoken(&hmp->fs_token); 2132 hammer_start_transaction(&trans, hmp); 2133 ++hammer_stats_file_iopsw; 2134 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2135 hammer_done_transaction(&trans); 2136 if (error == 0) 2137 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2138 lwkt_reltoken(&hmp->fs_token); 2139 return (error); 2140 } 2141 2142 /* 2143 * hammer_vop_markatime { vp, cred } 2144 */ 2145 static 2146 int 2147 hammer_vop_markatime(struct vop_markatime_args *ap) 2148 { 2149 struct hammer_transaction trans; 2150 struct hammer_inode *ip; 2151 hammer_mount_t hmp; 2152 2153 ip = VTOI(ap->a_vp); 2154 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2155 return (EROFS); 2156 if (ip->flags & HAMMER_INODE_RO) 2157 return (EROFS); 2158 hmp = ip->hmp; 2159 if (hmp->mp->mnt_flag & MNT_NOATIME) 2160 return (0); 2161 lwkt_gettoken(&hmp->fs_token); 2162 hammer_start_transaction(&trans, hmp); 2163 
++hammer_stats_file_iopsw; 2164 2165 ip->ino_data.atime = trans.time; 2166 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2167 hammer_done_transaction(&trans); 2168 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2169 lwkt_reltoken(&hmp->fs_token); 2170 return (0); 2171 } 2172 2173 /* 2174 * hammer_vop_setattr { vp, vap, cred } 2175 */ 2176 static 2177 int 2178 hammer_vop_setattr(struct vop_setattr_args *ap) 2179 { 2180 struct hammer_transaction trans; 2181 struct hammer_inode *ip; 2182 struct vattr *vap; 2183 hammer_mount_t hmp; 2184 int modflags; 2185 int error; 2186 int truncating; 2187 int blksize; 2188 int kflags; 2189 #if 0 2190 int64_t aligned_size; 2191 #endif 2192 u_int32_t flags; 2193 2194 vap = ap->a_vap; 2195 ip = ap->a_vp->v_data; 2196 modflags = 0; 2197 kflags = 0; 2198 hmp = ip->hmp; 2199 2200 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2201 return(EROFS); 2202 if (ip->flags & HAMMER_INODE_RO) 2203 return (EROFS); 2204 if (hammer_nohistory(ip) == 0 && 2205 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2206 return (error); 2207 } 2208 2209 lwkt_gettoken(&hmp->fs_token); 2210 hammer_start_transaction(&trans, hmp); 2211 ++hammer_stats_file_iopsw; 2212 error = 0; 2213 2214 if (vap->va_flags != VNOVAL) { 2215 flags = ip->ino_data.uflags; 2216 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2217 hammer_to_unix_xid(&ip->ino_data.uid), 2218 ap->a_cred); 2219 if (error == 0) { 2220 if (ip->ino_data.uflags != flags) { 2221 ip->ino_data.uflags = flags; 2222 ip->ino_data.ctime = trans.time; 2223 modflags |= HAMMER_INODE_DDIRTY; 2224 kflags |= NOTE_ATTRIB; 2225 } 2226 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2227 error = 0; 2228 goto done; 2229 } 2230 } 2231 goto done; 2232 } 2233 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2234 error = EPERM; 2235 goto done; 2236 } 2237 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2238 mode_t cur_mode = ip->ino_data.mode; 2239 uid_t cur_uid = 
hammer_to_unix_xid(&ip->ino_data.uid); 2240 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2241 uuid_t uuid_uid; 2242 uuid_t uuid_gid; 2243 2244 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2245 ap->a_cred, 2246 &cur_uid, &cur_gid, &cur_mode); 2247 if (error == 0) { 2248 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2249 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2250 if (bcmp(&uuid_uid, &ip->ino_data.uid, 2251 sizeof(uuid_uid)) || 2252 bcmp(&uuid_gid, &ip->ino_data.gid, 2253 sizeof(uuid_gid)) || 2254 ip->ino_data.mode != cur_mode 2255 ) { 2256 ip->ino_data.uid = uuid_uid; 2257 ip->ino_data.gid = uuid_gid; 2258 ip->ino_data.mode = cur_mode; 2259 ip->ino_data.ctime = trans.time; 2260 modflags |= HAMMER_INODE_DDIRTY; 2261 } 2262 kflags |= NOTE_ATTRIB; 2263 } 2264 } 2265 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2266 switch(ap->a_vp->v_type) { 2267 case VREG: 2268 if (vap->va_size == ip->ino_data.size) 2269 break; 2270 2271 /* 2272 * Log the operation if in fast-fsync mode or if 2273 * there are unterminated redo write records present. 2274 * 2275 * The second check is needed so the recovery code 2276 * properly truncates write redos even if nominal 2277 * REDO operations is turned off due to excessive 2278 * writes, because the related records might be 2279 * destroyed and never lay down a TERM_WRITE. 2280 */ 2281 if ((ip->flags & HAMMER_INODE_REDO) || 2282 (ip->flags & HAMMER_INODE_RDIRTY)) { 2283 error = hammer_generate_redo(&trans, ip, 2284 vap->va_size, 2285 HAMMER_REDO_TRUNC, 2286 NULL, 0); 2287 } 2288 blksize = hammer_blocksize(vap->va_size); 2289 2290 /* 2291 * XXX break atomicy, we can deadlock the backend 2292 * if we do not release the lock. Probably not a 2293 * big deal here. 
2294 */ 2295 if (vap->va_size < ip->ino_data.size) { 2296 nvtruncbuf(ap->a_vp, vap->va_size, 2297 blksize, 2298 hammer_blockoff(vap->va_size), 2299 0); 2300 truncating = 1; 2301 kflags |= NOTE_WRITE; 2302 } else { 2303 nvextendbuf(ap->a_vp, 2304 ip->ino_data.size, 2305 vap->va_size, 2306 hammer_blocksize(ip->ino_data.size), 2307 hammer_blocksize(vap->va_size), 2308 hammer_blockoff(ip->ino_data.size), 2309 hammer_blockoff(vap->va_size), 2310 0); 2311 truncating = 0; 2312 kflags |= NOTE_WRITE | NOTE_EXTEND; 2313 } 2314 ip->ino_data.size = vap->va_size; 2315 ip->ino_data.mtime = trans.time; 2316 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2317 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2318 2319 /* 2320 * On-media truncation is cached in the inode until 2321 * the inode is synchronized. We must immediately 2322 * handle any frontend records. 2323 */ 2324 if (truncating) { 2325 hammer_ip_frontend_trunc(ip, vap->va_size); 2326 #ifdef DEBUG_TRUNCATE 2327 if (HammerTruncIp == NULL) 2328 HammerTruncIp = ip; 2329 #endif 2330 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2331 ip->flags |= HAMMER_INODE_TRUNCATED; 2332 ip->trunc_off = vap->va_size; 2333 hammer_inode_dirty(ip); 2334 #ifdef DEBUG_TRUNCATE 2335 if (ip == HammerTruncIp) 2336 kprintf("truncate1 %016llx\n", 2337 (long long)ip->trunc_off); 2338 #endif 2339 } else if (ip->trunc_off > vap->va_size) { 2340 ip->trunc_off = vap->va_size; 2341 #ifdef DEBUG_TRUNCATE 2342 if (ip == HammerTruncIp) 2343 kprintf("truncate2 %016llx\n", 2344 (long long)ip->trunc_off); 2345 #endif 2346 } else { 2347 #ifdef DEBUG_TRUNCATE 2348 if (ip == HammerTruncIp) 2349 kprintf("truncate3 %016llx (ignored)\n", 2350 (long long)vap->va_size); 2351 #endif 2352 } 2353 } 2354 2355 #if 0 2356 /* 2357 * When truncating, nvtruncbuf() may have cleaned out 2358 * a portion of the last block on-disk in the buffer 2359 * cache. We must clean out any frontend records 2360 * for blocks beyond the new last block. 
2361 */ 2362 aligned_size = (vap->va_size + (blksize - 1)) & 2363 ~(int64_t)(blksize - 1); 2364 if (truncating && vap->va_size < aligned_size) { 2365 aligned_size -= blksize; 2366 hammer_ip_frontend_trunc(ip, aligned_size); 2367 } 2368 #endif 2369 break; 2370 case VDATABASE: 2371 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2372 ip->flags |= HAMMER_INODE_TRUNCATED; 2373 ip->trunc_off = vap->va_size; 2374 hammer_inode_dirty(ip); 2375 } else if (ip->trunc_off > vap->va_size) { 2376 ip->trunc_off = vap->va_size; 2377 } 2378 hammer_ip_frontend_trunc(ip, vap->va_size); 2379 ip->ino_data.size = vap->va_size; 2380 ip->ino_data.mtime = trans.time; 2381 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2382 kflags |= NOTE_ATTRIB; 2383 break; 2384 default: 2385 error = EINVAL; 2386 goto done; 2387 } 2388 break; 2389 } 2390 if (vap->va_atime.tv_sec != VNOVAL) { 2391 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2392 modflags |= HAMMER_INODE_ATIME; 2393 kflags |= NOTE_ATTRIB; 2394 } 2395 if (vap->va_mtime.tv_sec != VNOVAL) { 2396 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2397 modflags |= HAMMER_INODE_MTIME; 2398 kflags |= NOTE_ATTRIB; 2399 } 2400 if (vap->va_mode != (mode_t)VNOVAL) { 2401 mode_t cur_mode = ip->ino_data.mode; 2402 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2403 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2404 2405 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2406 cur_uid, cur_gid, &cur_mode); 2407 if (error == 0 && ip->ino_data.mode != cur_mode) { 2408 ip->ino_data.mode = cur_mode; 2409 ip->ino_data.ctime = trans.time; 2410 modflags |= HAMMER_INODE_DDIRTY; 2411 kflags |= NOTE_ATTRIB; 2412 } 2413 } 2414 done: 2415 if (error == 0) 2416 hammer_modify_inode(&trans, ip, modflags); 2417 hammer_done_transaction(&trans); 2418 hammer_knote(ap->a_vp, kflags); 2419 lwkt_reltoken(&hmp->fs_token); 2420 return (error); 2421 } 2422 2423 /* 2424 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, 
target } 2425 */ 2426 static 2427 int 2428 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2429 { 2430 struct hammer_transaction trans; 2431 struct hammer_inode *dip; 2432 struct hammer_inode *nip; 2433 hammer_record_t record; 2434 struct nchandle *nch; 2435 hammer_mount_t hmp; 2436 int error; 2437 int bytes; 2438 2439 ap->a_vap->va_type = VLNK; 2440 2441 nch = ap->a_nch; 2442 dip = VTOI(ap->a_dvp); 2443 hmp = dip->hmp; 2444 2445 if (dip->flags & HAMMER_INODE_RO) 2446 return (EROFS); 2447 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2448 return (error); 2449 2450 /* 2451 * Create a transaction to cover the operations we perform. 2452 */ 2453 lwkt_gettoken(&hmp->fs_token); 2454 hammer_start_transaction(&trans, hmp); 2455 ++hammer_stats_file_iopsw; 2456 2457 /* 2458 * Create a new filesystem object of the requested type. The 2459 * returned inode will be referenced but not locked. 2460 */ 2461 2462 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2463 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2464 NULL, &nip); 2465 if (error) { 2466 hammer_done_transaction(&trans); 2467 *ap->a_vpp = NULL; 2468 lwkt_reltoken(&hmp->fs_token); 2469 return (error); 2470 } 2471 2472 /* 2473 * Add a record representing the symlink. symlink stores the link 2474 * as pure data, not a string, and is no \0 terminated. 
2475 */ 2476 if (error == 0) { 2477 bytes = strlen(ap->a_target); 2478 2479 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2480 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2481 } else { 2482 record = hammer_alloc_mem_record(nip, bytes); 2483 record->type = HAMMER_MEM_RECORD_GENERAL; 2484 2485 record->leaf.base.localization = nip->obj_localization + 2486 HAMMER_LOCALIZE_MISC; 2487 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2488 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2489 record->leaf.data_len = bytes; 2490 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2491 bcopy(ap->a_target, record->data->symlink.name, bytes); 2492 error = hammer_ip_add_record(&trans, record); 2493 } 2494 2495 /* 2496 * Set the file size to the length of the link. 2497 */ 2498 if (error == 0) { 2499 nip->ino_data.size = bytes; 2500 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2501 } 2502 } 2503 if (error == 0) 2504 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2505 nch->ncp->nc_nlen, nip); 2506 2507 /* 2508 * Finish up. 
2509 */ 2510 if (error) { 2511 hammer_rel_inode(nip, 0); 2512 *ap->a_vpp = NULL; 2513 } else { 2514 error = hammer_get_vnode(nip, ap->a_vpp); 2515 hammer_rel_inode(nip, 0); 2516 if (error == 0) { 2517 cache_setunresolved(ap->a_nch); 2518 cache_setvp(ap->a_nch, *ap->a_vpp); 2519 hammer_knote(ap->a_dvp, NOTE_WRITE); 2520 } 2521 } 2522 hammer_done_transaction(&trans); 2523 lwkt_reltoken(&hmp->fs_token); 2524 return (error); 2525 } 2526 2527 /* 2528 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2529 */ 2530 static 2531 int 2532 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2533 { 2534 struct hammer_transaction trans; 2535 struct hammer_inode *dip; 2536 hammer_mount_t hmp; 2537 int error; 2538 2539 dip = VTOI(ap->a_dvp); 2540 hmp = dip->hmp; 2541 2542 if (hammer_nohistory(dip) == 0 && 2543 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2544 return (error); 2545 } 2546 2547 lwkt_gettoken(&hmp->fs_token); 2548 hammer_start_transaction(&trans, hmp); 2549 ++hammer_stats_file_iopsw; 2550 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2551 ap->a_cred, ap->a_flags, -1); 2552 hammer_done_transaction(&trans); 2553 lwkt_reltoken(&hmp->fs_token); 2554 2555 return (error); 2556 } 2557 2558 /* 2559 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2560 */ 2561 static 2562 int 2563 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2564 { 2565 struct hammer_inode *ip = ap->a_vp->v_data; 2566 hammer_mount_t hmp = ip->hmp; 2567 int error; 2568 2569 ++hammer_stats_file_iopsr; 2570 lwkt_gettoken(&hmp->fs_token); 2571 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2572 ap->a_fflag, ap->a_cred); 2573 lwkt_reltoken(&hmp->fs_token); 2574 return (error); 2575 } 2576 2577 static 2578 int 2579 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2580 { 2581 static const struct mountctl_opt extraopt[] = { 2582 { HMNT_NOHISTORY, "nohistory" }, 2583 { HMNT_MASTERID, "master" }, 2584 { 0, NULL} 2585 2586 }; 2587 struct hammer_mount *hmp; 2588 struct mount *mp; 2589 
int usedbytes; 2590 int error; 2591 2592 error = 0; 2593 usedbytes = 0; 2594 mp = ap->a_head.a_ops->head.vv_mount; 2595 KKASSERT(mp->mnt_data != NULL); 2596 hmp = (struct hammer_mount *)mp->mnt_data; 2597 2598 lwkt_gettoken(&hmp->fs_token); 2599 2600 switch(ap->a_op) { 2601 case MOUNTCTL_SET_EXPORT: 2602 if (ap->a_ctllen != sizeof(struct export_args)) 2603 error = EINVAL; 2604 else 2605 error = hammer_vfs_export(mp, ap->a_op, 2606 (const struct export_args *)ap->a_ctl); 2607 break; 2608 case MOUNTCTL_MOUNTFLAGS: 2609 { 2610 /* 2611 * Call standard mountctl VOP function 2612 * so we get user mount flags. 2613 */ 2614 error = vop_stdmountctl(ap); 2615 if (error) 2616 break; 2617 2618 usedbytes = *ap->a_res; 2619 2620 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2621 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2622 ap->a_buf, 2623 ap->a_buflen - usedbytes, 2624 &error); 2625 } 2626 2627 *ap->a_res += usedbytes; 2628 break; 2629 } 2630 default: 2631 error = vop_stdmountctl(ap); 2632 break; 2633 } 2634 lwkt_reltoken(&hmp->fs_token); 2635 return(error); 2636 } 2637 2638 /* 2639 * hammer_vop_strategy { vp, bio } 2640 * 2641 * Strategy call, used for regular file read & write only. Note that the 2642 * bp may represent a cluster. 2643 * 2644 * To simplify operation and allow better optimizations in the future, 2645 * this code does not make any assumptions with regards to buffer alignment 2646 * or size. 
*/
static
int
hammer_vop_strategy(struct vop_strategy_args *ap)
{
	struct buf *bp;
	int error;

	bp = ap->a_bio->bio_buf;

	/*
	 * Dispatch on the buffer command.  Anything other than a read
	 * or write is failed here: the buffer is flagged B_ERROR with
	 * EINVAL and completed via biodone().
	 */
	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer_vop_strategy_read(ap);
		break;
	case BUF_CMD_WRITE:
		error = hammer_vop_strategy_write(ap);
		break;
	default:
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		break;
	}

	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */

	return (error);
}

/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * Three ways out of this function exist:
 *
 *   1. Early shortcut: the pushed bio already carries a zone-2
 *	(HAMMER_ZONE_LARGE_DATA) offset and is handed straight to
 *	hammer_io_direct_read() or hammer_io_indirect_read().  No
 *	transaction or cursor is set up on this path.
 *   2. Whole-buffer record hit inside the scan loop (isdedupable):
 *	the read is handed to the direct/indirect read routine and
 *	control jumps to "done", skipping the biodone() issued by the
 *	copy path.
 *   3. Copy path: records are resolved and copied piecewise, gaps and
 *	the tail are zero-filled, and biodone() is called here.
 *
 * Paths 2 and 3 share the cleanup code after "done".
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;	/* current fill offset within bp->b_data */
	int roff;	/* offset into the current record's data */
	int n;		/* gap size, then bytes to take from the record */
	int isdedupable;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 *
	 * In cases where a lot of data duplication is present it may be
	 * more beneficial to drop through and double-buffer through the
	 * device.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		if (hammer_double_buffer == 0) {
			lwkt_gettoken(&hmp->fs_token);
			error = hammer_io_direct_read(hmp, nbio, NULL);
			lwkt_reltoken(&hmp->fs_token);
			return (error);
		}

		/*
		 * Try to shortcut requests for double_buffer mode too.
		 * Since this mode runs through the device buffer cache
		 * only compatible buffer sizes (meaning those generated
		 * by normal filesystem buffers) are legal.
		 */
		if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
			lwkt_gettoken(&hmp->fs_token);
			error = hammer_io_indirect_read(hmp, nbio, NULL);
			lwkt_reltoken(&hmp->fs_token);
			return (error);
		}
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		/*
		 * The end key is the end of the request plus MAXPHYS + 1.
		 * If that sum wraps (tmp64 < ran_end) the key is clamped
		 * to the largest positive 64-bit value instead.
		 */
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	/*
	 * Set NOSWAPCACHE for cursor data extraction if double buffering
	 * is disabled or (if the file is not marked cacheable via chflags
	 * and vm.swapcache_use_chflags is enabled).
	 */
	if (hammer_double_buffer == 0 ||
	    ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
	     vm_swapcache_use_chflags)) {
		cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
	}

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record versus our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 *
		 * n is <= 0 at this point, so roff = -n is how far into
		 * the record the current seek position lies.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)/* ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zeroed past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 *
		 * NOTE: disk_offset is only valid if the cursor data is
		 *	 on-disk.
		 *
		 * isdedupable is true only when this single on-disk record
		 * supplies the entire buffer (nothing filled yet and n
		 * equals the buffer size) and its disk offset has no
		 * HAMMER_BUFMASK bits set.
		 */
		disk_offset = cursor.leaf->data_offset + roff;
		isdedupable = (boff == 0 && n == bp->b_bufsize &&
			       hammer_cursor_ondisk(&cursor) &&
			       ((int)disk_offset & HAMMER_BUFMASK) == 0);

		if (isdedupable && hammer_double_buffer == 0) {
			/*
			 * Direct read case
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
			if (hammer_live_dedup && error == 0)
				hammer_dedup_cache_add(ip, cursor.leaf);
			goto done;
		} else if (isdedupable) {
			/*
			 * Async I/O case for reading from backing store
			 * and copying the data to the filesystem buffer.
			 * live-dedup has to verify the data anyway if it
			 * gets a hit later so we can just add the entry
			 * now.
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
			error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
			goto done;
		} else if (n) {
			/*
			 * Copy case.
			 *
			 * NOTE(review): isdedupable is always false in
			 * this branch (both branches above consume the
			 * true case), so the dedup-cache add below can
			 * never execute.
			 */
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				if (hammer_live_dedup && isdedupable)
					hammer_dedup_cache_add(ip, cursor.leaf);
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * We have to be sure that the only elements added to the
		 * dedup cache are those which are already on-media.
		 */
		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
			hammer_dedup_cache_add(ip, cursor.leaf);

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record.  ENOENT from
	 * the iterator is treated as a normal end of scan, not an error.
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Cleanup
	 */
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	/*
	 * Cache the b-tree node for the last data read in cache[1].
	 *
	 * If we hit the file EOF then also cache the node in the
	 * governing directory's cache[3], it will be used to initialize
	 * the inode's cache[1] for any inodes looked up via the directory.
	 *
	 * This doesn't reduce disk accesses since the B-Tree chain is
	 * likely cached, but it does reduce cpu overhead when looking
	 * up file offsets for cpdup/tar/cpio style iterations.
	 *
	 * NOTE(review): the cache[3] call below passes cursor.node
	 * without the NULL test used for cache[1]; presumably
	 * hammer_cache_node() tolerates a NULL node - confirm.
	 */
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	if (ran_end >= ip->ino_data.size) {
		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
					ip->obj_asof, ip->obj_localization);
		if (dip) {
			hammer_cache_node(&dip->cache[3], cursor.node);
			hammer_rel_inode(dip, 0);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * BMAP operation - used to support cluster_read() only.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 *
 * This routine may return EOPNOTSUPP if the operation is not supported for
 * the specified offset.  The contents of the pointer arguments do not
 * need to be initialized in that case.
 *
 * If a disk address is available and properly aligned return 0 with
 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
 * to the run-length relative to that offset.  Callers may assume that
 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
 * large, so return EOPNOTSUPP if it is not sufficiently large.
*/
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int64_t base_offset;		/* file offset where the current run starts */
	int64_t base_disk_offset;	/* disk offset matching base_offset */
	int64_t last_offset;		/* file offset just past the current run */
	hammer_off_t last_disk_offset;	/* disk offset matching last_offset */
	hammer_off_t disk_offset;
	int	rec_len;
	int	error;
	int	blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n",
		(long long)ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 *
	 * When the caller wants a backward run (a_runb) the scan starts
	 * up to MAXPHYS before the requested offset, clamped at key 0.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	/*
	 * End key: requested offset + 2 * MAXPHYS + 1, clamped to the
	 * largest positive 64-bit value if the addition wraps.
	 */
	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len    = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 *
		 * NOTE(review): error cannot be non-zero here; nothing
		 * assigns it between the loop test and this check.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;

			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
		}

		error = hammer_ip_next(&cursor);
	}

#if 0
	kprintf("BMAP %016llx:  %016llx - %016llx\n",
		(long long)ap->a_loffset,
		(long long)base_offset,
		(long long)last_offset);
	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
		(long long)base_disk_offset,
		(long long)last_disk_offset);
#endif

	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n",
			(long long)ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	/*
	 * The value left in error by the scan is not consulted below; the
	 * result is decided purely from the accumulated offsets.
	 *
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}

/*
 * Write to a regular file.   Because this is a strategy call the OS is
 * trying to actually get data onto the media.
*/
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize __debugvar;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/* The buffer must be exactly one HAMMER block for this offset. */
	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/*
	 * Read-only inode: fail the buffer with EROFS and complete it.
	 * This happens before fs_token is acquired.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 *
	 * The buffer is completed as if fully written (b_resid = 0) and
	 * 0 is returned; no data is queued.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		lwkt_reltoken(&hmp->fs_token);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end.
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffers
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 *
	 * Concretely: a write at offset 0 of a file no larger than
	 * HAMMER_BUFSIZE / 2 reserves the file size rounded up to a
	 * multiple of 16 bytes; every other write reserves b_bufsize.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;

	/*
	 * error is only assigned through this call; presumably
	 * hammer_ip_add_bulk() always stores a value in it, including 0
	 * on success - confirm, since it is returned unconditionally.
	 */
	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);

	/*
	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	 * in hammer_vop_write().  We must flag the record so the proper
	 * REDO_TERM_WRITE entry is generated during the flush.
	 */
	if (record) {
		if (bp->b_flags & B_VFSFLAG1) {
			record->flags |= HAMMER_RECF_REDO;
			bp->b_flags &= ~B_VFSFLAG1;
		}
		/*
		 * A record flagged HAMMER_RECF_DEDUPED is not written:
		 * the buffer is completed here.  Otherwise the bio is
		 * handed to hammer_io_direct_write().
		 */
		if (record->flags & HAMMER_RECF_DEDUPED) {
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(ap->a_bio);
		} else {
			hammer_io_direct_write(hmp, bio, record);
		}
		/* Flush the inode once the mount-wide record limit is exceeded. */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/* No record could be created: fail the buffer with error. */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * dounlink - disconnect a directory entry
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred,
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  info->last_error contains the
	 * error code on search termination and could be 0, ENOENT, or
	 * something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
3452 * To avoid a deadlock with the flusher we must release the inode 3453 * lock on the directory when acquiring the inode for the entry. 3454 * 3455 * If the target is a directory, it must be empty. 3456 */ 3457 if (error == 0) { 3458 hammer_unlock(&cursor.ip->lock); 3459 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3460 hmp->asof, 3461 cursor.data->entry.localization, 3462 0, &error); 3463 hammer_lock_sh(&cursor.ip->lock); 3464 if (error == ENOENT) { 3465 kprintf("HAMMER: WARNING: Removing " 3466 "dirent w/missing inode \"%s\"\n" 3467 "\tobj_id = %016llx\n", 3468 ncp->nc_name, 3469 (long long)cursor.data->entry.obj_id); 3470 error = 0; 3471 } 3472 3473 /* 3474 * If isdir >= 0 we validate that the entry is or is not a 3475 * directory. If isdir < 0 we don't care. 3476 */ 3477 if (error == 0 && isdir >= 0 && ip) { 3478 if (isdir && 3479 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3480 error = ENOTDIR; 3481 } else if (isdir == 0 && 3482 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3483 error = EISDIR; 3484 } 3485 } 3486 3487 /* 3488 * If we are trying to remove a directory the directory must 3489 * be empty. 3490 * 3491 * The check directory code can loop and deadlock/retry. Our 3492 * own cursor's node locks must be released to avoid a 3-way 3493 * deadlock with the flusher if the check directory code 3494 * blocks. 3495 * 3496 * If any changes whatsoever have been made to the cursor 3497 * set EDEADLK and retry. 3498 * 3499 * WARNING: See warnings in hammer_unlock_cursor() 3500 * function. 3501 */ 3502 if (error == 0 && ip && ip->ino_data.obj_type == 3503 HAMMER_OBJTYPE_DIRECTORY) { 3504 hammer_unlock_cursor(&cursor); 3505 error = hammer_ip_check_directory_empty(trans, ip); 3506 hammer_lock_cursor(&cursor); 3507 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3508 kprintf("HAMMER: Warning: avoided deadlock " 3509 "on rmdir '%s'\n", 3510 ncp->nc_name); 3511 error = EDEADLK; 3512 } 3513 } 3514 3515 /* 3516 * Delete the directory entry. 
3517 * 3518 * WARNING: hammer_ip_del_directory() may have to terminate 3519 * the cursor to avoid a deadlock. It is ok to call 3520 * hammer_done_cursor() twice. 3521 */ 3522 if (error == 0) { 3523 error = hammer_ip_del_directory(trans, &cursor, 3524 dip, ip); 3525 } 3526 hammer_done_cursor(&cursor); 3527 if (error == 0) { 3528 /* 3529 * Tell the namecache that we are now unlinked. 3530 */ 3531 cache_unlink(nch); 3532 3533 /* 3534 * NOTE: ip->vp, if non-NULL, cannot be directly 3535 * referenced without formally acquiring the 3536 * vp since the vp might have zero refs on it, 3537 * or in the middle of a reclaim, etc. 3538 * 3539 * NOTE: The cache_setunresolved() can rip the vp 3540 * out from under us since the vp may not have 3541 * any refs, in which case ip->vp will be NULL 3542 * from the outset. 3543 */ 3544 while (ip && ip->vp) { 3545 struct vnode *vp; 3546 3547 error = hammer_get_vnode(ip, &vp); 3548 if (error == 0 && vp) { 3549 vn_unlock(vp); 3550 hammer_knote(ip->vp, NOTE_DELETE); 3551 #if 0 3552 /* 3553 * Don't do this, it can deadlock 3554 * on concurrent rm's of hardlinks. 3555 * Shouldn't be needed any more. 
3556 */ 3557 cache_inval_vp(ip->vp, CINV_DESTROY); 3558 #endif 3559 vrele(vp); 3560 break; 3561 } 3562 kprintf("Debug: HAMMER ip/vp race1 avoided\n"); 3563 } 3564 } 3565 if (ip) 3566 hammer_rel_inode(ip, 0); 3567 } else { 3568 hammer_done_cursor(&cursor); 3569 } 3570 if (error == EDEADLK) 3571 goto retry; 3572 3573 return (error); 3574 } 3575 3576 /************************************************************************ 3577 * FIFO AND SPECFS OPS * 3578 ************************************************************************ 3579 * 3580 */ 3581 static int 3582 hammer_vop_fifoclose (struct vop_close_args *ap) 3583 { 3584 /* XXX update itimes */ 3585 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3586 } 3587 3588 static int 3589 hammer_vop_fiforead (struct vop_read_args *ap) 3590 { 3591 int error; 3592 3593 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3594 /* XXX update access time */ 3595 return (error); 3596 } 3597 3598 static int 3599 hammer_vop_fifowrite (struct vop_write_args *ap) 3600 { 3601 int error; 3602 3603 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3604 /* XXX update access time */ 3605 return (error); 3606 } 3607 3608 static 3609 int 3610 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3611 { 3612 int error; 3613 3614 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3615 if (error) 3616 error = hammer_vop_kqfilter(ap); 3617 return(error); 3618 } 3619 3620 /************************************************************************ 3621 * KQFILTER OPS * 3622 ************************************************************************ 3623 * 3624 */ 3625 static void filt_hammerdetach(struct knote *kn); 3626 static int filt_hammerread(struct knote *kn, long hint); 3627 static int filt_hammerwrite(struct knote *kn, long hint); 3628 static int filt_hammervnode(struct knote *kn, long hint); 3629 3630 static struct filterops hammerread_filtops = 3631 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3632 NULL, filt_hammerdetach, filt_hammerread }; 3633 static struct 
filterops hammerwrite_filtops = 3634 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3635 NULL, filt_hammerdetach, filt_hammerwrite }; 3636 static struct filterops hammervnode_filtops = 3637 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3638 NULL, filt_hammerdetach, filt_hammervnode }; 3639 3640 static 3641 int 3642 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3643 { 3644 struct vnode *vp = ap->a_vp; 3645 struct knote *kn = ap->a_kn; 3646 3647 switch (kn->kn_filter) { 3648 case EVFILT_READ: 3649 kn->kn_fop = &hammerread_filtops; 3650 break; 3651 case EVFILT_WRITE: 3652 kn->kn_fop = &hammerwrite_filtops; 3653 break; 3654 case EVFILT_VNODE: 3655 kn->kn_fop = &hammervnode_filtops; 3656 break; 3657 default: 3658 return (EOPNOTSUPP); 3659 } 3660 3661 kn->kn_hook = (caddr_t)vp; 3662 3663 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3664 3665 return(0); 3666 } 3667 3668 static void 3669 filt_hammerdetach(struct knote *kn) 3670 { 3671 struct vnode *vp = (void *)kn->kn_hook; 3672 3673 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3674 } 3675 3676 static int 3677 filt_hammerread(struct knote *kn, long hint) 3678 { 3679 struct vnode *vp = (void *)kn->kn_hook; 3680 hammer_inode_t ip = VTOI(vp); 3681 hammer_mount_t hmp = ip->hmp; 3682 off_t off; 3683 3684 if (hint == NOTE_REVOKE) { 3685 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3686 return(1); 3687 } 3688 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3689 off = ip->ino_data.size - kn->kn_fp->f_offset; 3690 kn->kn_data = (off < INTPTR_MAX) ? 
off : INTPTR_MAX; 3691 lwkt_reltoken(&hmp->fs_token); 3692 if (kn->kn_sfflags & NOTE_OLDAPI) 3693 return(1); 3694 return (kn->kn_data != 0); 3695 } 3696 3697 static int 3698 filt_hammerwrite(struct knote *kn, long hint) 3699 { 3700 if (hint == NOTE_REVOKE) 3701 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3702 kn->kn_data = 0; 3703 return (1); 3704 } 3705 3706 static int 3707 filt_hammervnode(struct knote *kn, long hint) 3708 { 3709 if (kn->kn_sfflags & hint) 3710 kn->kn_fflags |= hint; 3711 if (hint == NOTE_REVOKE) { 3712 kn->kn_flags |= (EV_EOF | EV_NODATA); 3713 return (1); 3714 } 3715 return (kn->kn_fflags != 0); 3716 } 3717 3718