1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 *
 * Forward declarations for the HAMMER VOP entry points implemented in
 * this file.  hammer_vop_inactive/hammer_vop_reclaim referenced by the
 * tables below are declared elsewhere (presumably hammer.h -- they are
 * not prototyped here).
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

/*
 * VOP dispatch table for regular files and directories.
 */
struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime =	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};

/*
 * VOP dispatch table for character/block special files.  Data I/O is
 * disallowed (vop_stdnoread/vop_stdnowrite); attribute and lifecycle
 * operations still go through HAMMER.
 */
struct vop_ops hammer_spec_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		vop_stdnoread,
	.vop_write =		vop_stdnowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_close,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

/*
 * VOP dispatch table for FIFOs.  Unhandled operations fall through to
 * the fifofs layer (fifo_vnoperate); read/write/close/kqfilter are
 * wrapped locally (see hammer_vop_fifo*).
 */
struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};

/*
 * Post kqueue event flags on the vnode's knote list, but only if some
 * flags are actually pending.
 */
static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			struct vnode *dvp, struct ucred *cred,
			int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
/*
 * NOTE(review): dead code (#if 0) -- the argument is unnamed but the
 * body references 'ap'; would not compile if re-enabled as-is.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *	 operation.
 *
 *	 Ultimately the combination of a REDO log and use of fast storage
 *	 to front-end cluster caches will make fsync fast, but it aint
 *	 here yet.  And, in anycase, we need real transactional
 *	 all-or-nothing features which are not restricted to a single file.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);
	hammer_mount_t hmp = ip->hmp;
	int waitfor = ap->a_waitfor;
	int mode;

	/* Serialize against the filesystem for the duration of the flush */
	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Fsync rule relaxation (default is either full synchronous flush
	 * or REDO semantics with synchronous flush).
	 *
	 * Modes 2+ require volume version 4 for REDO support and fall
	 * back to the equivalent non-REDO mode otherwise.
	 */
	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
		switch(hammer_fsync_mode) {
		case 0:
mode0:
			/* no REDO, full synchronous flush */
			goto skip;
		case 1:
mode1:
			/* no REDO, full asynchronous flush */
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			goto skip;
		case 2:
			/* REDO semantics, synchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode0;
			mode = HAMMER_FLUSH_UNDOS_AUTO;
			break;
		case 3:
			/* REDO semantics, relaxed asynchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode1;
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		case 4:
			/* ignore the fsync() system call */
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		default:
			/* we have to do something */
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		}

		/*
		 * Fast fsync only needs to flush the UNDO/REDO fifo if
		 * HAMMER_INODE_REDO is non-zero and the only modifications
		 * made to the file are write or write-extends.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) &&
		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
		) {
			++hammer_count_fsyncs;
			hammer_flusher_flush_undos(hmp, mode);
			ip->redo_count = 0;
			if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
				vclrisdirty(ip->vp);
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		}

		/*
		 * REDO is enabled by fsync(), the idea being we really only
		 * want to lay down REDO records when programs are using
		 * fsync() heavily.  The first fsync() on the file starts
		 * the gravy train going and later fsync()s keep it hot by
		 * resetting the redo_count.
		 *
		 * We weren't running REDOs before now so we have to fall
		 * through and do a full fsync of what we have.
		 */
		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
			ip->flags |= HAMMER_INODE_REDO;
			ip->redo_count = 0;
		}
	}
skip:

	/*
	 * Do a full flush sequence.
	 *
	 * Attempt to release the vnode while waiting for the inode to
	 * finish flushing.  This can really mess up inactive->reclaim
	 * sequences so only do it if the vnode is active.
	 *
	 * WARNING! The VX lock functions must be used.  vn_lock() will
	 *	    fail when this is part of a VOP_RECLAIM sequence.
	 */
	++hammer_count_fsyncs;
	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (waitfor == MNT_WAIT) {
		int dorelock;

		if ((ap->a_vp->v_flag & VRECLAIMED) == 0) {
			vx_unlock(ap->a_vp);
			dorelock = 1;
		} else {
			dorelock = 0;
		}
		hammer_wait_inode(ip);
		if (dorelock)
			vx_lock(ap->a_vp);
	}
	if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
		vclrisdirty(ip->vp);
	lwkt_reltoken(&hmp->fs_token);
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read from a regular file via the buffer cache, starting a HAMMER
 * transaction lazily (only once an uncached buffer is encountered or
 * the atime needs a full update).
 *
 * MPSAFE (for the cache safe does not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	off_t offset;		/* offset within the current block */
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;			/* bytes moved from the current block */
	int seqcount;
	int ioseqcount;
	int blksize;
	int bigread;
	int got_trans;		/* non-zero once trans has been started */
	size_t resid;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	got_trans = 0;
	uio = ap->a_uio;

	/*
	 * Attempt to shortcut directly to the VM object using lwbufs.
	 * This is much faster than instantiating buffer cache buffers.
	 */
	resid = uio->uio_resid;
	error = vop_helper_read_shortcut(ap);
	hammer_stats_file_read += resid - uio->uio_resid;
	if (error)
		return (error);
	if (uio->uio_resid == 0)
		goto finished;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
	ioseqcount = (ap->a_ioflag >> 16);
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigread = (uio->uio_resid > 100 * 1024 * 1024);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 *
	 * XXX Temporary hack, delay the start transaction while we remain
	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
	 *	   locked-shared.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;	/* block-aligned file offset */
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		/* honor pending signals on huge reads */
		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
			break;

		/*
		 * MPSAFE
		 */
		bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
		if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
			/* fully cached, no read-ahead needed */
			bp->b_flags &= ~B_AGE;
			error = 0;
			goto skip;
		}
		if (ap->a_ioflag & IO_NRDELAY) {
			/* caller asked for non-blocking read and data
			 * is not resident */
			bqrelse(bp);
			return (EWOULDBLOCK);
		}

		/*
		 * MPUNSAFE
		 */
		if (got_trans == 0) {
			hammer_start_transaction(&trans, ip->hmp);
			got_trans = 1;
		}

		/*
		 * NOTE: A valid bp has already been acquired, but was not
		 *	 B_CACHE.
		 */
		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_readx(ap->a_vp,
					     file_limit, base_offset,
					     blksize, uio->uio_resid,
					     seqcount * BKVASIZE, &bp);
		} else {
			error = breadnx(ap->a_vp, base_offset, blksize,
					NULL, NULL, 0, &bp);
		}
		if (error) {
			brelse(bp);
			break;
		}
skip:
		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
			kprintf("doff %016jx read file %016jx@%016jx\n",
				(intmax_t)bp->b_bio2.bio_offset,
				(intmax_t)ip->obj_id,
				(intmax_t)bp->b_loffset);
		}
		bp->b_flags &= ~B_IODEBUG;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		/* clip the copy to the block, the uio, and the file EOF */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);

		/*
		 * Set B_AGE, data has a lower priority than meta-data.
		 *
		 * Use a hold/unlock/drop sequence to run the uiomove
		 * with the buffer unlocked, avoiding deadlocks against
		 * read()s on mmap()'d spaces.
		 */
		bp->b_flags |= B_AGE;
		error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
		bqrelse(bp);

		if (error)
			break;
		hammer_stats_file_read += n;
	}

finished:

	/*
	 * Try to update the atime with just the inode lock for maximum
	 * concurrency.  If we can't shortcut it we have to get the full
	 * blown transaction.
489 */ 490 if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) { 491 hammer_start_transaction(&trans, ip->hmp); 492 got_trans = 1; 493 } 494 495 if (got_trans) { 496 if ((ip->flags & HAMMER_INODE_RO) == 0 && 497 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 498 lwkt_gettoken(&hmp->fs_token); 499 ip->ino_data.atime = trans.time; 500 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 501 hammer_done_transaction(&trans); 502 lwkt_reltoken(&hmp->fs_token); 503 } else { 504 hammer_done_transaction(&trans); 505 } 506 } 507 return (error); 508 } 509 510 /* 511 * hammer_vop_write { vp, uio, ioflag, cred } 512 */ 513 static 514 int 515 hammer_vop_write(struct vop_write_args *ap) 516 { 517 struct hammer_transaction trans; 518 struct hammer_inode *ip; 519 hammer_mount_t hmp; 520 thread_t td; 521 struct uio *uio; 522 int offset; 523 off_t base_offset; 524 int64_t cluster_eof; 525 struct buf *bp; 526 int kflags; 527 int error; 528 int n; 529 int flags; 530 int seqcount; 531 int bigwrite; 532 533 if (ap->a_vp->v_type != VREG) 534 return (EINVAL); 535 ip = VTOI(ap->a_vp); 536 hmp = ip->hmp; 537 error = 0; 538 kflags = 0; 539 seqcount = ap->a_ioflag >> 16; 540 541 if (ip->flags & HAMMER_INODE_RO) 542 return (EROFS); 543 544 /* 545 * Create a transaction to cover the operations we perform. 546 */ 547 hammer_start_transaction(&trans, hmp); 548 uio = ap->a_uio; 549 550 /* 551 * Check append mode 552 */ 553 if (ap->a_ioflag & IO_APPEND) 554 uio->uio_offset = ip->ino_data.size; 555 556 /* 557 * Check for illegal write offsets. Valid range is 0...2^63-1. 558 * 559 * NOTE: the base_off assignment is required to work around what 560 * I consider to be a GCC-4 optimization bug. 
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
		/* end offset wrapped past 2^63-1 */
		hammer_done_transaction(&trans);
		return (EFBIG);
	}

	/* enforce the process file-size resource limit */
	if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
	    base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		hammer_done_transaction(&trans);
		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 *
	 * Preset redo_count so we stop generating REDOs earlier if the
	 * limit is exceeded.
	 *
	 * redo_count is heuristical, SMP races are ok
	 */
	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
	if ((ip->flags & HAMMER_INODE_REDO) &&
	    ip->redo_count < hammer_limit_redo) {
		ip->redo_count += uio->uio_resid;
	}

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;	/* set once the VM size was extended */
		int blksize;
		int blkmask;
		int trivial;
		int endofblk;		/* write ends exactly on a block */
		off_t nsize;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;
		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Control the number of pending records associated with
		 * this inode.  If too many have accumulated start a
		 * flush.  Try to maintain a pipeline with the flusher.
		 *
		 * NOTE: It is possible for other sources to grow the
		 *	 records but not necessarily issue another flush,
		 *	 so use a timeout and ensure that a re-flush occurs.
		 */
		if (ip->rsv_recs >= hammer_limit_inode_recs) {
			lwkt_gettoken(&hmp->fs_token);
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
				ip->flags |= HAMMER_INODE_RECSW;
				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			}
			lwkt_reltoken(&hmp->fs_token);
		}

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid) {
			n = uio->uio_resid;
			endofblk = 0;
		} else {
			endofblk = 1;
		}
		nsize = uio->uio_offset + n;
		if (nsize > ip->ino_data.size) {
			/* extending the file; trivial == no gap to zero */
			if (uio->uio_offset > ip->ino_data.size)
				trivial = 0;
			else
				trivial = 1;
			nvextendbuf(ap->a_vp,
				    ip->ino_data.size,
				    nsize,
				    hammer_blocksize(ip->ino_data.size),
				    hammer_blocksize(nsize),
				    hammer_blockoff(ip->ino_data.size),
				    hammer_blockoff(nsize),
				    trivial);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0)
			error = uiomovebp(bp, bp->b_data + offset, n, uio);

		lwkt_gettoken(&hmp->fs_token);

		/*
		 * Generate REDO records if enabled and redo_count will not
		 * exceed the limit.
		 *
		 * If redo_count exceeds the limit we stop generating records
		 * and clear HAMMER_INODE_REDO.  This will cause the next
		 * fsync() to do a full meta-data sync instead of just an
		 * UNDO/REDO fifo update.
		 *
		 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
		 * will still be tracked.  The tracks will be terminated
		 * when the related meta-data (including possible data
		 * modifications which are not tracked via REDO) is
		 * flushed.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
			if (ip->redo_count < hammer_limit_redo) {
				bp->b_flags |= B_VFSFLAG1;
				error = hammer_generate_redo(&trans, ip,
						     base_offset + offset,
						     HAMMER_REDO_WRITE,
						     bp->b_data + offset,
						     (size_t)n);
			} else {
				ip->flags &= ~HAMMER_INODE_REDO;
			}
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				nvtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size),
					  hammer_blockoff(ip->ino_data.size),
					  0);
			}
			lwkt_reltoken(&hmp->fs_token);
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;
		if (ip->ino_data.size < uio->uio_offset) {
			/* file grew; mark size dirty */
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_SDIRTY;
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(&trans, ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		lwkt_reltoken(&hmp->fs_token);

		/*
		 * Final buffer disposition.
		 *
		 * Because meta-data updates are deferred, HAMMER is
		 * especially sensitive to excessive bdwrite()s because
		 * the I/O stream is not broken up by disk reads.  So the
		 * buffer cache simply cannot keep up.
		 *
		 * WARNING!  blksize is variable.  cluster_write() is
		 *	     expected to not blow up if it encounters
		 *	     buffers that do not match the passed blksize.
		 *
		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
		 *	  The ip->rsv_recs check should burst-flush the data.
		 *	  If we queue it immediately the buf could be left
		 *	  locked on the device queue for a very long time.
		 *
		 *	  However, failing to flush a dirty buffer out when
		 *	  issued from the pageout daemon can result in a low
		 *	  memory deadlock against bio_page_alloc(), so we
		 *	  have to bawrite() on IO_ASYNC as well.
		 *
		 * NOTE!  To avoid degenerate stalls due to mismatched block
		 *	  sizes we only honor IO_DIRECT on the write which
		 *	  abuts the end of the buffer.  However, we must
		 *	  honor IO_SYNC in case someone is silly enough to
		 *	  configure a HAMMER file as swap, or when HAMMER
		 *	  is serving NFS (for commits).  Ick ick.
		 */
		bp->b_flags |= B_AGE;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
			bawrite(bp);
		} else if (ap->a_ioflag & IO_ASYNC) {
			bawrite(bp);
		} else if (hammer_cluster_enable &&
			   !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
			if (base_offset < HAMMER_XDEMARC)
				cluster_eof = hammer_blockdemarc(base_offset,
							 ip->ino_data.size);
			else
				cluster_eof = ip->ino_data.size;
			cluster_write(bp, cluster_eof, blksize, seqcount);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);

	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * Check access permissions against the inode's uid/gid/mode/uflags via
 * the generic helper.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 *
 MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	/* delegate POSIX advisory locking to the generic lockf code */
	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
	/* sync-on-close support, currently compiled out */
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;
	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
		if (vn_islocked(vp) == LK_EXCLUSIVE &&
		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
			if (ip->flags & HAMMER_INODE_CLOSESYNC)
				waitfor = MNT_WAIT;
			else
				waitfor = MNT_NOWAIT;
			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
				       HAMMER_INODE_CLOSEASYNC);
			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
		}
	}
#endif
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* directory inode */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.  On success obtain the vnode for the new inode and
	 * resolve the namecache entry; on failure drop our reference.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	hammer_lock_sh(&ip->lock);
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
	 * or for MAX_TID is "@@-1:%05d" == 10 bytes.
	 *
	 * Note that userspace hammer command does not allow users to
	 * create a @@PFS softlink under an existing other PFS (id!=0)
	 * so the ip localization here for @@PFS softlink is always 0.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
			    vap->va_size = 26;
		    else
			    vap->va_size = 10;
	}

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	/*
	 * Report allocated bytes rounded to the effective block size:
	 * extra-large blocks past the demarc, regular blocks above half
	 * a buffer, else a 16-byte granularity for small files.
	 */
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}

	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0;	/* XXX */
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		/* device nodes report their stored device numbers */
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	hammer_unlock(&ip->lock);
	return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_mount_t hmp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;
	hmp = dip->hmp;

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan the name for an embedded "@@" as-of extension.  On a
	 * successful parse a non-MAX_TID as-of forces the result
	 * read-only; on a parse failure the entire name is treated as
	 * a plain component (i = nlen).
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;	/* nlen is now the component length before "@@" */

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* negative-cache the miss */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.
A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;
	hmp = dip->hmp;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	lwkt_gettoken(&hmp->fs_token);
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	/*
	 * It's probably a PFS root when dip->ino_data.parent_obj_id is 0.
	 */
	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != hmp->asof) {
			/*
			 * Root with a non-default asof: reload the same
			 * directory at the mount point's asof and hand the
			 * kernel a fake "0x%016llx" name component.
			 */
			parent_obj_id = dip->obj_id;
			asof = hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			lwkt_reltoken(&hmp->fs_token);
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	/* hard links may not cross mounts (or PFS localization domains) */
	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);
	hmp = dip->hmp;

	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.
This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.  Resolve the namecache entry to the linked vnode on
	 * success.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_LINK);
	hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.  On success acquire a vnode for the new directory
	 * and resolve the namecache entry; the inode reference is dropped
	 * in both paths.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.  On success acquire a vnode and resolve the
	 * namecache entry; the inode reference is dropped in both paths.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 *
 * MPSAFE (does not require fs_token)
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	/* disallow write opens on read-only (e.g. historical) inodes */
	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;
	hmp = ip->hmp;

	if (ap->a_ncookies) {
		/* rough sizing: roughly one cookie per 16 uio bytes, capped */
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);

	/*
	 * Handle artificial entries
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less than that to represent our 'special' key space.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      hammer_dir_localization(ip);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		/* nothing emitted: free the cookie array, report none */
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 *
	 * Note that userspace hammer command does not allow users to
	 * create a @@PFS softlink under an existing other PFS (id!=0)
	 * so the ip localization here for @@PFS softlink is always 0.
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, hmp);
			/* decode the 5-digit PFS id after "@@PFS" */
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* vap->va_size == 26 */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					/* vap->va_size == 10 */
					ksnprintf(buf, sizeof(buf),
						  "@@-1:%05d",
						  localization >> 16);
#if 0
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)HAMMER_MAX_TID,
						  localization >> 16);
#endif
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		lwkt_reltoken(&hmp->fs_token);
		return(error);
	}

	/*
	 * Long version (symlink target stored in a separate record)
	 */
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
					HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/* renames may not cross mounts (or PFS localization domains) */
	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	hmp = ip->hmp;

	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			ip->ino_data.ctime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      hammer_dir_localization(fdip);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the
	 * purposes of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 *
	 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
	 *	 without formally acquiring the vp since the vp might
	 *	 have zero refs on it, or in the middle of a reclaim,
	 *	 etc.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		while (ip->vp) {
			struct vnode *vp;

			error = hammer_get_vnode(ip, &vp);
			if (error == 0 && vp) {
				vn_unlock(vp);
				hammer_knote(ip->vp, NOTE_RENAME);
				vrele(vp);
				break;
			}
			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
		}
	}

failed:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_markatime { vp, cred }
 */
static
int
hammer_vop_markatime(struct vop_markatime_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;

	ip = VTOI(ap->a_vp);
	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	hmp = ip->hmp;
	if (hmp->mp->mnt_flag & MNT_NOATIME)
		return (0);
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/* record the new atime and flag the inode for flushing */
	ip->ino_data.atime = trans.time;
	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_ATTRIB);
	lwkt_reltoken(&hmp->fs_token);
	return (0);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct vattr *vap;
	hammer_mount_t hmp;
	int modflags;
	int error;
	int truncating;
	int blksize;
	int kflags;
#if 0
	int64_t aligned_size;
#endif
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;
	hmp = ip->hmp;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	/*
	 * chflags-style flag changes are handled exclusively of the other
	 * attributes (this path always goes to done).
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			kflags |= NOTE_ATTRIB;
		}
	}
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;

			/*
			 * Log the operation if in fast-fsync mode or if
			 * there are unterminated redo write records present.
			 *
			 * The second check is needed so the recovery code
			 * properly truncates write redos even if nominal
			 * REDO operations is turned off due to excessive
			 * writes, because the related records might be
			 * destroyed and never lay down a TERM_WRITE.
			 */
			if ((ip->flags & HAMMER_INODE_REDO) ||
			    (ip->flags & HAMMER_INODE_RDIRTY)) {
				error = hammer_generate_redo(&trans, ip,
							     vap->va_size,
							     HAMMER_REDO_TRUNC,
							     NULL, 0);
			}
			blksize = hammer_blocksize(vap->va_size);

			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			if (vap->va_size < ip->ino_data.size) {
				nvtruncbuf(ap->a_vp, vap->va_size,
					   blksize,
					   hammer_blockoff(vap->va_size),
					   0);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				nvextendbuf(ap->a_vp,
					    ip->ino_data.size,
					    vap->va_size,
					    hammer_blocksize(ip->ino_data.size),
					    hammer_blocksize(vap->va_size),
					    hammer_blockoff(ip->ino_data.size),
					    hammer_blockoff(vap->va_size),
					    0);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			/* XXX safe to use SDIRTY instead of DDIRTY here? */
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;

			/*
			 * On-media truncation is cached in the inode until
			 * the inode is synchronized.  We must immediately
			 * handle any frontend records.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
					hammer_inode_dirty(ip);
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate1 %016llx\n",
							(long long)ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate2 %016llx\n",
							(long long)ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate3 %016llx (ignored)\n",
							(long long)vap->va_size);
#endif
				}
			}

#if 0
			/*
			 * When truncating, nvtruncbuf() may have cleaned out
			 * a portion of the last block on-disk in the buffer
			 * cache.  We must clean out any frontend records
			 * for blocks beyond the new last block.
2372 */ 2373 aligned_size = (vap->va_size + (blksize - 1)) & 2374 ~(int64_t)(blksize - 1); 2375 if (truncating && vap->va_size < aligned_size) { 2376 aligned_size -= blksize; 2377 hammer_ip_frontend_trunc(ip, aligned_size); 2378 } 2379 #endif 2380 break; 2381 case VDATABASE: 2382 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2383 ip->flags |= HAMMER_INODE_TRUNCATED; 2384 ip->trunc_off = vap->va_size; 2385 hammer_inode_dirty(ip); 2386 } else if (ip->trunc_off > vap->va_size) { 2387 ip->trunc_off = vap->va_size; 2388 } 2389 hammer_ip_frontend_trunc(ip, vap->va_size); 2390 ip->ino_data.size = vap->va_size; 2391 ip->ino_data.mtime = trans.time; 2392 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2393 kflags |= NOTE_ATTRIB; 2394 break; 2395 default: 2396 error = EINVAL; 2397 goto done; 2398 } 2399 break; 2400 } 2401 if (vap->va_atime.tv_sec != VNOVAL) { 2402 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2403 modflags |= HAMMER_INODE_ATIME; 2404 kflags |= NOTE_ATTRIB; 2405 } 2406 if (vap->va_mtime.tv_sec != VNOVAL) { 2407 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2408 modflags |= HAMMER_INODE_MTIME; 2409 kflags |= NOTE_ATTRIB; 2410 } 2411 if (vap->va_mode != (mode_t)VNOVAL) { 2412 mode_t cur_mode = ip->ino_data.mode; 2413 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2414 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2415 2416 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2417 cur_uid, cur_gid, &cur_mode); 2418 if (error == 0 && ip->ino_data.mode != cur_mode) { 2419 ip->ino_data.mode = cur_mode; 2420 ip->ino_data.ctime = trans.time; 2421 modflags |= HAMMER_INODE_DDIRTY; 2422 kflags |= NOTE_ATTRIB; 2423 } 2424 } 2425 done: 2426 if (error == 0) 2427 hammer_modify_inode(&trans, ip, modflags); 2428 hammer_done_transaction(&trans); 2429 hammer_knote(ap->a_vp, kflags); 2430 lwkt_reltoken(&hmp->fs_token); 2431 return (error); 2432 } 2433 2434 /* 2435 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, 
target } 2436 */ 2437 static 2438 int 2439 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2440 { 2441 struct hammer_transaction trans; 2442 struct hammer_inode *dip; 2443 struct hammer_inode *nip; 2444 hammer_record_t record; 2445 struct nchandle *nch; 2446 hammer_mount_t hmp; 2447 int error; 2448 int bytes; 2449 2450 ap->a_vap->va_type = VLNK; 2451 2452 nch = ap->a_nch; 2453 dip = VTOI(ap->a_dvp); 2454 hmp = dip->hmp; 2455 2456 if (dip->flags & HAMMER_INODE_RO) 2457 return (EROFS); 2458 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2459 return (error); 2460 2461 /* 2462 * Create a transaction to cover the operations we perform. 2463 */ 2464 lwkt_gettoken(&hmp->fs_token); 2465 hammer_start_transaction(&trans, hmp); 2466 ++hammer_stats_file_iopsw; 2467 2468 /* 2469 * Create a new filesystem object of the requested type. The 2470 * returned inode will be referenced but not locked. 2471 */ 2472 2473 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2474 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2475 NULL, &nip); 2476 if (error) { 2477 hammer_done_transaction(&trans); 2478 *ap->a_vpp = NULL; 2479 lwkt_reltoken(&hmp->fs_token); 2480 return (error); 2481 } 2482 2483 /* 2484 * Add a record representing the symlink. symlink stores the link 2485 * as pure data, not a string, and is no \0 terminated. 
2486 */ 2487 if (error == 0) { 2488 bytes = strlen(ap->a_target); 2489 2490 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2491 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2492 } else { 2493 record = hammer_alloc_mem_record(nip, bytes); 2494 record->type = HAMMER_MEM_RECORD_GENERAL; 2495 2496 record->leaf.base.localization = nip->obj_localization + 2497 HAMMER_LOCALIZE_MISC; 2498 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2499 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2500 record->leaf.data_len = bytes; 2501 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2502 bcopy(ap->a_target, record->data->symlink.name, bytes); 2503 error = hammer_ip_add_record(&trans, record); 2504 } 2505 2506 /* 2507 * Set the file size to the length of the link. 2508 */ 2509 if (error == 0) { 2510 nip->ino_data.size = bytes; 2511 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2512 } 2513 } 2514 if (error == 0) 2515 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2516 nch->ncp->nc_nlen, nip); 2517 2518 /* 2519 * Finish up. 
2520 */ 2521 if (error) { 2522 hammer_rel_inode(nip, 0); 2523 *ap->a_vpp = NULL; 2524 } else { 2525 error = hammer_get_vnode(nip, ap->a_vpp); 2526 hammer_rel_inode(nip, 0); 2527 if (error == 0) { 2528 cache_setunresolved(ap->a_nch); 2529 cache_setvp(ap->a_nch, *ap->a_vpp); 2530 hammer_knote(ap->a_dvp, NOTE_WRITE); 2531 } 2532 } 2533 hammer_done_transaction(&trans); 2534 lwkt_reltoken(&hmp->fs_token); 2535 return (error); 2536 } 2537 2538 /* 2539 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2540 */ 2541 static 2542 int 2543 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2544 { 2545 struct hammer_transaction trans; 2546 struct hammer_inode *dip; 2547 hammer_mount_t hmp; 2548 int error; 2549 2550 dip = VTOI(ap->a_dvp); 2551 hmp = dip->hmp; 2552 2553 if (hammer_nohistory(dip) == 0 && 2554 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2555 return (error); 2556 } 2557 2558 lwkt_gettoken(&hmp->fs_token); 2559 hammer_start_transaction(&trans, hmp); 2560 ++hammer_stats_file_iopsw; 2561 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2562 ap->a_cred, ap->a_flags, -1); 2563 hammer_done_transaction(&trans); 2564 lwkt_reltoken(&hmp->fs_token); 2565 2566 return (error); 2567 } 2568 2569 /* 2570 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2571 */ 2572 static 2573 int 2574 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2575 { 2576 struct hammer_inode *ip = ap->a_vp->v_data; 2577 hammer_mount_t hmp = ip->hmp; 2578 int error; 2579 2580 ++hammer_stats_file_iopsr; 2581 lwkt_gettoken(&hmp->fs_token); 2582 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2583 ap->a_fflag, ap->a_cred); 2584 lwkt_reltoken(&hmp->fs_token); 2585 return (error); 2586 } 2587 2588 static 2589 int 2590 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2591 { 2592 static const struct mountctl_opt extraopt[] = { 2593 { HMNT_NOHISTORY, "nohistory" }, 2594 { HMNT_MASTERID, "master" }, 2595 { 0, NULL} 2596 2597 }; 2598 struct hammer_mount *hmp; 2599 struct mount *mp; 2600 
int usedbytes; 2601 int error; 2602 2603 error = 0; 2604 usedbytes = 0; 2605 mp = ap->a_head.a_ops->head.vv_mount; 2606 KKASSERT(mp->mnt_data != NULL); 2607 hmp = (struct hammer_mount *)mp->mnt_data; 2608 2609 lwkt_gettoken(&hmp->fs_token); 2610 2611 switch(ap->a_op) { 2612 case MOUNTCTL_SET_EXPORT: 2613 if (ap->a_ctllen != sizeof(struct export_args)) 2614 error = EINVAL; 2615 else 2616 error = hammer_vfs_export(mp, ap->a_op, 2617 (const struct export_args *)ap->a_ctl); 2618 break; 2619 case MOUNTCTL_MOUNTFLAGS: 2620 { 2621 /* 2622 * Call standard mountctl VOP function 2623 * so we get user mount flags. 2624 */ 2625 error = vop_stdmountctl(ap); 2626 if (error) 2627 break; 2628 2629 usedbytes = *ap->a_res; 2630 2631 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2632 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2633 ap->a_buf, 2634 ap->a_buflen - usedbytes, 2635 &error); 2636 } 2637 2638 *ap->a_res += usedbytes; 2639 break; 2640 } 2641 default: 2642 error = vop_stdmountctl(ap); 2643 break; 2644 } 2645 lwkt_reltoken(&hmp->fs_token); 2646 return(error); 2647 } 2648 2649 /* 2650 * hammer_vop_strategy { vp, bio } 2651 * 2652 * Strategy call, used for regular file read & write only. Note that the 2653 * bp may represent a cluster. 2654 * 2655 * To simplify operation and allow better optimizations in the future, 2656 * this code does not make any assumptions with regards to buffer alignment 2657 * or size. 
2658 */ 2659 static 2660 int 2661 hammer_vop_strategy(struct vop_strategy_args *ap) 2662 { 2663 struct buf *bp; 2664 int error; 2665 2666 bp = ap->a_bio->bio_buf; 2667 2668 switch(bp->b_cmd) { 2669 case BUF_CMD_READ: 2670 error = hammer_vop_strategy_read(ap); 2671 break; 2672 case BUF_CMD_WRITE: 2673 error = hammer_vop_strategy_write(ap); 2674 break; 2675 default: 2676 bp->b_error = error = EINVAL; 2677 bp->b_flags |= B_ERROR; 2678 biodone(ap->a_bio); 2679 break; 2680 } 2681 2682 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2683 2684 return (error); 2685 } 2686 2687 /* 2688 * Read from a regular file. Iterate the related records and fill in the 2689 * BIO/BUF. Gaps are zero-filled. 2690 * 2691 * The support code in hammer_object.c should be used to deal with mixed 2692 * in-memory and on-disk records. 2693 * 2694 * NOTE: Can be called from the cluster code with an oversized buf. 2695 * 2696 * XXX atime update 2697 */ 2698 static 2699 int 2700 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2701 { 2702 struct hammer_transaction trans; 2703 struct hammer_inode *ip; 2704 struct hammer_inode *dip; 2705 hammer_mount_t hmp; 2706 struct hammer_cursor cursor; 2707 hammer_base_elm_t base; 2708 hammer_off_t disk_offset; 2709 struct bio *bio; 2710 struct bio *nbio; 2711 struct buf *bp; 2712 int64_t rec_offset; 2713 int64_t ran_end; 2714 int64_t tmp64; 2715 int error; 2716 int boff; 2717 int roff; 2718 int n; 2719 int isdedupable; 2720 2721 bio = ap->a_bio; 2722 bp = bio->bio_buf; 2723 ip = ap->a_vp->v_data; 2724 hmp = ip->hmp; 2725 2726 /* 2727 * The zone-2 disk offset may have been set by the cluster code via 2728 * a BMAP operation, or else should be NOOFFSET. 2729 * 2730 * Checking the high bits for a match against zone-2 should suffice. 2731 * 2732 * In cases where a lot of data duplication is present it may be 2733 * more beneficial to drop through and doubule-buffer through the 2734 * device. 
2735 */ 2736 nbio = push_bio(bio); 2737 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2738 HAMMER_ZONE_LARGE_DATA) { 2739 if (hammer_double_buffer == 0) { 2740 lwkt_gettoken(&hmp->fs_token); 2741 error = hammer_io_direct_read(hmp, nbio, NULL); 2742 lwkt_reltoken(&hmp->fs_token); 2743 return (error); 2744 } 2745 2746 /* 2747 * Try to shortcut requests for double_buffer mode too. 2748 * Since this mode runs through the device buffer cache 2749 * only compatible buffer sizes (meaning those generated 2750 * by normal filesystem buffers) are legal. 2751 */ 2752 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) { 2753 lwkt_gettoken(&hmp->fs_token); 2754 error = hammer_io_indirect_read(hmp, nbio, NULL); 2755 lwkt_reltoken(&hmp->fs_token); 2756 return (error); 2757 } 2758 } 2759 2760 /* 2761 * Well, that sucked. Do it the hard way. If all the stars are 2762 * aligned we may still be able to issue a direct-read. 2763 */ 2764 lwkt_gettoken(&hmp->fs_token); 2765 hammer_simple_transaction(&trans, hmp); 2766 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2767 2768 /* 2769 * Key range (begin and end inclusive) to scan. Note that the key's 2770 * stored in the actual records represent BASE+LEN, not BASE. The 2771 * first record containing bio_offset will have a key > bio_offset. 
2772 */ 2773 cursor.key_beg.localization = ip->obj_localization + 2774 HAMMER_LOCALIZE_MISC; 2775 cursor.key_beg.obj_id = ip->obj_id; 2776 cursor.key_beg.create_tid = 0; 2777 cursor.key_beg.delete_tid = 0; 2778 cursor.key_beg.obj_type = 0; 2779 cursor.key_beg.key = bio->bio_offset + 1; 2780 cursor.asof = ip->obj_asof; 2781 cursor.flags |= HAMMER_CURSOR_ASOF; 2782 2783 cursor.key_end = cursor.key_beg; 2784 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2785 #if 0 2786 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2787 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2788 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2789 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2790 } else 2791 #endif 2792 { 2793 ran_end = bio->bio_offset + bp->b_bufsize; 2794 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2795 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2796 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2797 if (tmp64 < ran_end) 2798 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2799 else 2800 cursor.key_end.key = ran_end + MAXPHYS + 1; 2801 } 2802 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2803 2804 /* 2805 * Set NOSWAPCACHE for cursor data extraction if double buffering 2806 * is disabled or (if the file is not marked cacheable via chflags 2807 * and vm.swapcache_use_chflags is enabled). 2808 */ 2809 if (hammer_double_buffer == 0 || 2810 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 && 2811 vm_swapcache_use_chflags)) { 2812 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE; 2813 } 2814 2815 error = hammer_ip_first(&cursor); 2816 boff = 0; 2817 2818 while (error == 0) { 2819 /* 2820 * Get the base file offset of the record. The key for 2821 * data records is (base + bytes) rather then (base). 2822 */ 2823 base = &cursor.leaf->base; 2824 rec_offset = base->key - cursor.leaf->data_len; 2825 2826 /* 2827 * Calculate the gap, if any, and zero-fill it. 2828 * 2829 * n is the offset of the start of the record verses our 2830 * current seek offset in the bio. 
2831 */ 2832 n = (int)(rec_offset - (bio->bio_offset + boff)); 2833 if (n > 0) { 2834 if (n > bp->b_bufsize - boff) 2835 n = bp->b_bufsize - boff; 2836 bzero((char *)bp->b_data + boff, n); 2837 boff += n; 2838 n = 0; 2839 } 2840 2841 /* 2842 * Calculate the data offset in the record and the number 2843 * of bytes we can copy. 2844 * 2845 * There are two degenerate cases. First, boff may already 2846 * be at bp->b_bufsize. Secondly, the data offset within 2847 * the record may exceed the record's size. 2848 */ 2849 roff = -n; 2850 rec_offset += roff; 2851 n = cursor.leaf->data_len - roff; 2852 if (n <= 0) { 2853 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff); 2854 n = 0; 2855 } else if (n > bp->b_bufsize - boff) { 2856 n = bp->b_bufsize - boff; 2857 } 2858 2859 /* 2860 * Deal with cached truncations. This cool bit of code 2861 * allows truncate()/ftruncate() to avoid having to sync 2862 * the file. 2863 * 2864 * If the frontend is truncated then all backend records are 2865 * subject to the frontend's truncation. 2866 * 2867 * If the backend is truncated then backend records on-disk 2868 * (but not in-memory) are subject to the backend's 2869 * truncation. In-memory records owned by the backend 2870 * represent data written after the truncation point on the 2871 * backend and must not be truncated. 2872 * 2873 * Truncate operations deal with frontend buffer cache 2874 * buffers and frontend-owned in-memory records synchronously. 
2875 */ 2876 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2877 if (hammer_cursor_ondisk(&cursor)/* || 2878 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2879 if (ip->trunc_off <= rec_offset) 2880 n = 0; 2881 else if (ip->trunc_off < rec_offset + n) 2882 n = (int)(ip->trunc_off - rec_offset); 2883 } 2884 } 2885 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2886 if (hammer_cursor_ondisk(&cursor)) { 2887 if (ip->sync_trunc_off <= rec_offset) 2888 n = 0; 2889 else if (ip->sync_trunc_off < rec_offset + n) 2890 n = (int)(ip->sync_trunc_off - rec_offset); 2891 } 2892 } 2893 2894 /* 2895 * Try to issue a direct read into our bio if possible, 2896 * otherwise resolve the element data into a hammer_buffer 2897 * and copy. 2898 * 2899 * The buffer on-disk should be zerod past any real 2900 * truncation point, but may not be for any synthesized 2901 * truncation point from above. 2902 * 2903 * NOTE: disk_offset is only valid if the cursor data is 2904 * on-disk. 2905 */ 2906 disk_offset = cursor.leaf->data_offset + roff; 2907 isdedupable = (boff == 0 && n == bp->b_bufsize && 2908 hammer_cursor_ondisk(&cursor) && 2909 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2910 2911 if (isdedupable && hammer_double_buffer == 0) { 2912 /* 2913 * Direct read case 2914 */ 2915 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2916 HAMMER_ZONE_LARGE_DATA); 2917 nbio->bio_offset = disk_offset; 2918 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2919 if (hammer_live_dedup && error == 0) 2920 hammer_dedup_cache_add(ip, cursor.leaf); 2921 goto done; 2922 } else if (isdedupable) { 2923 /* 2924 * Async I/O case for reading from backing store 2925 * and copying the data to the filesystem buffer. 2926 * live-dedup has to verify the data anyway if it 2927 * gets a hit later so we can just add the entry 2928 * now. 
2929 */ 2930 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2931 HAMMER_ZONE_LARGE_DATA); 2932 nbio->bio_offset = disk_offset; 2933 if (hammer_live_dedup) 2934 hammer_dedup_cache_add(ip, cursor.leaf); 2935 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf); 2936 goto done; 2937 } else if (n) { 2938 error = hammer_ip_resolve_data(&cursor); 2939 if (error == 0) { 2940 if (hammer_live_dedup && isdedupable) 2941 hammer_dedup_cache_add(ip, cursor.leaf); 2942 bcopy((char *)cursor.data + roff, 2943 (char *)bp->b_data + boff, n); 2944 } 2945 } 2946 if (error) 2947 break; 2948 2949 /* 2950 * We have to be sure that the only elements added to the 2951 * dedup cache are those which are already on-media. 2952 */ 2953 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) 2954 hammer_dedup_cache_add(ip, cursor.leaf); 2955 2956 /* 2957 * Iterate until we have filled the request. 2958 */ 2959 boff += n; 2960 if (boff == bp->b_bufsize) 2961 break; 2962 error = hammer_ip_next(&cursor); 2963 } 2964 2965 /* 2966 * There may have been a gap after the last record 2967 */ 2968 if (error == ENOENT) 2969 error = 0; 2970 if (error == 0 && boff != bp->b_bufsize) { 2971 KKASSERT(boff < bp->b_bufsize); 2972 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2973 /* boff = bp->b_bufsize; */ 2974 } 2975 2976 /* 2977 * Disallow swapcache operation on the vnode buffer if double 2978 * buffering is enabled, the swapcache will get the data via 2979 * the block device buffer. 2980 */ 2981 if (hammer_double_buffer) 2982 bp->b_flags |= B_NOTMETA; 2983 2984 /* 2985 * Cleanup 2986 */ 2987 bp->b_resid = 0; 2988 bp->b_error = error; 2989 if (error) 2990 bp->b_flags |= B_ERROR; 2991 biodone(ap->a_bio); 2992 2993 done: 2994 /* 2995 * Cache the b-tree node for the last data read in cache[1]. 
2996 * 2997 * If we hit the file EOF then also cache the node in the 2998 * governing director's cache[3], it will be used to initialize 2999 * the inode's cache[1] for any inodes looked up via the directory. 3000 * 3001 * This doesn't reduce disk accesses since the B-Tree chain is 3002 * likely cached, but it does reduce cpu overhead when looking 3003 * up file offsets for cpdup/tar/cpio style iterations. 3004 */ 3005 if (cursor.node) 3006 hammer_cache_node(&ip->cache[1], cursor.node); 3007 if (ran_end >= ip->ino_data.size) { 3008 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 3009 ip->obj_asof, ip->obj_localization); 3010 if (dip) { 3011 hammer_cache_node(&dip->cache[3], cursor.node); 3012 hammer_rel_inode(dip, 0); 3013 } 3014 } 3015 hammer_done_cursor(&cursor); 3016 hammer_done_transaction(&trans); 3017 lwkt_reltoken(&hmp->fs_token); 3018 return(error); 3019 } 3020 3021 /* 3022 * BMAP operation - used to support cluster_read() only. 3023 * 3024 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 3025 * 3026 * This routine may return EOPNOTSUPP if the opration is not supported for 3027 * the specified offset. The contents of the pointer arguments do not 3028 * need to be initialized in that case. 3029 * 3030 * If a disk address is available and properly aligned return 0 with 3031 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 3032 * to the run-length relative to that offset. Callers may assume that 3033 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 3034 * large, so return EOPNOTSUPP if it is not sufficiently large. 
3035 */ 3036 static 3037 int 3038 hammer_vop_bmap(struct vop_bmap_args *ap) 3039 { 3040 struct hammer_transaction trans; 3041 struct hammer_inode *ip; 3042 hammer_mount_t hmp; 3043 struct hammer_cursor cursor; 3044 hammer_base_elm_t base; 3045 int64_t rec_offset; 3046 int64_t ran_end; 3047 int64_t tmp64; 3048 int64_t base_offset; 3049 int64_t base_disk_offset; 3050 int64_t last_offset; 3051 hammer_off_t last_disk_offset; 3052 hammer_off_t disk_offset; 3053 int rec_len; 3054 int error; 3055 int blksize; 3056 3057 ++hammer_stats_file_iopsr; 3058 ip = ap->a_vp->v_data; 3059 hmp = ip->hmp; 3060 3061 /* 3062 * We can only BMAP regular files. We can't BMAP database files, 3063 * directories, etc. 3064 */ 3065 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 3066 return(EOPNOTSUPP); 3067 3068 /* 3069 * bmap is typically called with runp/runb both NULL when used 3070 * for writing. We do not support BMAP for writing atm. 3071 */ 3072 if (ap->a_cmd != BUF_CMD_READ) 3073 return(EOPNOTSUPP); 3074 3075 /* 3076 * Scan the B-Tree to acquire blockmap addresses, then translate 3077 * to raw addresses. 3078 */ 3079 lwkt_gettoken(&hmp->fs_token); 3080 hammer_simple_transaction(&trans, hmp); 3081 #if 0 3082 kprintf("bmap_beg %016llx ip->cache %p\n", 3083 (long long)ap->a_loffset, ip->cache[1]); 3084 #endif 3085 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 3086 3087 /* 3088 * Key range (begin and end inclusive) to scan. Note that the key's 3089 * stored in the actual records represent BASE+LEN, not BASE. The 3090 * first record containing bio_offset will have a key > bio_offset. 
3091 */ 3092 cursor.key_beg.localization = ip->obj_localization + 3093 HAMMER_LOCALIZE_MISC; 3094 cursor.key_beg.obj_id = ip->obj_id; 3095 cursor.key_beg.create_tid = 0; 3096 cursor.key_beg.delete_tid = 0; 3097 cursor.key_beg.obj_type = 0; 3098 if (ap->a_runb) 3099 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 3100 else 3101 cursor.key_beg.key = ap->a_loffset + 1; 3102 if (cursor.key_beg.key < 0) 3103 cursor.key_beg.key = 0; 3104 cursor.asof = ip->obj_asof; 3105 cursor.flags |= HAMMER_CURSOR_ASOF; 3106 3107 cursor.key_end = cursor.key_beg; 3108 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 3109 3110 ran_end = ap->a_loffset + MAXPHYS; 3111 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 3112 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 3113 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 3114 if (tmp64 < ran_end) 3115 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 3116 else 3117 cursor.key_end.key = ran_end + MAXPHYS + 1; 3118 3119 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 3120 3121 error = hammer_ip_first(&cursor); 3122 base_offset = last_offset = 0; 3123 base_disk_offset = last_disk_offset = 0; 3124 3125 while (error == 0) { 3126 /* 3127 * Get the base file offset of the record. The key for 3128 * data records is (base + bytes) rather then (base). 3129 * 3130 * NOTE: rec_offset + rec_len may exceed the end-of-file. 3131 * The extra bytes should be zero on-disk and the BMAP op 3132 * should still be ok. 3133 */ 3134 base = &cursor.leaf->base; 3135 rec_offset = base->key - cursor.leaf->data_len; 3136 rec_len = cursor.leaf->data_len; 3137 3138 /* 3139 * Incorporate any cached truncation. 3140 * 3141 * NOTE: Modifications to rec_len based on synthesized 3142 * truncation points remove the guarantee that any extended 3143 * data on disk is zero (since the truncations may not have 3144 * taken place on-media yet). 
3145 */ 3146 if (ip->flags & HAMMER_INODE_TRUNCATED) { 3147 if (hammer_cursor_ondisk(&cursor) || 3148 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 3149 if (ip->trunc_off <= rec_offset) 3150 rec_len = 0; 3151 else if (ip->trunc_off < rec_offset + rec_len) 3152 rec_len = (int)(ip->trunc_off - rec_offset); 3153 } 3154 } 3155 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 3156 if (hammer_cursor_ondisk(&cursor)) { 3157 if (ip->sync_trunc_off <= rec_offset) 3158 rec_len = 0; 3159 else if (ip->sync_trunc_off < rec_offset + rec_len) 3160 rec_len = (int)(ip->sync_trunc_off - rec_offset); 3161 } 3162 } 3163 3164 /* 3165 * Accumulate information. If we have hit a discontiguous 3166 * block reset base_offset unless we are already beyond the 3167 * requested offset. If we are, that's it, we stop. 3168 */ 3169 if (error) 3170 break; 3171 if (hammer_cursor_ondisk(&cursor)) { 3172 disk_offset = cursor.leaf->data_offset; 3173 if (rec_offset != last_offset || 3174 disk_offset != last_disk_offset) { 3175 if (rec_offset > ap->a_loffset) 3176 break; 3177 base_offset = rec_offset; 3178 base_disk_offset = disk_offset; 3179 } 3180 last_offset = rec_offset + rec_len; 3181 last_disk_offset = disk_offset + rec_len; 3182 3183 if (hammer_live_dedup) 3184 hammer_dedup_cache_add(ip, cursor.leaf); 3185 } 3186 3187 error = hammer_ip_next(&cursor); 3188 } 3189 3190 #if 0 3191 kprintf("BMAP %016llx: %016llx - %016llx\n", 3192 (long long)ap->a_loffset, 3193 (long long)base_offset, 3194 (long long)last_offset); 3195 kprintf("BMAP %16s: %016llx - %016llx\n", "", 3196 (long long)base_disk_offset, 3197 (long long)last_disk_offset); 3198 #endif 3199 3200 if (cursor.node) { 3201 hammer_cache_node(&ip->cache[1], cursor.node); 3202 #if 0 3203 kprintf("bmap_end2 %016llx ip->cache %p\n", 3204 (long long)ap->a_loffset, ip->cache[1]); 3205 #endif 3206 } 3207 hammer_done_cursor(&cursor); 3208 hammer_done_transaction(&trans); 3209 lwkt_reltoken(&hmp->fs_token); 3210 3211 /* 3212 * If we couldn't find any 
records or the records we did find were 3213 * all behind the requested offset, return failure. A forward 3214 * truncation can leave a hole w/ no on-disk records. 3215 */ 3216 if (last_offset == 0 || last_offset < ap->a_loffset) 3217 return (EOPNOTSUPP); 3218 3219 /* 3220 * Figure out the block size at the requested offset and adjust 3221 * our limits so the cluster_read() does not create inappropriately 3222 * sized buffer cache buffers. 3223 */ 3224 blksize = hammer_blocksize(ap->a_loffset); 3225 if (hammer_blocksize(base_offset) != blksize) { 3226 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 3227 } 3228 if (last_offset != ap->a_loffset && 3229 hammer_blocksize(last_offset - 1) != blksize) { 3230 last_offset = hammer_blockdemarc(ap->a_loffset, 3231 last_offset - 1); 3232 } 3233 3234 /* 3235 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 3236 * from occuring. 3237 */ 3238 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 3239 3240 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 3241 /* 3242 * Only large-data zones can be direct-IOd 3243 */ 3244 error = EOPNOTSUPP; 3245 } else if ((disk_offset & HAMMER_BUFMASK) || 3246 (last_offset - ap->a_loffset) < blksize) { 3247 /* 3248 * doffsetp is not aligned or the forward run size does 3249 * not cover a whole buffer, disallow the direct I/O. 3250 */ 3251 error = EOPNOTSUPP; 3252 } else { 3253 /* 3254 * We're good. 3255 */ 3256 *ap->a_doffsetp = disk_offset; 3257 if (ap->a_runb) { 3258 *ap->a_runb = ap->a_loffset - base_offset; 3259 KKASSERT(*ap->a_runb >= 0); 3260 } 3261 if (ap->a_runp) { 3262 *ap->a_runp = last_offset - ap->a_loffset; 3263 KKASSERT(*ap->a_runp >= 0); 3264 } 3265 error = 0; 3266 } 3267 return(error); 3268 } 3269 3270 /* 3271 * Write to a regular file. Because this is a strategy call the OS is 3272 * trying to actually get data onto the media. 
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize __debugvar;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * Strategy writes are always handed a full-sized buffer for the
	 * offset in question (asserted below).
	 */
	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/*
	 * Fail writes to read-only inodes immediately, completing the
	 * bio with EROFS.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		lwkt_reltoken(&hmp->fs_token);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end.
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffers
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);

	/*
	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	 * in hammer_vop_write().  We must flag the record so the proper
	 * REDO_TERM_WRITE entry is generated during the flush.
	 */
	if (record) {
		if (bp->b_flags & B_VFSFLAG1) {
			record->flags |= HAMMER_RECF_REDO;
			bp->b_flags &= ~B_VFSFLAG1;
		}
		if (record->flags & HAMMER_RECF_DEDUPED) {
			/*
			 * Dedup hit: the storage already exists, replace
			 * the bulk record and complete the bio without
			 * issuing device I/O.
			 */
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(ap->a_bio);
		} else {
			hammer_io_direct_write(hmp, bio, record);
		}
		/*
		 * Kick off a flush if this inode has accumulated records
		 * and the mount-wide reservation exceeds the limit.
		 */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/*
		 * Reservation failed; error out the bio.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * dounlink - disconnect a directory entry
 *
 * Common backend for remove/rmdir/rename: locates the directory entry
 * matching the namecache entry under (dvp), optionally validates the
 * target's directory-ness via isdir, and deletes the entry.  May loop
 * via 'retry' when a cursor deadlock (EDEADLK) is detected.
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred,
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan. This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  info->last_error contains the
	 * error code on search termination and could be 0, ENOENT, or
	 * something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		if (error == ENOENT) {
			/*
			 * Dirent exists but the inode it points at does
			 * not; warn and remove the stale dirent anyway.
			 */
			kprintf("HAMMER: WARNING: Removing "
				"dirent w/missing inode \"%s\"\n"
				"\tobj_id = %016llx\n",
				ncp->nc_name,
				(long long)cursor.data->entry.obj_id);
			error = 0;
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0 && ip) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * The check directory code can loop and deadlock/retry.  Our
		 * own cursor's node locks must be released to avoid a 3-way
		 * deadlock with the flusher if the check directory code
		 * blocks.
		 *
		 * If any changes whatsoever have been made to the cursor
		 * set EDEADLK and retry.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor()
		 *	    function.
		 */
		if (error == 0 && ip && ip->ino_data.obj_type ==
				        HAMMER_OBJTYPE_DIRECTORY) {
			hammer_unlock_cursor(&cursor);
			error = hammer_ip_check_directory_empty(trans, ip);
			hammer_lock_cursor(&cursor);
			if (cursor.flags & HAMMER_CURSOR_RETEST) {
				kprintf("HAMMER: Warning: avoided deadlock "
					"on rmdir '%s'\n",
					ncp->nc_name);
				error = EDEADLK;
			}
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/*
			 * Tell the namecache that we are now unlinked.
			 */
			cache_unlink(nch);

			/*
			 * NOTE: ip->vp, if non-NULL, cannot be directly
			 *	 referenced without formally acquiring the
			 *	 vp since the vp might have zero refs on it,
			 *	 or in the middle of a reclaim, etc.
			 *
			 * NOTE: The cache_setunresolved() can rip the vp
			 *	 out from under us since the vp may not have
			 *	 any refs, in which case ip->vp will be NULL
			 *	 from the outset.
			 */
			while (ip && ip->vp) {
				struct vnode *vp;

				error = hammer_get_vnode(ip, &vp);
				if (error == 0 && vp) {
					vn_unlock(vp);
					hammer_knote(ip->vp, NOTE_DELETE);
#if 0
					/*
					 * Don't do this, it can deadlock
					 * on concurrent rm's of hardlinks.
					 * Shouldn't be needed any more.
					 */
					cache_inval_vp(ip->vp, CINV_DESTROY);
#endif
					vrele(vp);
					break;
				}
				/* vp raced away; loop and re-check ip->vp */
				kprintf("Debug: HAMMER ip/vp race1 avoided\n");
			}
		}
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	if (error == EDEADLK)
		goto retry;

	return (error);
}

/************************************************************************
 *			    FIFO AND SPECFS OPS				*
 ************************************************************************
 *
 * These simply forward to the generic fifofs vnode operations.
 */
static int
hammer_vop_fifoclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
}

static int
hammer_vop_fiforead (struct vop_read_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

static int
hammer_vop_fifowrite (struct vop_write_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

/*
 * Forward kqfilter attachment to fifofs; if fifofs cannot handle the
 * filter fall back to the regular HAMMER vnode kqfilter.
 */
static
int
hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	if (error)
		error = hammer_vop_kqfilter(ap);
	return(error);
}

/************************************************************************
 *				KQFILTER OPS				*
 ************************************************************************
 *
 */
static void filt_hammerdetach(struct knote *kn);
static int filt_hammerread(struct knote *kn, long hint);
static int filt_hammerwrite(struct knote *kn, long hint);
static int filt_hammervnode(struct knote *kn, long hint);

static struct filterops hammerread_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_hammerdetach, filt_hammerread };
static struct filterops hammerwrite_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_hammerdetach, filt_hammerwrite };
static struct filterops hammervnode_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_hammerdetach, filt_hammervnode };

/*
 * Attach a knote to the vnode.  Selects the filterops based on the
 * requested filter type and registers the knote on the vnode's
 * poll-info knote list.
 */
static
int
hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct knote *kn = ap->a_kn;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &hammerread_filtops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &hammerwrite_filtops;
		break;
	case EVFILT_VNODE:
		kn->kn_fop = &hammervnode_filtops;
		break;
	default:
		return (EOPNOTSUPP);
	}

	kn->kn_hook = (caddr_t)vp;

	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);

	return(0);
}

/*
 * Detach a knote previously attached by hammer_vop_kqfilter().
 */
static void
filt_hammerdetach(struct knote *kn)
{
	struct vnode *vp = (void *)kn->kn_hook;

	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
}

/*
 * EVFILT_READ event filter: reports the number of bytes readable past
 * the descriptor's current offset (clamped to INTPTR_MAX).
 */
static int
filt_hammerread(struct knote *kn, long hint)
{
	struct vnode *vp = (void *)kn->kn_hook;
	hammer_inode_t ip = VTOI(vp);
	hammer_mount_t hmp = ip->hmp;
	off_t off;

	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
		return(1);
	}
	lwkt_gettoken(&hmp->fs_token);	/* XXX use per-ip-token */
	off = ip->ino_data.size - kn->kn_fp->f_offset;
	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
	lwkt_reltoken(&hmp->fs_token);
	if (kn->kn_sfflags & NOTE_OLDAPI)
		return(1);
	return (kn->kn_data != 0);
}

/*
 * EVFILT_WRITE event filter: regular files are always writable.
 */
static int
filt_hammerwrite(struct knote *kn, long hint)
{
	if (hint == NOTE_REVOKE)
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
	kn->kn_data = 0;
	return (1);
}

/*
 * EVFILT_VNODE event filter: accumulates vnode events the caller
 * subscribed to and fires when any have occurred (or on revoke).
 */
static int
filt_hammervnode(struct knote *kn, long hint)
{
	if (kn->kn_sfflags & hint)
		kn->kn_fflags |= hint;
	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA);
		return (1);
	}
	return (kn->kn_fflags != 0);
}