1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/kernel.h> 38 #include <sys/fcntl.h> 39 #include <sys/namecache.h> 40 #include <sys/vnode.h> 41 #include <sys/lockf.h> 42 #include <sys/event.h> 43 #include <sys/stat.h> 44 #include <sys/dirent.h> 45 #include <sys/file.h> 46 #include <vm/vm_extern.h> 47 #include <vm/swap_pager.h> 48 #include <vfs/fifofs/fifo.h> 49 50 #include "hammer.h" 51 52 /* 53 * USERFS VNOPS 54 */ 55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 56 static int hammer_vop_fsync(struct vop_fsync_args *); 57 static int hammer_vop_read(struct vop_read_args *); 58 static int hammer_vop_write(struct vop_write_args *); 59 static int hammer_vop_access(struct vop_access_args *); 60 static int hammer_vop_advlock(struct vop_advlock_args *); 61 static int hammer_vop_close(struct vop_close_args *); 62 static int hammer_vop_ncreate(struct vop_ncreate_args *); 63 static int hammer_vop_getattr(struct vop_getattr_args *); 64 static int hammer_vop_nresolve(struct vop_nresolve_args *); 65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 66 static int hammer_vop_nlink(struct vop_nlink_args *); 67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 68 static int hammer_vop_nmknod(struct vop_nmknod_args *); 69 static int hammer_vop_open(struct vop_open_args *); 70 static int hammer_vop_print(struct vop_print_args *); 71 static int hammer_vop_readdir(struct vop_readdir_args *); 72 static int hammer_vop_readlink(struct vop_readlink_args *); 73 static int hammer_vop_nremove(struct vop_nremove_args *); 74 static int hammer_vop_nrename(struct vop_nrename_args *); 75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 76 static int hammer_vop_markatime(struct vop_markatime_args *); 77 static int hammer_vop_setattr(struct vop_setattr_args *); 78 static int hammer_vop_strategy(struct vop_strategy_args *); 79 static int hammer_vop_bmap(struct vop_bmap_args *ap); 80 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 82 static int hammer_vop_ioctl(struct vop_ioctl_args *); 83 static int hammer_vop_mountctl(struct vop_mountctl_args *); 84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 85 86 static int hammer_vop_fifoclose (struct vop_close_args *); 87 static int hammer_vop_fiforead (struct vop_read_args *); 88 static int hammer_vop_fifowrite (struct vop_write_args *); 89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 90 91 struct vop_ops hammer_vnode_vops = { 92 .vop_default = vop_defaultop, 93 .vop_fsync = hammer_vop_fsync, 94 .vop_getpages = vop_stdgetpages, 95 .vop_putpages = vop_stdputpages, 96 .vop_read = hammer_vop_read, 97 .vop_write = hammer_vop_write, 98 .vop_access = hammer_vop_access, 99 .vop_advlock = hammer_vop_advlock, 100 .vop_close = hammer_vop_close, 101 .vop_ncreate = hammer_vop_ncreate, 102 .vop_getattr = hammer_vop_getattr, 103 .vop_inactive = hammer_vop_inactive, 104 .vop_reclaim = hammer_vop_reclaim, 105 .vop_nresolve = hammer_vop_nresolve, 106 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 107 .vop_nlink = hammer_vop_nlink, 108 .vop_nmkdir = hammer_vop_nmkdir, 109 .vop_nmknod = hammer_vop_nmknod, 110 .vop_open = hammer_vop_open, 111 .vop_pathconf = vop_stdpathconf, 112 .vop_print = hammer_vop_print, 113 .vop_readdir = hammer_vop_readdir, 114 .vop_readlink = hammer_vop_readlink, 115 .vop_nremove = hammer_vop_nremove, 116 .vop_nrename = hammer_vop_nrename, 117 .vop_nrmdir = hammer_vop_nrmdir, 118 .vop_markatime = hammer_vop_markatime, 119 .vop_setattr = hammer_vop_setattr, 120 .vop_bmap = hammer_vop_bmap, 121 .vop_strategy = hammer_vop_strategy, 122 .vop_nsymlink = hammer_vop_nsymlink, 123 .vop_nwhiteout = hammer_vop_nwhiteout, 124 .vop_ioctl = hammer_vop_ioctl, 125 .vop_mountctl = hammer_vop_mountctl, 126 .vop_kqfilter = hammer_vop_kqfilter 127 }; 128 129 struct vop_ops hammer_spec_vops = { 130 .vop_default = vop_defaultop, 131 .vop_fsync = hammer_vop_fsync, 132 .vop_read = vop_stdnoread, 133 .vop_write = vop_stdnowrite, 134 .vop_access = hammer_vop_access, 135 .vop_close = hammer_vop_close, 136 .vop_markatime = hammer_vop_markatime, 137 .vop_getattr = hammer_vop_getattr, 138 .vop_inactive = hammer_vop_inactive, 139 .vop_reclaim = hammer_vop_reclaim, 140 .vop_setattr = hammer_vop_setattr 141 }; 142 143 struct vop_ops hammer_fifo_vops = { 144 .vop_default = fifo_vnoperate, 145 .vop_fsync = hammer_vop_fsync, 146 .vop_read = hammer_vop_fiforead, 147 .vop_write = hammer_vop_fifowrite, 148 .vop_access = hammer_vop_access, 149 .vop_close = hammer_vop_fifoclose, 150 .vop_markatime = hammer_vop_markatime, 151 .vop_getattr = hammer_vop_getattr, 152 .vop_inactive = hammer_vop_inactive, 153 .vop_reclaim = hammer_vop_reclaim, 154 .vop_setattr = hammer_vop_setattr, 155 .vop_kqfilter = hammer_vop_fifokqfilter 156 }; 157 158 static __inline 159 void 160 hammer_knote(struct vnode *vp, int flags) 161 { 162 if (flags) 163 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 164 } 165 166 #ifdef DEBUG_TRUNCATE 167 struct hammer_inode *HammerTruncIp; 168 #endif 169 170 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 171 struct vnode *dvp, struct ucred *cred, 172 int flags, int isdir); 173 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 174 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 175 176 #if 0 177 static 178 int 179 hammer_vop_vnoperate(struct vop_generic_args *) 180 { 181 return (VOCALL(&hammer_vnode_vops, ap)); 182 } 183 #endif 184 185 /* 186 * hammer_vop_fsync { vp, waitfor } 187 * 188 * fsync() an inode to disk and wait for it to be completely committed 189 * such that the information would not be undone if a crash occured after 190 * return. 191 * 192 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 193 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 194 * operation. 195 * 196 * Ultimately the combination of a REDO log and use of fast storage 197 * to front-end cluster caches will make fsync fast, but it aint 198 * here yet. And, in anycase, we need real transactional 199 * all-or-nothing features which are not restricted to a single file. 200 */ 201 static 202 int 203 hammer_vop_fsync(struct vop_fsync_args *ap) 204 { 205 hammer_inode_t ip = VTOI(ap->a_vp); 206 hammer_mount_t hmp = ip->hmp; 207 int waitfor = ap->a_waitfor; 208 int mode; 209 210 lwkt_gettoken(&hmp->fs_token); 211 212 /* 213 * Fsync rule relaxation (default is either full synchronous flush 214 * or REDO semantics with synchronous flush). 215 */ 216 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 217 switch(hammer_fsync_mode) { 218 case 0: 219 mode0: 220 /* no REDO, full synchronous flush */ 221 goto skip; 222 case 1: 223 mode1: 224 /* no REDO, full asynchronous flush */ 225 if (waitfor == MNT_WAIT) 226 waitfor = MNT_NOWAIT; 227 goto skip; 228 case 2: 229 /* REDO semantics, synchronous flush */ 230 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 231 goto mode0; 232 mode = HAMMER_FLUSH_UNDOS_AUTO; 233 break; 234 case 3: 235 /* REDO semantics, relaxed asynchronous flush */ 236 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 237 goto mode1; 238 mode = HAMMER_FLUSH_UNDOS_RELAXED; 239 if (waitfor == MNT_WAIT) 240 waitfor = MNT_NOWAIT; 241 break; 242 case 4: 243 /* ignore the fsync() system call */ 244 lwkt_reltoken(&hmp->fs_token); 245 return(0); 246 default: 247 /* we have to do something */ 248 mode = HAMMER_FLUSH_UNDOS_RELAXED; 249 if (waitfor == MNT_WAIT) 250 waitfor = MNT_NOWAIT; 251 break; 252 } 253 254 /* 255 * Fast fsync only needs to flush the UNDO/REDO fifo if 256 * HAMMER_INODE_REDO is non-zero and the only modifications 257 * made to the file are write or write-extends. 258 */ 259 if ((ip->flags & HAMMER_INODE_REDO) && 260 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0 261 ) { 262 ++hammer_count_fsyncs; 263 hammer_flusher_flush_undos(hmp, mode); 264 ip->redo_count = 0; 265 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 266 vclrisdirty(ip->vp); 267 lwkt_reltoken(&hmp->fs_token); 268 return(0); 269 } 270 271 /* 272 * REDO is enabled by fsync(), the idea being we really only 273 * want to lay down REDO records when programs are using 274 * fsync() heavily. The first fsync() on the file starts 275 * the gravy train going and later fsync()s keep it hot by 276 * resetting the redo_count. 277 * 278 * We weren't running REDOs before now so we have to fall 279 * through and do a full fsync of what we have. 280 */ 281 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 282 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 283 ip->flags |= HAMMER_INODE_REDO; 284 ip->redo_count = 0; 285 } 286 } 287 skip: 288 289 /* 290 * Do a full flush sequence. 291 * 292 * Attempt to release the vnode while waiting for the inode to 293 * finish flushing. This can really mess up inactive->reclaim 294 * sequences so only do it if the vnode is active. 295 * 296 * WARNING! The VX lock functions must be used. vn_lock() will 297 * fail when this is part of a VOP_RECLAIM sequence. 298 */ 299 ++hammer_count_fsyncs; 300 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 301 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 302 if (waitfor == MNT_WAIT) { 303 int dorelock; 304 305 if ((ap->a_vp->v_flag & VRECLAIMED) == 0) { 306 vx_unlock(ap->a_vp); 307 dorelock = 1; 308 } else { 309 dorelock = 0; 310 } 311 hammer_wait_inode(ip); 312 if (dorelock) 313 vx_lock(ap->a_vp); 314 } 315 if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0) 316 vclrisdirty(ip->vp); 317 lwkt_reltoken(&hmp->fs_token); 318 return (ip->error); 319 } 320 321 /* 322 * hammer_vop_read { vp, uio, ioflag, cred } 323 * 324 * MPSAFE (for the cache safe does not require fs_token) 325 */ 326 static 327 int 328 hammer_vop_read(struct vop_read_args *ap) 329 { 330 struct hammer_transaction trans; 331 hammer_inode_t ip; 332 hammer_mount_t hmp; 333 off_t offset; 334 struct buf *bp; 335 struct uio *uio; 336 int error; 337 int n; 338 int seqcount; 339 int ioseqcount; 340 int blksize; 341 int bigread; 342 int got_trans; 343 size_t resid; 344 345 if (ap->a_vp->v_type != VREG) 346 return (EINVAL); 347 ip = VTOI(ap->a_vp); 348 hmp = ip->hmp; 349 error = 0; 350 got_trans = 0; 351 uio = ap->a_uio; 352 353 /* 354 * Attempt to shortcut directly to the VM object using lwbufs. 355 * This is much faster than instantiating buffer cache buffers. 356 */ 357 resid = uio->uio_resid; 358 error = vop_helper_read_shortcut(ap); 359 hammer_stats_file_read += resid - uio->uio_resid; 360 if (error) 361 return (error); 362 if (uio->uio_resid == 0) 363 goto finished; 364 365 /* 366 * Allow the UIO's size to override the sequential heuristic. 367 */ 368 blksize = hammer_blocksize(uio->uio_offset); 369 seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; 370 ioseqcount = (ap->a_ioflag >> 16); 371 if (seqcount < ioseqcount) 372 seqcount = ioseqcount; 373 374 /* 375 * If reading or writing a huge amount of data we have to break 376 * atomicy and allow the operation to be interrupted by a signal 377 * or it can DOS the machine. 378 */ 379 bigread = (uio->uio_resid > 100 * 1024 * 1024); 380 381 /* 382 * Access the data typically in HAMMER_BUFSIZE blocks via the 383 * buffer cache, but HAMMER may use a variable block size based 384 * on the offset. 385 * 386 * XXX Temporary hack, delay the start transaction while we remain 387 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 388 * locked-shared. 389 */ 390 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 391 int64_t base_offset; 392 int64_t file_limit; 393 394 blksize = hammer_blocksize(uio->uio_offset); 395 offset = (int)uio->uio_offset & (blksize - 1); 396 base_offset = uio->uio_offset - offset; 397 398 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 399 break; 400 401 /* 402 * MPSAFE 403 */ 404 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 405 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 406 bp->b_flags &= ~B_AGE; 407 error = 0; 408 goto skip; 409 } 410 if (ap->a_ioflag & IO_NRDELAY) { 411 bqrelse(bp); 412 return (EWOULDBLOCK); 413 } 414 415 /* 416 * MPUNSAFE 417 */ 418 if (got_trans == 0) { 419 hammer_start_transaction(&trans, ip->hmp); 420 got_trans = 1; 421 } 422 423 /* 424 * NOTE: A valid bp has already been acquired, but was not 425 * B_CACHE. 426 */ 427 if (hammer_cluster_enable) { 428 /* 429 * Use file_limit to prevent cluster_read() from 430 * creating buffers of the wrong block size past 431 * the demarc. 432 */ 433 file_limit = ip->ino_data.size; 434 if (base_offset < HAMMER_XDEMARC && 435 file_limit > HAMMER_XDEMARC) { 436 file_limit = HAMMER_XDEMARC; 437 } 438 error = cluster_readx(ap->a_vp, 439 file_limit, base_offset, 440 blksize, uio->uio_resid, 441 seqcount * BKVASIZE, &bp); 442 } else { 443 error = breadnx(ap->a_vp, base_offset, blksize, 444 NULL, NULL, 0, &bp); 445 } 446 if (error) { 447 brelse(bp); 448 break; 449 } 450 skip: 451 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { 452 kprintf("doff %016jx read file %016jx@%016jx\n", 453 (intmax_t)bp->b_bio2.bio_offset, 454 (intmax_t)ip->obj_id, 455 (intmax_t)bp->b_loffset); 456 } 457 bp->b_flags &= ~B_IODEBUG; 458 if (blksize == HAMMER_XBUFSIZE) 459 bp->b_flags |= B_CLUSTEROK; 460 461 n = blksize - offset; 462 if (n > uio->uio_resid) 463 n = uio->uio_resid; 464 if (n > ip->ino_data.size - uio->uio_offset) 465 n = (int)(ip->ino_data.size - uio->uio_offset); 466 467 /* 468 * Set B_AGE, data has a lower priority than meta-data. 469 * 470 * Use a hold/unlock/drop sequence to run the uiomove 471 * with the buffer unlocked, avoiding deadlocks against 472 * read()s on mmap()'d spaces. 473 */ 474 bp->b_flags |= B_AGE; 475 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio); 476 bqrelse(bp); 477 478 if (error) 479 break; 480 hammer_stats_file_read += n; 481 } 482 483 finished: 484 485 /* 486 * Try to update the atime with just the inode lock for maximum 487 * concurrency. If we can't shortcut it we have to get the full 488 * blown transaction. 489 */ 490 if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) { 491 hammer_start_transaction(&trans, ip->hmp); 492 got_trans = 1; 493 } 494 495 if (got_trans) { 496 if ((ip->flags & HAMMER_INODE_RO) == 0 && 497 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 498 lwkt_gettoken(&hmp->fs_token); 499 ip->ino_data.atime = trans.time; 500 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 501 hammer_done_transaction(&trans); 502 lwkt_reltoken(&hmp->fs_token); 503 } else { 504 hammer_done_transaction(&trans); 505 } 506 } 507 return (error); 508 } 509 510 /* 511 * hammer_vop_write { vp, uio, ioflag, cred } 512 */ 513 static 514 int 515 hammer_vop_write(struct vop_write_args *ap) 516 { 517 struct hammer_transaction trans; 518 struct hammer_inode *ip; 519 hammer_mount_t hmp; 520 thread_t td; 521 struct uio *uio; 522 int offset; 523 off_t base_offset; 524 int64_t cluster_eof; 525 struct buf *bp; 526 int kflags; 527 int error; 528 int n; 529 int flags; 530 int seqcount; 531 int bigwrite; 532 533 if (ap->a_vp->v_type != VREG) 534 return (EINVAL); 535 ip = VTOI(ap->a_vp); 536 hmp = ip->hmp; 537 error = 0; 538 kflags = 0; 539 seqcount = ap->a_ioflag >> 16; 540 541 if (ip->flags & HAMMER_INODE_RO) 542 return (EROFS); 543 544 /* 545 * Create a transaction to cover the operations we perform. 546 */ 547 hammer_start_transaction(&trans, hmp); 548 uio = ap->a_uio; 549 550 /* 551 * Check append mode 552 */ 553 if (ap->a_ioflag & IO_APPEND) 554 uio->uio_offset = ip->ino_data.size; 555 556 /* 557 * Check for illegal write offsets. Valid range is 0...2^63-1. 558 * 559 * NOTE: the base_off assignment is required to work around what 560 * I consider to be a GCC-4 optimization bug. 561 */ 562 if (uio->uio_offset < 0) { 563 hammer_done_transaction(&trans); 564 return (EFBIG); 565 } 566 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 567 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 568 hammer_done_transaction(&trans); 569 return (EFBIG); 570 } 571 572 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 573 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 574 hammer_done_transaction(&trans); 575 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 576 return (EFBIG); 577 } 578 579 /* 580 * If reading or writing a huge amount of data we have to break 581 * atomicy and allow the operation to be interrupted by a signal 582 * or it can DOS the machine. 583 * 584 * Preset redo_count so we stop generating REDOs earlier if the 585 * limit is exceeded. 586 * 587 * redo_count is heuristical, SMP races are ok 588 */ 589 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 590 if ((ip->flags & HAMMER_INODE_REDO) && 591 ip->redo_count < hammer_limit_redo) { 592 ip->redo_count += uio->uio_resid; 593 } 594 595 /* 596 * Access the data typically in HAMMER_BUFSIZE blocks via the 597 * buffer cache, but HAMMER may use a variable block size based 598 * on the offset. 599 */ 600 while (uio->uio_resid > 0) { 601 int fixsize = 0; 602 int blksize; 603 int blkmask; 604 int trivial; 605 int endofblk; 606 off_t nsize; 607 608 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 609 break; 610 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 611 break; 612 613 blksize = hammer_blocksize(uio->uio_offset); 614 615 /* 616 * Control the number of pending records associated with 617 * this inode. If too many have accumulated start a 618 * flush. Try to maintain a pipeline with the flusher. 619 * 620 * NOTE: It is possible for other sources to grow the 621 * records but not necessarily issue another flush, 622 * so use a timeout and ensure that a re-flush occurs. 623 */ 624 if (ip->rsv_recs >= hammer_limit_inode_recs) { 625 lwkt_gettoken(&hmp->fs_token); 626 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 627 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 628 ip->flags |= HAMMER_INODE_RECSW; 629 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 630 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 631 } 632 lwkt_reltoken(&hmp->fs_token); 633 } 634 635 /* 636 * Do not allow HAMMER to blow out the buffer cache. Very 637 * large UIOs can lockout other processes due to bwillwrite() 638 * mechanics. 639 * 640 * The hammer inode is not locked during these operations. 641 * The vnode is locked which can interfere with the pageout 642 * daemon for non-UIO_NOCOPY writes but should not interfere 643 * with the buffer cache. Even so, we cannot afford to 644 * allow the pageout daemon to build up too many dirty buffer 645 * cache buffers. 646 * 647 * Only call this if we aren't being recursively called from 648 * a virtual disk device (vn), else we may deadlock. 649 */ 650 if ((ap->a_ioflag & IO_RECURSE) == 0) 651 bwillwrite(blksize); 652 653 /* 654 * Calculate the blocksize at the current offset and figure 655 * out how much we can actually write. 656 */ 657 blkmask = blksize - 1; 658 offset = (int)uio->uio_offset & blkmask; 659 base_offset = uio->uio_offset & ~(int64_t)blkmask; 660 n = blksize - offset; 661 if (n > uio->uio_resid) { 662 n = uio->uio_resid; 663 endofblk = 0; 664 } else { 665 endofblk = 1; 666 } 667 nsize = uio->uio_offset + n; 668 if (nsize > ip->ino_data.size) { 669 if (uio->uio_offset > ip->ino_data.size) 670 trivial = 0; 671 else 672 trivial = 1; 673 nvextendbuf(ap->a_vp, 674 ip->ino_data.size, 675 nsize, 676 hammer_blocksize(ip->ino_data.size), 677 hammer_blocksize(nsize), 678 hammer_blockoff(ip->ino_data.size), 679 hammer_blockoff(nsize), 680 trivial); 681 fixsize = 1; 682 kflags |= NOTE_EXTEND; 683 } 684 685 if (uio->uio_segflg == UIO_NOCOPY) { 686 /* 687 * Issuing a write with the same data backing the 688 * buffer. Instantiate the buffer to collect the 689 * backing vm pages, then read-in any missing bits. 690 * 691 * This case is used by vop_stdputpages(). 692 */ 693 bp = getblk(ap->a_vp, base_offset, 694 blksize, GETBLK_BHEAVY, 0); 695 if ((bp->b_flags & B_CACHE) == 0) { 696 bqrelse(bp); 697 error = bread(ap->a_vp, base_offset, 698 blksize, &bp); 699 } 700 } else if (offset == 0 && uio->uio_resid >= blksize) { 701 /* 702 * Even though we are entirely overwriting the buffer 703 * we may still have to zero it out to avoid a 704 * mmap/write visibility issue. 705 */ 706 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 707 if ((bp->b_flags & B_CACHE) == 0) 708 vfs_bio_clrbuf(bp); 709 } else if (base_offset >= ip->ino_data.size) { 710 /* 711 * If the base offset of the buffer is beyond the 712 * file EOF, we don't have to issue a read. 713 */ 714 bp = getblk(ap->a_vp, base_offset, 715 blksize, GETBLK_BHEAVY, 0); 716 vfs_bio_clrbuf(bp); 717 } else { 718 /* 719 * Partial overwrite, read in any missing bits then 720 * replace the portion being written. 721 */ 722 error = bread(ap->a_vp, base_offset, blksize, &bp); 723 if (error == 0) 724 bheavy(bp); 725 } 726 if (error == 0) 727 error = uiomovebp(bp, bp->b_data + offset, n, uio); 728 729 lwkt_gettoken(&hmp->fs_token); 730 731 /* 732 * Generate REDO records if enabled and redo_count will not 733 * exceeded the limit. 734 * 735 * If redo_count exceeds the limit we stop generating records 736 * and clear HAMMER_INODE_REDO. This will cause the next 737 * fsync() to do a full meta-data sync instead of just an 738 * UNDO/REDO fifo update. 739 * 740 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 741 * will still be tracked. The tracks will be terminated 742 * when the related meta-data (including possible data 743 * modifications which are not tracked via REDO) is 744 * flushed. 745 */ 746 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 747 if (ip->redo_count < hammer_limit_redo) { 748 bp->b_flags |= B_VFSFLAG1; 749 error = hammer_generate_redo(&trans, ip, 750 base_offset + offset, 751 HAMMER_REDO_WRITE, 752 bp->b_data + offset, 753 (size_t)n); 754 } else { 755 ip->flags &= ~HAMMER_INODE_REDO; 756 } 757 } 758 759 /* 760 * If we screwed up we have to undo any VM size changes we 761 * made. 762 */ 763 if (error) { 764 brelse(bp); 765 if (fixsize) { 766 nvtruncbuf(ap->a_vp, ip->ino_data.size, 767 hammer_blocksize(ip->ino_data.size), 768 hammer_blockoff(ip->ino_data.size), 769 0); 770 } 771 break; 772 } 773 kflags |= NOTE_WRITE; 774 hammer_stats_file_write += n; 775 if (blksize == HAMMER_XBUFSIZE) 776 bp->b_flags |= B_CLUSTEROK; 777 if (ip->ino_data.size < uio->uio_offset) { 778 ip->ino_data.size = uio->uio_offset; 779 flags = HAMMER_INODE_SDIRTY; 780 } else { 781 flags = 0; 782 } 783 ip->ino_data.mtime = trans.time; 784 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 785 hammer_modify_inode(&trans, ip, flags); 786 787 /* 788 * Once we dirty the buffer any cached zone-X offset 789 * becomes invalid. HAMMER NOTE: no-history mode cannot 790 * allow overwriting over the same data sector unless 791 * we provide UNDOs for the old data, which we don't. 792 */ 793 bp->b_bio2.bio_offset = NOOFFSET; 794 795 lwkt_reltoken(&hmp->fs_token); 796 797 /* 798 * Final buffer disposition. 799 * 800 * Because meta-data updates are deferred, HAMMER is 801 * especially sensitive to excessive bdwrite()s because 802 * the I/O stream is not broken up by disk reads. So the 803 * buffer cache simply cannot keep up. 804 * 805 * WARNING! blksize is variable. cluster_write() is 806 * expected to not blow up if it encounters 807 * buffers that do not match the passed blksize. 808 * 809 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 810 * The ip->rsv_recs check should burst-flush the data. 811 * If we queue it immediately the buf could be left 812 * locked on the device queue for a very long time. 813 * 814 * However, failing to flush a dirty buffer out when 815 * issued from the pageout daemon can result in a low 816 * memory deadlock against bio_page_alloc(), so we 817 * have to bawrite() on IO_ASYNC as well. 818 * 819 * NOTE! To avoid degenerate stalls due to mismatched block 820 * sizes we only honor IO_DIRECT on the write which 821 * abuts the end of the buffer. However, we must 822 * honor IO_SYNC in case someone is silly enough to 823 * configure a HAMMER file as swap, or when HAMMER 824 * is serving NFS (for commits). Ick ick. 825 */ 826 bp->b_flags |= B_AGE; 827 if (blksize == HAMMER_XBUFSIZE) 828 bp->b_flags |= B_CLUSTEROK; 829 830 if (ap->a_ioflag & IO_SYNC) { 831 bwrite(bp); 832 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 833 bawrite(bp); 834 } else if (ap->a_ioflag & IO_ASYNC) { 835 bawrite(bp); 836 } else if (hammer_cluster_enable && 837 !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 838 if (base_offset < HAMMER_XDEMARC) 839 cluster_eof = hammer_blockdemarc(base_offset, 840 ip->ino_data.size); 841 else 842 cluster_eof = ip->ino_data.size; 843 cluster_write(bp, cluster_eof, blksize, seqcount); 844 } else { 845 bdwrite(bp); 846 } 847 } 848 hammer_done_transaction(&trans); 849 hammer_knote(ap->a_vp, kflags); 850 851 return (error); 852 } 853 854 /* 855 * hammer_vop_access { vp, mode, cred } 856 * 857 * MPSAFE - does not require fs_token 858 */ 859 static 860 int 861 hammer_vop_access(struct vop_access_args *ap) 862 { 863 struct hammer_inode *ip = VTOI(ap->a_vp); 864 uid_t uid; 865 gid_t gid; 866 int error; 867 868 ++hammer_stats_file_iopsr; 869 uid = hammer_to_unix_xid(&ip->ino_data.uid); 870 gid = hammer_to_unix_xid(&ip->ino_data.gid); 871 872 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 873 ip->ino_data.uflags); 874 return (error); 875 } 876 877 /* 878 * hammer_vop_advlock { vp, id, op, fl, flags } 879 * 880 * MPSAFE - does not require fs_token 881 */ 882 static 883 int 884 hammer_vop_advlock(struct vop_advlock_args *ap) 885 { 886 hammer_inode_t ip = VTOI(ap->a_vp); 887 888 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 889 } 890 891 /* 892 * hammer_vop_close { vp, fflag } 893 * 894 * We can only sync-on-close for normal closes. XXX disabled for now. 895 */ 896 static 897 int 898 hammer_vop_close(struct vop_close_args *ap) 899 { 900 #if 0 901 struct vnode *vp = ap->a_vp; 902 hammer_inode_t ip = VTOI(vp); 903 int waitfor; 904 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 905 if (vn_islocked(vp) == LK_EXCLUSIVE && 906 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 907 if (ip->flags & HAMMER_INODE_CLOSESYNC) 908 waitfor = MNT_WAIT; 909 else 910 waitfor = MNT_NOWAIT; 911 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 912 HAMMER_INODE_CLOSEASYNC); 913 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 914 } 915 } 916 #endif 917 return (vop_stdclose(ap)); 918 } 919 920 /* 921 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 922 * 923 * The operating system has already ensured that the directory entry 924 * does not exist and done all appropriate namespace locking. 925 */ 926 static 927 int 928 hammer_vop_ncreate(struct vop_ncreate_args *ap) 929 { 930 struct hammer_transaction trans; 931 struct hammer_inode *dip; 932 struct hammer_inode *nip; 933 struct nchandle *nch; 934 hammer_mount_t hmp; 935 int error; 936 937 nch = ap->a_nch; 938 dip = VTOI(ap->a_dvp); 939 hmp = dip->hmp; 940 941 if (dip->flags & HAMMER_INODE_RO) 942 return (EROFS); 943 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 944 return (error); 945 946 /* 947 * Create a transaction to cover the operations we perform. 948 */ 949 lwkt_gettoken(&hmp->fs_token); 950 hammer_start_transaction(&trans, hmp); 951 ++hammer_stats_file_iopsw; 952 953 /* 954 * Create a new filesystem object of the requested type. The 955 * returned inode will be referenced and shared-locked to prevent 956 * it from being moved to the flusher. 957 */ 958 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 959 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 960 NULL, &nip); 961 if (error) { 962 hkprintf("hammer_create_inode error %d\n", error); 963 hammer_done_transaction(&trans); 964 *ap->a_vpp = NULL; 965 lwkt_reltoken(&hmp->fs_token); 966 return (error); 967 } 968 969 /* 970 * Add the new filesystem object to the directory. This will also 971 * bump the inode's link count. 972 */ 973 error = hammer_ip_add_directory(&trans, dip, 974 nch->ncp->nc_name, nch->ncp->nc_nlen, 975 nip); 976 if (error) 977 hkprintf("hammer_ip_add_directory error %d\n", error); 978 979 /* 980 * Finish up. 981 */ 982 if (error) { 983 hammer_rel_inode(nip, 0); 984 hammer_done_transaction(&trans); 985 *ap->a_vpp = NULL; 986 } else { 987 error = hammer_get_vnode(nip, ap->a_vpp); 988 hammer_done_transaction(&trans); 989 hammer_rel_inode(nip, 0); 990 if (error == 0) { 991 cache_setunresolved(ap->a_nch); 992 cache_setvp(ap->a_nch, *ap->a_vpp); 993 } 994 hammer_knote(ap->a_dvp, NOTE_WRITE); 995 } 996 lwkt_reltoken(&hmp->fs_token); 997 return (error); 998 } 999 1000 /* 1001 * hammer_vop_getattr { vp, vap } 1002 * 1003 * Retrieve an inode's attribute information. When accessing inodes 1004 * historically we fake the atime field to ensure consistent results. 1005 * The atime field is stored in the B-Tree element and allowed to be 1006 * updated without cycling the element. 1007 * 1008 * MPSAFE - does not require fs_token 1009 */ 1010 static 1011 int 1012 hammer_vop_getattr(struct vop_getattr_args *ap) 1013 { 1014 struct hammer_inode *ip = VTOI(ap->a_vp); 1015 struct vattr *vap = ap->a_vap; 1016 1017 /* 1018 * We want the fsid to be different when accessing a filesystem 1019 * with different as-of's so programs like diff don't think 1020 * the files are the same. 1021 * 1022 * We also want the fsid to be the same when comparing snapshots, 1023 * or when comparing mirrors (which might be backed by different 1024 * physical devices). HAMMER fsids are based on the PFS's 1025 * shared_uuid field. 1026 * 1027 * XXX there is a chance of collision here. The va_fsid reported 1028 * by stat is different from the more involved fsid used in the 1029 * mount structure. 1030 */ 1031 ++hammer_stats_file_iopsr; 1032 hammer_lock_sh(&ip->lock); 1033 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 1034 (u_int32_t)(ip->obj_asof >> 32); 1035 1036 vap->va_fileid = ip->ino_leaf.base.obj_id; 1037 vap->va_mode = ip->ino_data.mode; 1038 vap->va_nlink = ip->ino_data.nlinks; 1039 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1040 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1041 vap->va_rmajor = 0; 1042 vap->va_rminor = 0; 1043 vap->va_size = ip->ino_data.size; 1044 1045 /* 1046 * Special case for @@PFS softlinks. The actual size of the 1047 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1048 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1049 */ 1050 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1051 ip->ino_data.size == 10 && 1052 ip->obj_asof == HAMMER_MAX_TID && 1053 ip->obj_localization == 0 && 1054 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1055 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1056 vap->va_size = 26; 1057 else 1058 vap->va_size = 10; 1059 } 1060 1061 /* 1062 * We must provide a consistent atime and mtime for snapshots 1063 * so people can do a 'tar cf - ... | md5' on them and get 1064 * consistent results. 1065 */ 1066 if (ip->flags & HAMMER_INODE_RO) { 1067 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1068 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1069 } else { 1070 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1071 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1072 } 1073 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1074 vap->va_flags = ip->ino_data.uflags; 1075 vap->va_gen = 1; /* hammer inums are unique for all time */ 1076 vap->va_blocksize = HAMMER_BUFSIZE; 1077 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1078 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1079 ~HAMMER_XBUFMASK64; 1080 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 1081 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1082 ~HAMMER_BUFMASK64; 1083 } else { 1084 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1085 } 1086 1087 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1088 vap->va_filerev = 0; /* XXX */ 1089 vap->va_uid_uuid = ip->ino_data.uid; 1090 vap->va_gid_uuid = ip->ino_data.gid; 1091 vap->va_fsid_uuid = ip->hmp->fsid; 1092 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1093 VA_FSID_UUID_VALID; 1094 1095 switch (ip->ino_data.obj_type) { 1096 case HAMMER_OBJTYPE_CDEV: 1097 case HAMMER_OBJTYPE_BDEV: 1098 vap->va_rmajor = ip->ino_data.rmajor; 1099 vap->va_rminor = ip->ino_data.rminor; 1100 break; 1101 default: 1102 break; 1103 } 1104 hammer_unlock(&ip->lock); 1105 return(0); 1106 } 1107 1108 /* 1109 * hammer_vop_nresolve { nch, dvp, cred } 1110 * 1111 * Locate the requested directory entry. 1112 */ 1113 static 1114 int 1115 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1116 { 1117 struct hammer_transaction trans; 1118 struct namecache *ncp; 1119 hammer_mount_t hmp; 1120 hammer_inode_t dip; 1121 hammer_inode_t ip; 1122 hammer_tid_t asof; 1123 struct hammer_cursor cursor; 1124 struct vnode *vp; 1125 int64_t namekey; 1126 int error; 1127 int i; 1128 int nlen; 1129 int flags; 1130 int ispfs; 1131 int64_t obj_id; 1132 u_int32_t localization; 1133 u_int32_t max_iterations; 1134 1135 /* 1136 * Misc initialization, plus handle as-of name extensions. Look for 1137 * the '@@' extension. Note that as-of files and directories cannot 1138 * be modified. 1139 */ 1140 dip = VTOI(ap->a_dvp); 1141 ncp = ap->a_nch->ncp; 1142 asof = dip->obj_asof; 1143 localization = dip->obj_localization; /* for code consistency */ 1144 nlen = ncp->nc_nlen; 1145 flags = dip->flags & HAMMER_INODE_RO; 1146 ispfs = 0; 1147 hmp = dip->hmp; 1148 1149 lwkt_gettoken(&hmp->fs_token); 1150 hammer_simple_transaction(&trans, hmp); 1151 ++hammer_stats_file_iopsr; 1152 1153 for (i = 0; i < nlen; ++i) { 1154 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1155 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1156 &ispfs, &asof, &localization); 1157 if (error != 0) { 1158 i = nlen; 1159 break; 1160 } 1161 if (asof != HAMMER_MAX_TID) 1162 flags |= HAMMER_INODE_RO; 1163 break; 1164 } 1165 } 1166 nlen = i; 1167 1168 /* 1169 * If this is a PFS softlink we dive into the PFS 1170 */ 1171 if (ispfs && nlen == 0) { 1172 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1173 asof, localization, 1174 flags, &error); 1175 if (error == 0) { 1176 error = hammer_get_vnode(ip, &vp); 1177 hammer_rel_inode(ip, 0); 1178 } else { 1179 vp = NULL; 1180 } 1181 if (error == 0) { 1182 vn_unlock(vp); 1183 cache_setvp(ap->a_nch, vp); 1184 vrele(vp); 1185 } 1186 goto done; 1187 } 1188 1189 /* 1190 * If there is no path component the time extension is relative to dip. 1191 * e.g. "fubar/@@<snapshot>" 1192 * 1193 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1194 * e.g. "fubar/.@@<snapshot>" 1195 * 1196 * ".." is handled by the kernel. We do not currently handle 1197 * "..@<snapshot>". 1198 */ 1199 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1200 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1201 asof, dip->obj_localization, 1202 flags, &error); 1203 if (error == 0) { 1204 error = hammer_get_vnode(ip, &vp); 1205 hammer_rel_inode(ip, 0); 1206 } else { 1207 vp = NULL; 1208 } 1209 if (error == 0) { 1210 vn_unlock(vp); 1211 cache_setvp(ap->a_nch, vp); 1212 vrele(vp); 1213 } 1214 goto done; 1215 } 1216 1217 /* 1218 * Calculate the namekey and setup the key range for the scan. This 1219 * works kinda like a chained hash table where the lower 32 bits 1220 * of the namekey synthesize the chain. 1221 * 1222 * The key range is inclusive of both key_beg and key_end. 1223 */ 1224 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 1225 &max_iterations); 1226 1227 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1228 cursor.key_beg.localization = dip->obj_localization + 1229 hammer_dir_localization(dip); 1230 cursor.key_beg.obj_id = dip->obj_id; 1231 cursor.key_beg.key = namekey; 1232 cursor.key_beg.create_tid = 0; 1233 cursor.key_beg.delete_tid = 0; 1234 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1235 cursor.key_beg.obj_type = 0; 1236 1237 cursor.key_end = cursor.key_beg; 1238 cursor.key_end.key += max_iterations; 1239 cursor.asof = asof; 1240 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1241 1242 /* 1243 * Scan all matching records (the chain), locate the one matching 1244 * the requested path component. 1245 * 1246 * The hammer_ip_*() functions merge in-memory records with on-disk 1247 * records for the purposes of the search. 1248 */ 1249 obj_id = 0; 1250 localization = HAMMER_DEF_LOCALIZATION; 1251 1252 if (error == 0) { 1253 error = hammer_ip_first(&cursor); 1254 while (error == 0) { 1255 error = hammer_ip_resolve_data(&cursor); 1256 if (error) 1257 break; 1258 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1259 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1260 obj_id = cursor.data->entry.obj_id; 1261 localization = cursor.data->entry.localization; 1262 break; 1263 } 1264 error = hammer_ip_next(&cursor); 1265 } 1266 } 1267 hammer_done_cursor(&cursor); 1268 1269 /* 1270 * Lookup the obj_id. This should always succeed. If it does not 1271 * the filesystem may be damaged and we return a dummy inode. 1272 */ 1273 if (error == 0) { 1274 ip = hammer_get_inode(&trans, dip, obj_id, 1275 asof, localization, 1276 flags, &error); 1277 if (error == ENOENT) { 1278 kprintf("HAMMER: WARNING: Missing " 1279 "inode for dirent \"%s\"\n" 1280 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n", 1281 ncp->nc_name, 1282 (long long)obj_id, (long long)asof, 1283 localization); 1284 error = 0; 1285 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1286 asof, localization, 1287 flags, &error); 1288 } 1289 if (error == 0) { 1290 error = hammer_get_vnode(ip, &vp); 1291 hammer_rel_inode(ip, 0); 1292 } else { 1293 vp = NULL; 1294 } 1295 if (error == 0) { 1296 vn_unlock(vp); 1297 cache_setvp(ap->a_nch, vp); 1298 vrele(vp); 1299 } 1300 } else if (error == ENOENT) { 1301 cache_setvp(ap->a_nch, NULL); 1302 } 1303 done: 1304 hammer_done_transaction(&trans); 1305 lwkt_reltoken(&hmp->fs_token); 1306 return (error); 1307 } 1308 1309 /* 1310 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1311 * 1312 * Locate the parent directory of a directory vnode. 1313 * 1314 * dvp is referenced but not locked. *vpp must be returned referenced and 1315 * locked. A parent_obj_id of 0 does not necessarily indicate that we are 1316 * at the root, instead it could indicate that the directory we were in was 1317 * removed. 1318 * 1319 * NOTE: as-of sequences are not linked into the directory structure. If 1320 * we are at the root with a different asof then the mount point, reload 1321 * the same directory with the mount point's asof. I'm not sure what this 1322 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1323 * get confused, but it hasn't been tested. 1324 */ 1325 static 1326 int 1327 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1328 { 1329 struct hammer_transaction trans; 1330 struct hammer_inode *dip; 1331 struct hammer_inode *ip; 1332 hammer_mount_t hmp; 1333 int64_t parent_obj_id; 1334 u_int32_t parent_obj_localization; 1335 hammer_tid_t asof; 1336 int error; 1337 1338 dip = VTOI(ap->a_dvp); 1339 asof = dip->obj_asof; 1340 hmp = dip->hmp; 1341 1342 /* 1343 * Whos are parent? This could be the root of a pseudo-filesystem 1344 * whos parent is in another localization domain. 1345 */ 1346 lwkt_gettoken(&hmp->fs_token); 1347 parent_obj_id = dip->ino_data.parent_obj_id; 1348 if (dip->obj_id == HAMMER_OBJID_ROOT) 1349 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1350 else 1351 parent_obj_localization = dip->obj_localization; 1352 1353 if (parent_obj_id == 0) { 1354 if (dip->obj_id == HAMMER_OBJID_ROOT && 1355 asof != hmp->asof) { 1356 parent_obj_id = dip->obj_id; 1357 asof = hmp->asof; 1358 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1359 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1360 (long long)dip->obj_asof); 1361 } else { 1362 *ap->a_vpp = NULL; 1363 lwkt_reltoken(&hmp->fs_token); 1364 return ENOENT; 1365 } 1366 } 1367 1368 hammer_simple_transaction(&trans, hmp); 1369 ++hammer_stats_file_iopsr; 1370 1371 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1372 asof, parent_obj_localization, 1373 dip->flags, &error); 1374 if (ip) { 1375 error = hammer_get_vnode(ip, ap->a_vpp); 1376 hammer_rel_inode(ip, 0); 1377 } else { 1378 *ap->a_vpp = NULL; 1379 } 1380 hammer_done_transaction(&trans); 1381 lwkt_reltoken(&hmp->fs_token); 1382 return (error); 1383 } 1384 1385 /* 1386 * hammer_vop_nlink { nch, dvp, vp, cred } 1387 */ 1388 static 1389 int 1390 hammer_vop_nlink(struct vop_nlink_args *ap) 1391 { 1392 struct hammer_transaction trans; 1393 struct hammer_inode *dip; 1394 struct hammer_inode *ip; 1395 struct nchandle *nch; 1396 hammer_mount_t hmp; 1397 int error; 1398 1399 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1400 return(EXDEV); 1401 1402 nch = ap->a_nch; 1403 dip = VTOI(ap->a_dvp); 1404 ip = VTOI(ap->a_vp); 1405 hmp = dip->hmp; 1406 1407 if (dip->obj_localization != ip->obj_localization) 1408 return(EXDEV); 1409 1410 if (dip->flags & HAMMER_INODE_RO) 1411 return (EROFS); 1412 if (ip->flags & HAMMER_INODE_RO) 1413 return (EROFS); 1414 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1415 return (error); 1416 1417 /* 1418 * Create a transaction to cover the operations we perform. 1419 */ 1420 lwkt_gettoken(&hmp->fs_token); 1421 hammer_start_transaction(&trans, hmp); 1422 ++hammer_stats_file_iopsw; 1423 1424 /* 1425 * Add the filesystem object to the directory. Note that neither 1426 * dip nor ip are referenced or locked, but their vnodes are 1427 * referenced. This function will bump the inode's link count. 1428 */ 1429 error = hammer_ip_add_directory(&trans, dip, 1430 nch->ncp->nc_name, nch->ncp->nc_nlen, 1431 ip); 1432 1433 /* 1434 * Finish up. 1435 */ 1436 if (error == 0) { 1437 cache_setunresolved(nch); 1438 cache_setvp(nch, ap->a_vp); 1439 } 1440 hammer_done_transaction(&trans); 1441 hammer_knote(ap->a_vp, NOTE_LINK); 1442 hammer_knote(ap->a_dvp, NOTE_WRITE); 1443 lwkt_reltoken(&hmp->fs_token); 1444 return (error); 1445 } 1446 1447 /* 1448 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1449 * 1450 * The operating system has already ensured that the directory entry 1451 * does not exist and done all appropriate namespace locking. 1452 */ 1453 static 1454 int 1455 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1456 { 1457 struct hammer_transaction trans; 1458 struct hammer_inode *dip; 1459 struct hammer_inode *nip; 1460 struct nchandle *nch; 1461 hammer_mount_t hmp; 1462 int error; 1463 1464 nch = ap->a_nch; 1465 dip = VTOI(ap->a_dvp); 1466 hmp = dip->hmp; 1467 1468 if (dip->flags & HAMMER_INODE_RO) 1469 return (EROFS); 1470 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1471 return (error); 1472 1473 /* 1474 * Create a transaction to cover the operations we perform. 1475 */ 1476 lwkt_gettoken(&hmp->fs_token); 1477 hammer_start_transaction(&trans, hmp); 1478 ++hammer_stats_file_iopsw; 1479 1480 /* 1481 * Create a new filesystem object of the requested type. The 1482 * returned inode will be referenced but not locked. 1483 */ 1484 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1485 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1486 NULL, &nip); 1487 if (error) { 1488 hkprintf("hammer_mkdir error %d\n", error); 1489 hammer_done_transaction(&trans); 1490 *ap->a_vpp = NULL; 1491 lwkt_reltoken(&hmp->fs_token); 1492 return (error); 1493 } 1494 /* 1495 * Add the new filesystem object to the directory. This will also 1496 * bump the inode's link count. 1497 */ 1498 error = hammer_ip_add_directory(&trans, dip, 1499 nch->ncp->nc_name, nch->ncp->nc_nlen, 1500 nip); 1501 if (error) 1502 hkprintf("hammer_mkdir (add) error %d\n", error); 1503 1504 /* 1505 * Finish up. 1506 */ 1507 if (error) { 1508 hammer_rel_inode(nip, 0); 1509 *ap->a_vpp = NULL; 1510 } else { 1511 error = hammer_get_vnode(nip, ap->a_vpp); 1512 hammer_rel_inode(nip, 0); 1513 if (error == 0) { 1514 cache_setunresolved(ap->a_nch); 1515 cache_setvp(ap->a_nch, *ap->a_vpp); 1516 } 1517 } 1518 hammer_done_transaction(&trans); 1519 if (error == 0) 1520 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1521 lwkt_reltoken(&hmp->fs_token); 1522 return (error); 1523 } 1524 1525 /* 1526 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1527 * 1528 * The operating system has already ensured that the directory entry 1529 * does not exist and done all appropriate namespace locking. 1530 */ 1531 static 1532 int 1533 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1534 { 1535 struct hammer_transaction trans; 1536 struct hammer_inode *dip; 1537 struct hammer_inode *nip; 1538 struct nchandle *nch; 1539 hammer_mount_t hmp; 1540 int error; 1541 1542 nch = ap->a_nch; 1543 dip = VTOI(ap->a_dvp); 1544 hmp = dip->hmp; 1545 1546 if (dip->flags & HAMMER_INODE_RO) 1547 return (EROFS); 1548 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1549 return (error); 1550 1551 /* 1552 * Create a transaction to cover the operations we perform. 1553 */ 1554 lwkt_gettoken(&hmp->fs_token); 1555 hammer_start_transaction(&trans, hmp); 1556 ++hammer_stats_file_iopsw; 1557 1558 /* 1559 * Create a new filesystem object of the requested type. The 1560 * returned inode will be referenced but not locked. 1561 * 1562 * If mknod specifies a directory a pseudo-fs is created. 1563 */ 1564 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1565 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1566 NULL, &nip); 1567 if (error) { 1568 hammer_done_transaction(&trans); 1569 *ap->a_vpp = NULL; 1570 lwkt_reltoken(&hmp->fs_token); 1571 return (error); 1572 } 1573 1574 /* 1575 * Add the new filesystem object to the directory. This will also 1576 * bump the inode's link count. 1577 */ 1578 error = hammer_ip_add_directory(&trans, dip, 1579 nch->ncp->nc_name, nch->ncp->nc_nlen, 1580 nip); 1581 1582 /* 1583 * Finish up. 1584 */ 1585 if (error) { 1586 hammer_rel_inode(nip, 0); 1587 *ap->a_vpp = NULL; 1588 } else { 1589 error = hammer_get_vnode(nip, ap->a_vpp); 1590 hammer_rel_inode(nip, 0); 1591 if (error == 0) { 1592 cache_setunresolved(ap->a_nch); 1593 cache_setvp(ap->a_nch, *ap->a_vpp); 1594 } 1595 } 1596 hammer_done_transaction(&trans); 1597 if (error == 0) 1598 hammer_knote(ap->a_dvp, NOTE_WRITE); 1599 lwkt_reltoken(&hmp->fs_token); 1600 return (error); 1601 } 1602 1603 /* 1604 * hammer_vop_open { vp, mode, cred, fp } 1605 * 1606 * MPSAFE (does not require fs_token) 1607 */ 1608 static 1609 int 1610 hammer_vop_open(struct vop_open_args *ap) 1611 { 1612 hammer_inode_t ip; 1613 1614 ++hammer_stats_file_iopsr; 1615 ip = VTOI(ap->a_vp); 1616 1617 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1618 return (EROFS); 1619 return(vop_stdopen(ap)); 1620 } 1621 1622 /* 1623 * hammer_vop_print { vp } 1624 */ 1625 static 1626 int 1627 hammer_vop_print(struct vop_print_args *ap) 1628 { 1629 return EOPNOTSUPP; 1630 } 1631 1632 /* 1633 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1634 */ 1635 static 1636 int 1637 hammer_vop_readdir(struct vop_readdir_args *ap) 1638 { 1639 struct hammer_transaction trans; 1640 struct hammer_cursor cursor; 1641 struct hammer_inode *ip; 1642 hammer_mount_t hmp; 1643 struct uio *uio; 1644 hammer_base_elm_t base; 1645 int error; 1646 int cookie_index; 1647 int ncookies; 1648 off_t *cookies; 1649 off_t saveoff; 1650 int r; 1651 int dtype; 1652 1653 ++hammer_stats_file_iopsr; 1654 ip = VTOI(ap->a_vp); 1655 uio = ap->a_uio; 1656 saveoff = uio->uio_offset; 1657 hmp = ip->hmp; 1658 1659 if (ap->a_ncookies) { 1660 ncookies = uio->uio_resid / 16 + 1; 1661 if (ncookies > 1024) 1662 ncookies = 1024; 1663 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1664 cookie_index = 0; 1665 } else { 1666 ncookies = -1; 1667 cookies = NULL; 1668 cookie_index = 0; 1669 } 1670 1671 lwkt_gettoken(&hmp->fs_token); 1672 hammer_simple_transaction(&trans, hmp); 1673 1674 /* 1675 * Handle artificial entries 1676 * 1677 * It should be noted that the minimum value for a directory 1678 * hash key on-media is 0x0000000100000000, so we can use anything 1679 * less then that to represent our 'special' key space. 1680 */ 1681 error = 0; 1682 if (saveoff == 0) { 1683 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1684 if (r) 1685 goto done; 1686 if (cookies) 1687 cookies[cookie_index] = saveoff; 1688 ++saveoff; 1689 ++cookie_index; 1690 if (cookie_index == ncookies) 1691 goto done; 1692 } 1693 if (saveoff == 1) { 1694 if (ip->ino_data.parent_obj_id) { 1695 r = vop_write_dirent(&error, uio, 1696 ip->ino_data.parent_obj_id, 1697 DT_DIR, 2, ".."); 1698 } else { 1699 r = vop_write_dirent(&error, uio, 1700 ip->obj_id, DT_DIR, 2, ".."); 1701 } 1702 if (r) 1703 goto done; 1704 if (cookies) 1705 cookies[cookie_index] = saveoff; 1706 ++saveoff; 1707 ++cookie_index; 1708 if (cookie_index == ncookies) 1709 goto done; 1710 } 1711 1712 /* 1713 * Key range (begin and end inclusive) to scan. Directory keys 1714 * directly translate to a 64 bit 'seek' position. 1715 */ 1716 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1717 cursor.key_beg.localization = ip->obj_localization + 1718 hammer_dir_localization(ip); 1719 cursor.key_beg.obj_id = ip->obj_id; 1720 cursor.key_beg.create_tid = 0; 1721 cursor.key_beg.delete_tid = 0; 1722 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1723 cursor.key_beg.obj_type = 0; 1724 cursor.key_beg.key = saveoff; 1725 1726 cursor.key_end = cursor.key_beg; 1727 cursor.key_end.key = HAMMER_MAX_KEY; 1728 cursor.asof = ip->obj_asof; 1729 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1730 1731 error = hammer_ip_first(&cursor); 1732 1733 while (error == 0) { 1734 error = hammer_ip_resolve_data(&cursor); 1735 if (error) 1736 break; 1737 base = &cursor.leaf->base; 1738 saveoff = base->key; 1739 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1740 1741 if (base->obj_id != ip->obj_id) 1742 panic("readdir: bad record at %p", cursor.node); 1743 1744 /* 1745 * Convert pseudo-filesystems into softlinks 1746 */ 1747 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1748 r = vop_write_dirent( 1749 &error, uio, cursor.data->entry.obj_id, 1750 dtype, 1751 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1752 (void *)cursor.data->entry.name); 1753 if (r) 1754 break; 1755 ++saveoff; 1756 if (cookies) 1757 cookies[cookie_index] = base->key; 1758 ++cookie_index; 1759 if (cookie_index == ncookies) 1760 break; 1761 error = hammer_ip_next(&cursor); 1762 } 1763 hammer_done_cursor(&cursor); 1764 1765 done: 1766 hammer_done_transaction(&trans); 1767 1768 if (ap->a_eofflag) 1769 *ap->a_eofflag = (error == ENOENT); 1770 uio->uio_offset = saveoff; 1771 if (error && cookie_index == 0) { 1772 if (error == ENOENT) 1773 error = 0; 1774 if (cookies) { 1775 kfree(cookies, M_TEMP); 1776 *ap->a_ncookies = 0; 1777 *ap->a_cookies = NULL; 1778 } 1779 } else { 1780 if (error == ENOENT) 1781 error = 0; 1782 if (cookies) { 1783 *ap->a_ncookies = cookie_index; 1784 *ap->a_cookies = cookies; 1785 } 1786 } 1787 lwkt_reltoken(&hmp->fs_token); 1788 return(error); 1789 } 1790 1791 /* 1792 * hammer_vop_readlink { vp, uio, cred } 1793 */ 1794 static 1795 int 1796 hammer_vop_readlink(struct vop_readlink_args *ap) 1797 { 1798 struct hammer_transaction trans; 1799 struct hammer_cursor cursor; 1800 struct hammer_inode *ip; 1801 hammer_mount_t hmp; 1802 char buf[32]; 1803 u_int32_t localization; 1804 hammer_pseudofs_inmem_t pfsm; 1805 int error; 1806 1807 ip = VTOI(ap->a_vp); 1808 hmp = ip->hmp; 1809 1810 lwkt_gettoken(&hmp->fs_token); 1811 1812 /* 1813 * Shortcut if the symlink data was stuffed into ino_data. 1814 * 1815 * Also expand special "@@PFS%05d" softlinks (expansion only 1816 * occurs for non-historical (current) accesses made from the 1817 * primary filesystem). 1818 */ 1819 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1820 char *ptr; 1821 int bytes; 1822 1823 ptr = ip->ino_data.ext.symlink; 1824 bytes = (int)ip->ino_data.size; 1825 if (bytes == 10 && 1826 ip->obj_asof == HAMMER_MAX_TID && 1827 ip->obj_localization == 0 && 1828 strncmp(ptr, "@@PFS", 5) == 0) { 1829 hammer_simple_transaction(&trans, hmp); 1830 bcopy(ptr + 5, buf, 5); 1831 buf[5] = 0; 1832 localization = strtoul(buf, NULL, 10) << 16; 1833 pfsm = hammer_load_pseudofs(&trans, localization, 1834 &error); 1835 if (error == 0) { 1836 if (pfsm->pfsd.mirror_flags & 1837 HAMMER_PFSD_SLAVE) { 1838 /* vap->va_size == 26 */ 1839 ksnprintf(buf, sizeof(buf), 1840 "@@0x%016llx:%05d", 1841 (long long)pfsm->pfsd.sync_end_tid, 1842 localization >> 16); 1843 } else { 1844 /* vap->va_size == 10 */ 1845 ksnprintf(buf, sizeof(buf), 1846 "@@-1:%05d", 1847 localization >> 16); 1848 #if 0 1849 ksnprintf(buf, sizeof(buf), 1850 "@@0x%016llx:%05d", 1851 (long long)HAMMER_MAX_TID, 1852 localization >> 16); 1853 #endif 1854 } 1855 ptr = buf; 1856 bytes = strlen(buf); 1857 } 1858 if (pfsm) 1859 hammer_rel_pseudofs(hmp, pfsm); 1860 hammer_done_transaction(&trans); 1861 } 1862 error = uiomove(ptr, bytes, ap->a_uio); 1863 lwkt_reltoken(&hmp->fs_token); 1864 return(error); 1865 } 1866 1867 /* 1868 * Long version 1869 */ 1870 hammer_simple_transaction(&trans, hmp); 1871 ++hammer_stats_file_iopsr; 1872 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1873 1874 /* 1875 * Key range (begin and end inclusive) to scan. Directory keys 1876 * directly translate to a 64 bit 'seek' position. 1877 */ 1878 cursor.key_beg.localization = ip->obj_localization + 1879 HAMMER_LOCALIZE_MISC; 1880 cursor.key_beg.obj_id = ip->obj_id; 1881 cursor.key_beg.create_tid = 0; 1882 cursor.key_beg.delete_tid = 0; 1883 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1884 cursor.key_beg.obj_type = 0; 1885 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1886 cursor.asof = ip->obj_asof; 1887 cursor.flags |= HAMMER_CURSOR_ASOF; 1888 1889 error = hammer_ip_lookup(&cursor); 1890 if (error == 0) { 1891 error = hammer_ip_resolve_data(&cursor); 1892 if (error == 0) { 1893 KKASSERT(cursor.leaf->data_len >= 1894 HAMMER_SYMLINK_NAME_OFF); 1895 error = uiomove(cursor.data->symlink.name, 1896 cursor.leaf->data_len - 1897 HAMMER_SYMLINK_NAME_OFF, 1898 ap->a_uio); 1899 } 1900 } 1901 hammer_done_cursor(&cursor); 1902 hammer_done_transaction(&trans); 1903 lwkt_reltoken(&hmp->fs_token); 1904 return(error); 1905 } 1906 1907 /* 1908 * hammer_vop_nremove { nch, dvp, cred } 1909 */ 1910 static 1911 int 1912 hammer_vop_nremove(struct vop_nremove_args *ap) 1913 { 1914 struct hammer_transaction trans; 1915 struct hammer_inode *dip; 1916 hammer_mount_t hmp; 1917 int error; 1918 1919 dip = VTOI(ap->a_dvp); 1920 hmp = dip->hmp; 1921 1922 if (hammer_nohistory(dip) == 0 && 1923 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1924 return (error); 1925 } 1926 1927 lwkt_gettoken(&hmp->fs_token); 1928 hammer_start_transaction(&trans, hmp); 1929 ++hammer_stats_file_iopsw; 1930 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1931 hammer_done_transaction(&trans); 1932 if (error == 0) 1933 hammer_knote(ap->a_dvp, NOTE_WRITE); 1934 lwkt_reltoken(&hmp->fs_token); 1935 return (error); 1936 } 1937 1938 /* 1939 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1940 */ 1941 static 1942 int 1943 hammer_vop_nrename(struct vop_nrename_args *ap) 1944 { 1945 struct hammer_transaction trans; 1946 struct namecache *fncp; 1947 struct namecache *tncp; 1948 struct hammer_inode *fdip; 1949 struct hammer_inode *tdip; 1950 struct hammer_inode *ip; 1951 hammer_mount_t hmp; 1952 struct hammer_cursor cursor; 1953 int64_t namekey; 1954 u_int32_t max_iterations; 1955 int nlen, error; 1956 1957 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1958 return(EXDEV); 1959 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1960 return(EXDEV); 1961 1962 fdip = VTOI(ap->a_fdvp); 1963 tdip = VTOI(ap->a_tdvp); 1964 fncp = ap->a_fnch->ncp; 1965 tncp = ap->a_tnch->ncp; 1966 ip = VTOI(fncp->nc_vp); 1967 KKASSERT(ip != NULL); 1968 1969 hmp = ip->hmp; 1970 1971 if (fdip->obj_localization != tdip->obj_localization) 1972 return(EXDEV); 1973 if (fdip->obj_localization != ip->obj_localization) 1974 return(EXDEV); 1975 1976 if (fdip->flags & HAMMER_INODE_RO) 1977 return (EROFS); 1978 if (tdip->flags & HAMMER_INODE_RO) 1979 return (EROFS); 1980 if (ip->flags & HAMMER_INODE_RO) 1981 return (EROFS); 1982 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1983 return (error); 1984 1985 lwkt_gettoken(&hmp->fs_token); 1986 hammer_start_transaction(&trans, hmp); 1987 ++hammer_stats_file_iopsw; 1988 1989 /* 1990 * Remove tncp from the target directory and then link ip as 1991 * tncp. XXX pass trans to dounlink 1992 * 1993 * Force the inode sync-time to match the transaction so it is 1994 * in-sync with the creation of the target directory entry. 1995 */ 1996 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1997 ap->a_cred, 0, -1); 1998 if (error == 0 || error == ENOENT) { 1999 error = hammer_ip_add_directory(&trans, tdip, 2000 tncp->nc_name, tncp->nc_nlen, 2001 ip); 2002 if (error == 0) { 2003 ip->ino_data.parent_obj_id = tdip->obj_id; 2004 ip->ino_data.ctime = trans.time; 2005 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 2006 } 2007 } 2008 if (error) 2009 goto failed; /* XXX */ 2010 2011 /* 2012 * Locate the record in the originating directory and remove it. 2013 * 2014 * Calculate the namekey and setup the key range for the scan. This 2015 * works kinda like a chained hash table where the lower 32 bits 2016 * of the namekey synthesize the chain. 2017 * 2018 * The key range is inclusive of both key_beg and key_end. 2019 */ 2020 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 2021 &max_iterations); 2022 retry: 2023 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 2024 cursor.key_beg.localization = fdip->obj_localization + 2025 hammer_dir_localization(fdip); 2026 cursor.key_beg.obj_id = fdip->obj_id; 2027 cursor.key_beg.key = namekey; 2028 cursor.key_beg.create_tid = 0; 2029 cursor.key_beg.delete_tid = 0; 2030 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2031 cursor.key_beg.obj_type = 0; 2032 2033 cursor.key_end = cursor.key_beg; 2034 cursor.key_end.key += max_iterations; 2035 cursor.asof = fdip->obj_asof; 2036 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2037 2038 /* 2039 * Scan all matching records (the chain), locate the one matching 2040 * the requested path component. 2041 * 2042 * The hammer_ip_*() functions merge in-memory records with on-disk 2043 * records for the purposes of the search. 2044 */ 2045 error = hammer_ip_first(&cursor); 2046 while (error == 0) { 2047 if (hammer_ip_resolve_data(&cursor) != 0) 2048 break; 2049 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2050 KKASSERT(nlen > 0); 2051 if (fncp->nc_nlen == nlen && 2052 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2053 break; 2054 } 2055 error = hammer_ip_next(&cursor); 2056 } 2057 2058 /* 2059 * If all is ok we have to get the inode so we can adjust nlinks. 2060 * 2061 * WARNING: hammer_ip_del_directory() may have to terminate the 2062 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2063 * twice. 2064 */ 2065 if (error == 0) 2066 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 2067 2068 /* 2069 * XXX A deadlock here will break rename's atomicy for the purposes 2070 * of crash recovery. 2071 */ 2072 if (error == EDEADLK) { 2073 hammer_done_cursor(&cursor); 2074 goto retry; 2075 } 2076 2077 /* 2078 * Cleanup and tell the kernel that the rename succeeded. 2079 * 2080 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2081 * without formally acquiring the vp since the vp might 2082 * have zero refs on it, or in the middle of a reclaim, 2083 * etc. 2084 */ 2085 hammer_done_cursor(&cursor); 2086 if (error == 0) { 2087 cache_rename(ap->a_fnch, ap->a_tnch); 2088 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2089 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2090 while (ip->vp) { 2091 struct vnode *vp; 2092 2093 error = hammer_get_vnode(ip, &vp); 2094 if (error == 0 && vp) { 2095 vn_unlock(vp); 2096 hammer_knote(ip->vp, NOTE_RENAME); 2097 vrele(vp); 2098 break; 2099 } 2100 kprintf("Debug: HAMMER ip/vp race2 avoided\n"); 2101 } 2102 } 2103 2104 failed: 2105 hammer_done_transaction(&trans); 2106 lwkt_reltoken(&hmp->fs_token); 2107 return (error); 2108 } 2109 2110 /* 2111 * hammer_vop_nrmdir { nch, dvp, cred } 2112 */ 2113 static 2114 int 2115 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2116 { 2117 struct hammer_transaction trans; 2118 struct hammer_inode *dip; 2119 hammer_mount_t hmp; 2120 int error; 2121 2122 dip = VTOI(ap->a_dvp); 2123 hmp = dip->hmp; 2124 2125 if (hammer_nohistory(dip) == 0 && 2126 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2127 return (error); 2128 } 2129 2130 lwkt_gettoken(&hmp->fs_token); 2131 hammer_start_transaction(&trans, hmp); 2132 ++hammer_stats_file_iopsw; 2133 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2134 hammer_done_transaction(&trans); 2135 if (error == 0) 2136 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2137 lwkt_reltoken(&hmp->fs_token); 2138 return (error); 2139 } 2140 2141 /* 2142 * hammer_vop_markatime { vp, cred } 2143 */ 2144 static 2145 int 2146 hammer_vop_markatime(struct vop_markatime_args *ap) 2147 { 2148 struct hammer_transaction trans; 2149 struct hammer_inode *ip; 2150 hammer_mount_t hmp; 2151 2152 ip = VTOI(ap->a_vp); 2153 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2154 return (EROFS); 2155 if (ip->flags & HAMMER_INODE_RO) 2156 return (EROFS); 2157 hmp = ip->hmp; 2158 if (hmp->mp->mnt_flag & MNT_NOATIME) 2159 return (0); 2160 lwkt_gettoken(&hmp->fs_token); 2161 hammer_start_transaction(&trans, hmp); 2162 ++hammer_stats_file_iopsw; 2163 2164 ip->ino_data.atime = trans.time; 2165 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2166 hammer_done_transaction(&trans); 2167 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2168 lwkt_reltoken(&hmp->fs_token); 2169 return (0); 2170 } 2171 2172 /* 2173 * hammer_vop_setattr { vp, vap, cred } 2174 */ 2175 static 2176 int 2177 hammer_vop_setattr(struct vop_setattr_args *ap) 2178 { 2179 struct hammer_transaction trans; 2180 struct hammer_inode *ip; 2181 struct vattr *vap; 2182 hammer_mount_t hmp; 2183 int modflags; 2184 int error; 2185 int truncating; 2186 int blksize; 2187 int kflags; 2188 #if 0 2189 int64_t aligned_size; 2190 #endif 2191 u_int32_t flags; 2192 2193 vap = ap->a_vap; 2194 ip = ap->a_vp->v_data; 2195 modflags = 0; 2196 kflags = 0; 2197 hmp = ip->hmp; 2198 2199 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2200 return(EROFS); 2201 if (ip->flags & HAMMER_INODE_RO) 2202 return (EROFS); 2203 if (hammer_nohistory(ip) == 0 && 2204 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2205 return (error); 2206 } 2207 2208 lwkt_gettoken(&hmp->fs_token); 2209 hammer_start_transaction(&trans, hmp); 2210 ++hammer_stats_file_iopsw; 2211 error = 0; 2212 2213 if (vap->va_flags != VNOVAL) { 2214 flags = ip->ino_data.uflags; 2215 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2216 hammer_to_unix_xid(&ip->ino_data.uid), 2217 ap->a_cred); 2218 if (error == 0) { 2219 if (ip->ino_data.uflags != flags) { 2220 ip->ino_data.uflags = flags; 2221 ip->ino_data.ctime = trans.time; 2222 modflags |= HAMMER_INODE_DDIRTY; 2223 kflags |= NOTE_ATTRIB; 2224 } 2225 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2226 error = 0; 2227 goto done; 2228 } 2229 } 2230 goto done; 2231 } 2232 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2233 error = EPERM; 2234 goto done; 2235 } 2236 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2237 mode_t cur_mode = ip->ino_data.mode; 2238 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2239 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2240 uuid_t uuid_uid; 2241 uuid_t uuid_gid; 2242 2243 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2244 ap->a_cred, 2245 &cur_uid, &cur_gid, &cur_mode); 2246 if (error == 0) { 2247 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2248 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2249 if (bcmp(&uuid_uid, &ip->ino_data.uid, 2250 sizeof(uuid_uid)) || 2251 bcmp(&uuid_gid, &ip->ino_data.gid, 2252 sizeof(uuid_gid)) || 2253 ip->ino_data.mode != cur_mode 2254 ) { 2255 ip->ino_data.uid = uuid_uid; 2256 ip->ino_data.gid = uuid_gid; 2257 ip->ino_data.mode = cur_mode; 2258 ip->ino_data.ctime = trans.time; 2259 modflags |= HAMMER_INODE_DDIRTY; 2260 } 2261 kflags |= NOTE_ATTRIB; 2262 } 2263 } 2264 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2265 switch(ap->a_vp->v_type) { 2266 case VREG: 2267 if (vap->va_size == ip->ino_data.size) 2268 break; 2269 2270 /* 2271 * Log the operation if in fast-fsync mode or if 2272 * there are unterminated redo write records present. 2273 * 2274 * The second check is needed so the recovery code 2275 * properly truncates write redos even if nominal 2276 * REDO operations is turned off due to excessive 2277 * writes, because the related records might be 2278 * destroyed and never lay down a TERM_WRITE. 2279 */ 2280 if ((ip->flags & HAMMER_INODE_REDO) || 2281 (ip->flags & HAMMER_INODE_RDIRTY)) { 2282 error = hammer_generate_redo(&trans, ip, 2283 vap->va_size, 2284 HAMMER_REDO_TRUNC, 2285 NULL, 0); 2286 } 2287 blksize = hammer_blocksize(vap->va_size); 2288 2289 /* 2290 * XXX break atomicy, we can deadlock the backend 2291 * if we do not release the lock. Probably not a 2292 * big deal here. 2293 */ 2294 if (vap->va_size < ip->ino_data.size) { 2295 nvtruncbuf(ap->a_vp, vap->va_size, 2296 blksize, 2297 hammer_blockoff(vap->va_size), 2298 0); 2299 truncating = 1; 2300 kflags |= NOTE_WRITE; 2301 } else { 2302 nvextendbuf(ap->a_vp, 2303 ip->ino_data.size, 2304 vap->va_size, 2305 hammer_blocksize(ip->ino_data.size), 2306 hammer_blocksize(vap->va_size), 2307 hammer_blockoff(ip->ino_data.size), 2308 hammer_blockoff(vap->va_size), 2309 0); 2310 truncating = 0; 2311 kflags |= NOTE_WRITE | NOTE_EXTEND; 2312 } 2313 ip->ino_data.size = vap->va_size; 2314 ip->ino_data.mtime = trans.time; 2315 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2316 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2317 2318 /* 2319 * On-media truncation is cached in the inode until 2320 * the inode is synchronized. We must immediately 2321 * handle any frontend records. 2322 */ 2323 if (truncating) { 2324 hammer_ip_frontend_trunc(ip, vap->va_size); 2325 #ifdef DEBUG_TRUNCATE 2326 if (HammerTruncIp == NULL) 2327 HammerTruncIp = ip; 2328 #endif 2329 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2330 ip->flags |= HAMMER_INODE_TRUNCATED; 2331 ip->trunc_off = vap->va_size; 2332 hammer_inode_dirty(ip); 2333 #ifdef DEBUG_TRUNCATE 2334 if (ip == HammerTruncIp) 2335 kprintf("truncate1 %016llx\n", 2336 (long long)ip->trunc_off); 2337 #endif 2338 } else if (ip->trunc_off > vap->va_size) { 2339 ip->trunc_off = vap->va_size; 2340 #ifdef DEBUG_TRUNCATE 2341 if (ip == HammerTruncIp) 2342 kprintf("truncate2 %016llx\n", 2343 (long long)ip->trunc_off); 2344 #endif 2345 } else { 2346 #ifdef DEBUG_TRUNCATE 2347 if (ip == HammerTruncIp) 2348 kprintf("truncate3 %016llx (ignored)\n", 2349 (long long)vap->va_size); 2350 #endif 2351 } 2352 } 2353 2354 #if 0 2355 /* 2356 * When truncating, nvtruncbuf() may have cleaned out 2357 * a portion of the last block on-disk in the buffer 2358 * cache. We must clean out any frontend records 2359 * for blocks beyond the new last block. 2360 */ 2361 aligned_size = (vap->va_size + (blksize - 1)) & 2362 ~(int64_t)(blksize - 1); 2363 if (truncating && vap->va_size < aligned_size) { 2364 aligned_size -= blksize; 2365 hammer_ip_frontend_trunc(ip, aligned_size); 2366 } 2367 #endif 2368 break; 2369 case VDATABASE: 2370 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2371 ip->flags |= HAMMER_INODE_TRUNCATED; 2372 ip->trunc_off = vap->va_size; 2373 hammer_inode_dirty(ip); 2374 } else if (ip->trunc_off > vap->va_size) { 2375 ip->trunc_off = vap->va_size; 2376 } 2377 hammer_ip_frontend_trunc(ip, vap->va_size); 2378 ip->ino_data.size = vap->va_size; 2379 ip->ino_data.mtime = trans.time; 2380 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2381 kflags |= NOTE_ATTRIB; 2382 break; 2383 default: 2384 error = EINVAL; 2385 goto done; 2386 } 2387 break; 2388 } 2389 if (vap->va_atime.tv_sec != VNOVAL) { 2390 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2391 modflags |= HAMMER_INODE_ATIME; 2392 kflags |= NOTE_ATTRIB; 2393 } 2394 if (vap->va_mtime.tv_sec != VNOVAL) { 2395 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2396 modflags |= HAMMER_INODE_MTIME; 2397 kflags |= NOTE_ATTRIB; 2398 } 2399 if (vap->va_mode != (mode_t)VNOVAL) { 2400 mode_t cur_mode = ip->ino_data.mode; 2401 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2402 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2403 2404 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2405 cur_uid, cur_gid, &cur_mode); 2406 if (error == 0 && ip->ino_data.mode != cur_mode) { 2407 ip->ino_data.mode = cur_mode; 2408 ip->ino_data.ctime = trans.time; 2409 modflags |= HAMMER_INODE_DDIRTY; 2410 kflags |= NOTE_ATTRIB; 2411 } 2412 } 2413 done: 2414 if (error == 0) 2415 hammer_modify_inode(&trans, ip, modflags); 2416 hammer_done_transaction(&trans); 2417 hammer_knote(ap->a_vp, kflags); 2418 lwkt_reltoken(&hmp->fs_token); 2419 return (error); 2420 } 2421 2422 /* 2423 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 2424 */ 2425 static 2426 int 2427 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2428 { 2429 struct hammer_transaction trans; 2430 struct hammer_inode *dip; 2431 struct hammer_inode *nip; 2432 hammer_record_t record; 2433 struct nchandle *nch; 2434 hammer_mount_t hmp; 2435 int error; 2436 int bytes; 2437 2438 ap->a_vap->va_type = VLNK; 2439 2440 nch = ap->a_nch; 2441 dip = VTOI(ap->a_dvp); 2442 hmp = dip->hmp; 2443 2444 if (dip->flags & HAMMER_INODE_RO) 2445 return (EROFS); 2446 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2447 return (error); 2448 2449 /* 2450 * Create a transaction to cover the operations we perform. 2451 */ 2452 lwkt_gettoken(&hmp->fs_token); 2453 hammer_start_transaction(&trans, hmp); 2454 ++hammer_stats_file_iopsw; 2455 2456 /* 2457 * Create a new filesystem object of the requested type. The 2458 * returned inode will be referenced but not locked. 2459 */ 2460 2461 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2462 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2463 NULL, &nip); 2464 if (error) { 2465 hammer_done_transaction(&trans); 2466 *ap->a_vpp = NULL; 2467 lwkt_reltoken(&hmp->fs_token); 2468 return (error); 2469 } 2470 2471 /* 2472 * Add a record representing the symlink. symlink stores the link 2473 * as pure data, not a string, and is no \0 terminated. 2474 */ 2475 if (error == 0) { 2476 bytes = strlen(ap->a_target); 2477 2478 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2479 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2480 } else { 2481 record = hammer_alloc_mem_record(nip, bytes); 2482 record->type = HAMMER_MEM_RECORD_GENERAL; 2483 2484 record->leaf.base.localization = nip->obj_localization + 2485 HAMMER_LOCALIZE_MISC; 2486 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2487 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2488 record->leaf.data_len = bytes; 2489 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2490 bcopy(ap->a_target, record->data->symlink.name, bytes); 2491 error = hammer_ip_add_record(&trans, record); 2492 } 2493 2494 /* 2495 * Set the file size to the length of the link. 2496 */ 2497 if (error == 0) { 2498 nip->ino_data.size = bytes; 2499 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2500 } 2501 } 2502 if (error == 0) 2503 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2504 nch->ncp->nc_nlen, nip); 2505 2506 /* 2507 * Finish up. 2508 */ 2509 if (error) { 2510 hammer_rel_inode(nip, 0); 2511 *ap->a_vpp = NULL; 2512 } else { 2513 error = hammer_get_vnode(nip, ap->a_vpp); 2514 hammer_rel_inode(nip, 0); 2515 if (error == 0) { 2516 cache_setunresolved(ap->a_nch); 2517 cache_setvp(ap->a_nch, *ap->a_vpp); 2518 hammer_knote(ap->a_dvp, NOTE_WRITE); 2519 } 2520 } 2521 hammer_done_transaction(&trans); 2522 lwkt_reltoken(&hmp->fs_token); 2523 return (error); 2524 } 2525 2526 /* 2527 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2528 */ 2529 static 2530 int 2531 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2532 { 2533 struct hammer_transaction trans; 2534 struct hammer_inode *dip; 2535 hammer_mount_t hmp; 2536 int error; 2537 2538 dip = VTOI(ap->a_dvp); 2539 hmp = dip->hmp; 2540 2541 if (hammer_nohistory(dip) == 0 && 2542 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2543 return (error); 2544 } 2545 2546 lwkt_gettoken(&hmp->fs_token); 2547 hammer_start_transaction(&trans, hmp); 2548 ++hammer_stats_file_iopsw; 2549 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2550 ap->a_cred, ap->a_flags, -1); 2551 hammer_done_transaction(&trans); 2552 lwkt_reltoken(&hmp->fs_token); 2553 2554 return (error); 2555 } 2556 2557 /* 2558 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2559 */ 2560 static 2561 int 2562 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2563 { 2564 struct hammer_inode *ip = ap->a_vp->v_data; 2565 hammer_mount_t hmp = ip->hmp; 2566 int error; 2567 2568 ++hammer_stats_file_iopsr; 2569 lwkt_gettoken(&hmp->fs_token); 2570 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2571 ap->a_fflag, ap->a_cred); 2572 lwkt_reltoken(&hmp->fs_token); 2573 return (error); 2574 } 2575 2576 static 2577 int 2578 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2579 { 2580 static const struct mountctl_opt extraopt[] = { 2581 { HMNT_NOHISTORY, "nohistory" }, 2582 { HMNT_MASTERID, "master" }, 2583 { 0, NULL} 2584 2585 }; 2586 struct hammer_mount *hmp; 2587 struct mount *mp; 2588 int usedbytes; 2589 int error; 2590 2591 error = 0; 2592 usedbytes = 0; 2593 mp = ap->a_head.a_ops->head.vv_mount; 2594 KKASSERT(mp->mnt_data != NULL); 2595 hmp = (struct hammer_mount *)mp->mnt_data; 2596 2597 lwkt_gettoken(&hmp->fs_token); 2598 2599 switch(ap->a_op) { 2600 case MOUNTCTL_SET_EXPORT: 2601 if (ap->a_ctllen != sizeof(struct export_args)) 2602 error = EINVAL; 2603 else 2604 error = hammer_vfs_export(mp, ap->a_op, 2605 (const struct export_args *)ap->a_ctl); 2606 break; 2607 case MOUNTCTL_MOUNTFLAGS: 2608 { 2609 /* 2610 * Call standard mountctl VOP function 2611 * so we get user mount flags. 2612 */ 2613 error = vop_stdmountctl(ap); 2614 if (error) 2615 break; 2616 2617 usedbytes = *ap->a_res; 2618 2619 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2620 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2621 ap->a_buf, 2622 ap->a_buflen - usedbytes, 2623 &error); 2624 } 2625 2626 *ap->a_res += usedbytes; 2627 break; 2628 } 2629 default: 2630 error = vop_stdmountctl(ap); 2631 break; 2632 } 2633 lwkt_reltoken(&hmp->fs_token); 2634 return(error); 2635 } 2636 2637 /* 2638 * hammer_vop_strategy { vp, bio } 2639 * 2640 * Strategy call, used for regular file read & write only. Note that the 2641 * bp may represent a cluster. 2642 * 2643 * To simplify operation and allow better optimizations in the future, 2644 * this code does not make any assumptions with regards to buffer alignment 2645 * or size. 2646 */ 2647 static 2648 int 2649 hammer_vop_strategy(struct vop_strategy_args *ap) 2650 { 2651 struct buf *bp; 2652 int error; 2653 2654 bp = ap->a_bio->bio_buf; 2655 2656 switch(bp->b_cmd) { 2657 case BUF_CMD_READ: 2658 error = hammer_vop_strategy_read(ap); 2659 break; 2660 case BUF_CMD_WRITE: 2661 error = hammer_vop_strategy_write(ap); 2662 break; 2663 default: 2664 bp->b_error = error = EINVAL; 2665 bp->b_flags |= B_ERROR; 2666 biodone(ap->a_bio); 2667 break; 2668 } 2669 2670 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2671 2672 return (error); 2673 } 2674 2675 /* 2676 * Read from a regular file. Iterate the related records and fill in the 2677 * BIO/BUF. Gaps are zero-filled. 2678 * 2679 * The support code in hammer_object.c should be used to deal with mixed 2680 * in-memory and on-disk records. 2681 * 2682 * NOTE: Can be called from the cluster code with an oversized buf. 2683 * 2684 * XXX atime update 2685 */ 2686 static 2687 int 2688 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2689 { 2690 struct hammer_transaction trans; 2691 struct hammer_inode *ip; 2692 struct hammer_inode *dip; 2693 hammer_mount_t hmp; 2694 struct hammer_cursor cursor; 2695 hammer_base_elm_t base; 2696 hammer_off_t disk_offset; 2697 struct bio *bio; 2698 struct bio *nbio; 2699 struct buf *bp; 2700 int64_t rec_offset; 2701 int64_t ran_end; 2702 int64_t tmp64; 2703 int error; 2704 int boff; 2705 int roff; 2706 int n; 2707 int isdedupable; 2708 2709 bio = ap->a_bio; 2710 bp = bio->bio_buf; 2711 ip = ap->a_vp->v_data; 2712 hmp = ip->hmp; 2713 2714 /* 2715 * The zone-2 disk offset may have been set by the cluster code via 2716 * a BMAP operation, or else should be NOOFFSET. 2717 * 2718 * Checking the high bits for a match against zone-2 should suffice. 2719 * 2720 * In cases where a lot of data duplication is present it may be 2721 * more beneficial to drop through and doubule-buffer through the 2722 * device. 2723 */ 2724 nbio = push_bio(bio); 2725 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2726 HAMMER_ZONE_LARGE_DATA) { 2727 if (hammer_double_buffer == 0) { 2728 lwkt_gettoken(&hmp->fs_token); 2729 error = hammer_io_direct_read(hmp, nbio, NULL); 2730 lwkt_reltoken(&hmp->fs_token); 2731 return (error); 2732 } 2733 2734 /* 2735 * Try to shortcut requests for double_buffer mode too. 2736 * Since this mode runs through the device buffer cache 2737 * only compatible buffer sizes (meaning those generated 2738 * by normal filesystem buffers) are legal. 2739 */ 2740 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) { 2741 lwkt_gettoken(&hmp->fs_token); 2742 error = hammer_io_indirect_read(hmp, nbio, NULL); 2743 lwkt_reltoken(&hmp->fs_token); 2744 return (error); 2745 } 2746 } 2747 2748 /* 2749 * Well, that sucked. Do it the hard way. If all the stars are 2750 * aligned we may still be able to issue a direct-read. 2751 */ 2752 lwkt_gettoken(&hmp->fs_token); 2753 hammer_simple_transaction(&trans, hmp); 2754 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2755 2756 /* 2757 * Key range (begin and end inclusive) to scan. Note that the key's 2758 * stored in the actual records represent BASE+LEN, not BASE. The 2759 * first record containing bio_offset will have a key > bio_offset. 2760 */ 2761 cursor.key_beg.localization = ip->obj_localization + 2762 HAMMER_LOCALIZE_MISC; 2763 cursor.key_beg.obj_id = ip->obj_id; 2764 cursor.key_beg.create_tid = 0; 2765 cursor.key_beg.delete_tid = 0; 2766 cursor.key_beg.obj_type = 0; 2767 cursor.key_beg.key = bio->bio_offset + 1; 2768 cursor.asof = ip->obj_asof; 2769 cursor.flags |= HAMMER_CURSOR_ASOF; 2770 2771 cursor.key_end = cursor.key_beg; 2772 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2773 #if 0 2774 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2775 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2776 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2777 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2778 } else 2779 #endif 2780 { 2781 ran_end = bio->bio_offset + bp->b_bufsize; 2782 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2783 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2784 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2785 if (tmp64 < ran_end) 2786 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2787 else 2788 cursor.key_end.key = ran_end + MAXPHYS + 1; 2789 } 2790 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2791 2792 /* 2793 * Set NOSWAPCACHE for cursor data extraction if double buffering 2794 * is disabled or (if the file is not marked cacheable via chflags 2795 * and vm.swapcache_use_chflags is enabled). 2796 */ 2797 if (hammer_double_buffer == 0 || 2798 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 && 2799 vm_swapcache_use_chflags)) { 2800 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE; 2801 } 2802 2803 error = hammer_ip_first(&cursor); 2804 boff = 0; 2805 2806 while (error == 0) { 2807 /* 2808 * Get the base file offset of the record. The key for 2809 * data records is (base + bytes) rather then (base). 2810 */ 2811 base = &cursor.leaf->base; 2812 rec_offset = base->key - cursor.leaf->data_len; 2813 2814 /* 2815 * Calculate the gap, if any, and zero-fill it. 2816 * 2817 * n is the offset of the start of the record verses our 2818 * current seek offset in the bio. 2819 */ 2820 n = (int)(rec_offset - (bio->bio_offset + boff)); 2821 if (n > 0) { 2822 if (n > bp->b_bufsize - boff) 2823 n = bp->b_bufsize - boff; 2824 bzero((char *)bp->b_data + boff, n); 2825 boff += n; 2826 n = 0; 2827 } 2828 2829 /* 2830 * Calculate the data offset in the record and the number 2831 * of bytes we can copy. 2832 * 2833 * There are two degenerate cases. First, boff may already 2834 * be at bp->b_bufsize. Secondly, the data offset within 2835 * the record may exceed the record's size. 2836 */ 2837 roff = -n; 2838 rec_offset += roff; 2839 n = cursor.leaf->data_len - roff; 2840 if (n <= 0) { 2841 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff); 2842 n = 0; 2843 } else if (n > bp->b_bufsize - boff) { 2844 n = bp->b_bufsize - boff; 2845 } 2846 2847 /* 2848 * Deal with cached truncations. This cool bit of code 2849 * allows truncate()/ftruncate() to avoid having to sync 2850 * the file. 2851 * 2852 * If the frontend is truncated then all backend records are 2853 * subject to the frontend's truncation. 2854 * 2855 * If the backend is truncated then backend records on-disk 2856 * (but not in-memory) are subject to the backend's 2857 * truncation. In-memory records owned by the backend 2858 * represent data written after the truncation point on the 2859 * backend and must not be truncated. 2860 * 2861 * Truncate operations deal with frontend buffer cache 2862 * buffers and frontend-owned in-memory records synchronously. 2863 */ 2864 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2865 if (hammer_cursor_ondisk(&cursor)/* || 2866 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2867 if (ip->trunc_off <= rec_offset) 2868 n = 0; 2869 else if (ip->trunc_off < rec_offset + n) 2870 n = (int)(ip->trunc_off - rec_offset); 2871 } 2872 } 2873 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2874 if (hammer_cursor_ondisk(&cursor)) { 2875 if (ip->sync_trunc_off <= rec_offset) 2876 n = 0; 2877 else if (ip->sync_trunc_off < rec_offset + n) 2878 n = (int)(ip->sync_trunc_off - rec_offset); 2879 } 2880 } 2881 2882 /* 2883 * Try to issue a direct read into our bio if possible, 2884 * otherwise resolve the element data into a hammer_buffer 2885 * and copy. 2886 * 2887 * The buffer on-disk should be zerod past any real 2888 * truncation point, but may not be for any synthesized 2889 * truncation point from above. 2890 * 2891 * NOTE: disk_offset is only valid if the cursor data is 2892 * on-disk. 2893 */ 2894 disk_offset = cursor.leaf->data_offset + roff; 2895 isdedupable = (boff == 0 && n == bp->b_bufsize && 2896 hammer_cursor_ondisk(&cursor) && 2897 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2898 2899 if (isdedupable && hammer_double_buffer == 0) { 2900 /* 2901 * Direct read case 2902 */ 2903 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2904 HAMMER_ZONE_LARGE_DATA); 2905 nbio->bio_offset = disk_offset; 2906 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2907 if (hammer_live_dedup && error == 0) 2908 hammer_dedup_cache_add(ip, cursor.leaf); 2909 goto done; 2910 } else if (isdedupable) { 2911 /* 2912 * Async I/O case for reading from backing store 2913 * and copying the data to the filesystem buffer. 2914 * live-dedup has to verify the data anyway if it 2915 * gets a hit later so we can just add the entry 2916 * now. 2917 */ 2918 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2919 HAMMER_ZONE_LARGE_DATA); 2920 nbio->bio_offset = disk_offset; 2921 if (hammer_live_dedup) 2922 hammer_dedup_cache_add(ip, cursor.leaf); 2923 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf); 2924 goto done; 2925 } else if (n) { 2926 error = hammer_ip_resolve_data(&cursor); 2927 if (error == 0) { 2928 if (hammer_live_dedup && isdedupable) 2929 hammer_dedup_cache_add(ip, cursor.leaf); 2930 bcopy((char *)cursor.data + roff, 2931 (char *)bp->b_data + boff, n); 2932 } 2933 } 2934 if (error) 2935 break; 2936 2937 /* 2938 * We have to be sure that the only elements added to the 2939 * dedup cache are those which are already on-media. 2940 */ 2941 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) 2942 hammer_dedup_cache_add(ip, cursor.leaf); 2943 2944 /* 2945 * Iterate until we have filled the request. 2946 */ 2947 boff += n; 2948 if (boff == bp->b_bufsize) 2949 break; 2950 error = hammer_ip_next(&cursor); 2951 } 2952 2953 /* 2954 * There may have been a gap after the last record 2955 */ 2956 if (error == ENOENT) 2957 error = 0; 2958 if (error == 0 && boff != bp->b_bufsize) { 2959 KKASSERT(boff < bp->b_bufsize); 2960 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2961 /* boff = bp->b_bufsize; */ 2962 } 2963 2964 /* 2965 * Disallow swapcache operation on the vnode buffer if double 2966 * buffering is enabled, the swapcache will get the data via 2967 * the block device buffer. 2968 */ 2969 if (hammer_double_buffer) 2970 bp->b_flags |= B_NOTMETA; 2971 2972 /* 2973 * Cleanup 2974 */ 2975 bp->b_resid = 0; 2976 bp->b_error = error; 2977 if (error) 2978 bp->b_flags |= B_ERROR; 2979 biodone(ap->a_bio); 2980 2981 done: 2982 /* 2983 * Cache the b-tree node for the last data read in cache[1]. 2984 * 2985 * If we hit the file EOF then also cache the node in the 2986 * governing director's cache[3], it will be used to initialize 2987 * the inode's cache[1] for any inodes looked up via the directory. 2988 * 2989 * This doesn't reduce disk accesses since the B-Tree chain is 2990 * likely cached, but it does reduce cpu overhead when looking 2991 * up file offsets for cpdup/tar/cpio style iterations. 2992 */ 2993 if (cursor.node) 2994 hammer_cache_node(&ip->cache[1], cursor.node); 2995 if (ran_end >= ip->ino_data.size) { 2996 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 2997 ip->obj_asof, ip->obj_localization); 2998 if (dip) { 2999 hammer_cache_node(&dip->cache[3], cursor.node); 3000 hammer_rel_inode(dip, 0); 3001 } 3002 } 3003 hammer_done_cursor(&cursor); 3004 hammer_done_transaction(&trans); 3005 lwkt_reltoken(&hmp->fs_token); 3006 return(error); 3007 } 3008 3009 /* 3010 * BMAP operation - used to support cluster_read() only. 3011 * 3012 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 3013 * 3014 * This routine may return EOPNOTSUPP if the opration is not supported for 3015 * the specified offset. The contents of the pointer arguments do not 3016 * need to be initialized in that case. 3017 * 3018 * If a disk address is available and properly aligned return 0 with 3019 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 3020 * to the run-length relative to that offset. Callers may assume that 3021 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 3022 * large, so return EOPNOTSUPP if it is not sufficiently large. 3023 */ 3024 static 3025 int 3026 hammer_vop_bmap(struct vop_bmap_args *ap) 3027 { 3028 struct hammer_transaction trans; 3029 struct hammer_inode *ip; 3030 hammer_mount_t hmp; 3031 struct hammer_cursor cursor; 3032 hammer_base_elm_t base; 3033 int64_t rec_offset; 3034 int64_t ran_end; 3035 int64_t tmp64; 3036 int64_t base_offset; 3037 int64_t base_disk_offset; 3038 int64_t last_offset; 3039 hammer_off_t last_disk_offset; 3040 hammer_off_t disk_offset; 3041 int rec_len; 3042 int error; 3043 int blksize; 3044 3045 ++hammer_stats_file_iopsr; 3046 ip = ap->a_vp->v_data; 3047 hmp = ip->hmp; 3048 3049 /* 3050 * We can only BMAP regular files. We can't BMAP database files, 3051 * directories, etc. 3052 */ 3053 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 3054 return(EOPNOTSUPP); 3055 3056 /* 3057 * bmap is typically called with runp/runb both NULL when used 3058 * for writing. We do not support BMAP for writing atm. 3059 */ 3060 if (ap->a_cmd != BUF_CMD_READ) 3061 return(EOPNOTSUPP); 3062 3063 /* 3064 * Scan the B-Tree to acquire blockmap addresses, then translate 3065 * to raw addresses. 3066 */ 3067 lwkt_gettoken(&hmp->fs_token); 3068 hammer_simple_transaction(&trans, hmp); 3069 #if 0 3070 kprintf("bmap_beg %016llx ip->cache %p\n", 3071 (long long)ap->a_loffset, ip->cache[1]); 3072 #endif 3073 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 3074 3075 /* 3076 * Key range (begin and end inclusive) to scan. Note that the key's 3077 * stored in the actual records represent BASE+LEN, not BASE. The 3078 * first record containing bio_offset will have a key > bio_offset. 3079 */ 3080 cursor.key_beg.localization = ip->obj_localization + 3081 HAMMER_LOCALIZE_MISC; 3082 cursor.key_beg.obj_id = ip->obj_id; 3083 cursor.key_beg.create_tid = 0; 3084 cursor.key_beg.delete_tid = 0; 3085 cursor.key_beg.obj_type = 0; 3086 if (ap->a_runb) 3087 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 3088 else 3089 cursor.key_beg.key = ap->a_loffset + 1; 3090 if (cursor.key_beg.key < 0) 3091 cursor.key_beg.key = 0; 3092 cursor.asof = ip->obj_asof; 3093 cursor.flags |= HAMMER_CURSOR_ASOF; 3094 3095 cursor.key_end = cursor.key_beg; 3096 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 3097 3098 ran_end = ap->a_loffset + MAXPHYS; 3099 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 3100 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 3101 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 3102 if (tmp64 < ran_end) 3103 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 3104 else 3105 cursor.key_end.key = ran_end + MAXPHYS + 1; 3106 3107 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 3108 3109 error = hammer_ip_first(&cursor); 3110 base_offset = last_offset = 0; 3111 base_disk_offset = last_disk_offset = 0; 3112 3113 while (error == 0) { 3114 /* 3115 * Get the base file offset of the record. The key for 3116 * data records is (base + bytes) rather then (base). 3117 * 3118 * NOTE: rec_offset + rec_len may exceed the end-of-file. 3119 * The extra bytes should be zero on-disk and the BMAP op 3120 * should still be ok. 3121 */ 3122 base = &cursor.leaf->base; 3123 rec_offset = base->key - cursor.leaf->data_len; 3124 rec_len = cursor.leaf->data_len; 3125 3126 /* 3127 * Incorporate any cached truncation. 3128 * 3129 * NOTE: Modifications to rec_len based on synthesized 3130 * truncation points remove the guarantee that any extended 3131 * data on disk is zero (since the truncations may not have 3132 * taken place on-media yet). 3133 */ 3134 if (ip->flags & HAMMER_INODE_TRUNCATED) { 3135 if (hammer_cursor_ondisk(&cursor) || 3136 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 3137 if (ip->trunc_off <= rec_offset) 3138 rec_len = 0; 3139 else if (ip->trunc_off < rec_offset + rec_len) 3140 rec_len = (int)(ip->trunc_off - rec_offset); 3141 } 3142 } 3143 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 3144 if (hammer_cursor_ondisk(&cursor)) { 3145 if (ip->sync_trunc_off <= rec_offset) 3146 rec_len = 0; 3147 else if (ip->sync_trunc_off < rec_offset + rec_len) 3148 rec_len = (int)(ip->sync_trunc_off - rec_offset); 3149 } 3150 } 3151 3152 /* 3153 * Accumulate information. If we have hit a discontiguous 3154 * block reset base_offset unless we are already beyond the 3155 * requested offset. If we are, that's it, we stop. 3156 */ 3157 if (error) 3158 break; 3159 if (hammer_cursor_ondisk(&cursor)) { 3160 disk_offset = cursor.leaf->data_offset; 3161 if (rec_offset != last_offset || 3162 disk_offset != last_disk_offset) { 3163 if (rec_offset > ap->a_loffset) 3164 break; 3165 base_offset = rec_offset; 3166 base_disk_offset = disk_offset; 3167 } 3168 last_offset = rec_offset + rec_len; 3169 last_disk_offset = disk_offset + rec_len; 3170 3171 if (hammer_live_dedup) 3172 hammer_dedup_cache_add(ip, cursor.leaf); 3173 } 3174 3175 error = hammer_ip_next(&cursor); 3176 } 3177 3178 #if 0 3179 kprintf("BMAP %016llx: %016llx - %016llx\n", 3180 (long long)ap->a_loffset, 3181 (long long)base_offset, 3182 (long long)last_offset); 3183 kprintf("BMAP %16s: %016llx - %016llx\n", "", 3184 (long long)base_disk_offset, 3185 (long long)last_disk_offset); 3186 #endif 3187 3188 if (cursor.node) { 3189 hammer_cache_node(&ip->cache[1], cursor.node); 3190 #if 0 3191 kprintf("bmap_end2 %016llx ip->cache %p\n", 3192 (long long)ap->a_loffset, ip->cache[1]); 3193 #endif 3194 } 3195 hammer_done_cursor(&cursor); 3196 hammer_done_transaction(&trans); 3197 lwkt_reltoken(&hmp->fs_token); 3198 3199 /* 3200 * If we couldn't find any records or the records we did find were 3201 * all behind the requested offset, return failure. A forward 3202 * truncation can leave a hole w/ no on-disk records. 3203 */ 3204 if (last_offset == 0 || last_offset < ap->a_loffset) 3205 return (EOPNOTSUPP); 3206 3207 /* 3208 * Figure out the block size at the requested offset and adjust 3209 * our limits so the cluster_read() does not create inappropriately 3210 * sized buffer cache buffers. 3211 */ 3212 blksize = hammer_blocksize(ap->a_loffset); 3213 if (hammer_blocksize(base_offset) != blksize) { 3214 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 3215 } 3216 if (last_offset != ap->a_loffset && 3217 hammer_blocksize(last_offset - 1) != blksize) { 3218 last_offset = hammer_blockdemarc(ap->a_loffset, 3219 last_offset - 1); 3220 } 3221 3222 /* 3223 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 3224 * from occuring. 3225 */ 3226 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 3227 3228 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 3229 /* 3230 * Only large-data zones can be direct-IOd 3231 */ 3232 error = EOPNOTSUPP; 3233 } else if ((disk_offset & HAMMER_BUFMASK) || 3234 (last_offset - ap->a_loffset) < blksize) { 3235 /* 3236 * doffsetp is not aligned or the forward run size does 3237 * not cover a whole buffer, disallow the direct I/O. 3238 */ 3239 error = EOPNOTSUPP; 3240 } else { 3241 /* 3242 * We're good. 3243 */ 3244 *ap->a_doffsetp = disk_offset; 3245 if (ap->a_runb) { 3246 *ap->a_runb = ap->a_loffset - base_offset; 3247 KKASSERT(*ap->a_runb >= 0); 3248 } 3249 if (ap->a_runp) { 3250 *ap->a_runp = last_offset - ap->a_loffset; 3251 KKASSERT(*ap->a_runp >= 0); 3252 } 3253 error = 0; 3254 } 3255 return(error); 3256 } 3257 3258 /* 3259 * Write to a regular file. Because this is a strategy call the OS is 3260 * trying to actually get data onto the media. 3261 */ 3262 static 3263 int 3264 hammer_vop_strategy_write(struct vop_strategy_args *ap) 3265 { 3266 hammer_record_t record; 3267 hammer_mount_t hmp; 3268 hammer_inode_t ip; 3269 struct bio *bio; 3270 struct buf *bp; 3271 int blksize __debugvar; 3272 int bytes; 3273 int error; 3274 3275 bio = ap->a_bio; 3276 bp = bio->bio_buf; 3277 ip = ap->a_vp->v_data; 3278 hmp = ip->hmp; 3279 3280 blksize = hammer_blocksize(bio->bio_offset); 3281 KKASSERT(bp->b_bufsize == blksize); 3282 3283 if (ip->flags & HAMMER_INODE_RO) { 3284 bp->b_error = EROFS; 3285 bp->b_flags |= B_ERROR; 3286 biodone(ap->a_bio); 3287 return(EROFS); 3288 } 3289 3290 lwkt_gettoken(&hmp->fs_token); 3291 3292 /* 3293 * Disallow swapcache operation on the vnode buffer if double 3294 * buffering is enabled, the swapcache will get the data via 3295 * the block device buffer. 3296 */ 3297 if (hammer_double_buffer) 3298 bp->b_flags |= B_NOTMETA; 3299 3300 /* 3301 * Interlock with inode destruction (no in-kernel or directory 3302 * topology visibility). If we queue new IO while trying to 3303 * destroy the inode we can deadlock the vtrunc call in 3304 * hammer_inode_unloadable_check(). 3305 * 3306 * Besides, there's no point flushing a bp associated with an 3307 * inode that is being destroyed on-media and has no kernel 3308 * references. 3309 */ 3310 if ((ip->flags | ip->sync_flags) & 3311 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 3312 bp->b_resid = 0; 3313 biodone(ap->a_bio); 3314 lwkt_reltoken(&hmp->fs_token); 3315 return(0); 3316 } 3317 3318 /* 3319 * Reserve space and issue a direct-write from the front-end. 3320 * NOTE: The direct_io code will hammer_bread/bcopy smaller 3321 * allocations. 3322 * 3323 * An in-memory record will be installed to reference the storage 3324 * until the flusher can get to it. 3325 * 3326 * Since we own the high level bio the front-end will not try to 3327 * do a direct-read until the write completes. 3328 * 3329 * NOTE: The only time we do not reserve a full-sized buffers 3330 * worth of data is if the file is small. We do not try to 3331 * allocate a fragment (from the small-data zone) at the end of 3332 * an otherwise large file as this can lead to wildly separated 3333 * data. 3334 */ 3335 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 3336 KKASSERT(bio->bio_offset < ip->ino_data.size); 3337 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2) 3338 bytes = bp->b_bufsize; 3339 else 3340 bytes = ((int)ip->ino_data.size + 15) & ~15; 3341 3342 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 3343 bytes, &error); 3344 3345 /* 3346 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated 3347 * in hammer_vop_write(). We must flag the record so the proper 3348 * REDO_TERM_WRITE entry is generated during the flush. 3349 */ 3350 if (record) { 3351 if (bp->b_flags & B_VFSFLAG1) { 3352 record->flags |= HAMMER_RECF_REDO; 3353 bp->b_flags &= ~B_VFSFLAG1; 3354 } 3355 if (record->flags & HAMMER_RECF_DEDUPED) { 3356 bp->b_resid = 0; 3357 hammer_ip_replace_bulk(hmp, record); 3358 biodone(ap->a_bio); 3359 } else { 3360 hammer_io_direct_write(hmp, bio, record); 3361 } 3362 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 3363 hammer_flush_inode(ip, 0); 3364 } else { 3365 bp->b_bio2.bio_offset = NOOFFSET; 3366 bp->b_error = error; 3367 bp->b_flags |= B_ERROR; 3368 biodone(ap->a_bio); 3369 } 3370 lwkt_reltoken(&hmp->fs_token); 3371 return(error); 3372 } 3373 3374 /* 3375 * dounlink - disconnect a directory entry 3376 * 3377 * XXX whiteout support not really in yet 3378 */ 3379 static int 3380 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 3381 struct vnode *dvp, struct ucred *cred, 3382 int flags, int isdir) 3383 { 3384 struct namecache *ncp; 3385 hammer_inode_t dip; 3386 hammer_inode_t ip; 3387 hammer_mount_t hmp; 3388 struct hammer_cursor cursor; 3389 int64_t namekey; 3390 u_int32_t max_iterations; 3391 int nlen, error; 3392 3393 /* 3394 * Calculate the namekey and setup the key range for the scan. This 3395 * works kinda like a chained hash table where the lower 32 bits 3396 * of the namekey synthesize the chain. 3397 * 3398 * The key range is inclusive of both key_beg and key_end. 3399 */ 3400 dip = VTOI(dvp); 3401 ncp = nch->ncp; 3402 hmp = dip->hmp; 3403 3404 if (dip->flags & HAMMER_INODE_RO) 3405 return (EROFS); 3406 3407 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3408 &max_iterations); 3409 retry: 3410 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3411 cursor.key_beg.localization = dip->obj_localization + 3412 hammer_dir_localization(dip); 3413 cursor.key_beg.obj_id = dip->obj_id; 3414 cursor.key_beg.key = namekey; 3415 cursor.key_beg.create_tid = 0; 3416 cursor.key_beg.delete_tid = 0; 3417 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3418 cursor.key_beg.obj_type = 0; 3419 3420 cursor.key_end = cursor.key_beg; 3421 cursor.key_end.key += max_iterations; 3422 cursor.asof = dip->obj_asof; 3423 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3424 3425 /* 3426 * Scan all matching records (the chain), locate the one matching 3427 * the requested path component. info->last_error contains the 3428 * error code on search termination and could be 0, ENOENT, or 3429 * something else. 3430 * 3431 * The hammer_ip_*() functions merge in-memory records with on-disk 3432 * records for the purposes of the search. 3433 */ 3434 error = hammer_ip_first(&cursor); 3435 3436 while (error == 0) { 3437 error = hammer_ip_resolve_data(&cursor); 3438 if (error) 3439 break; 3440 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3441 KKASSERT(nlen > 0); 3442 if (ncp->nc_nlen == nlen && 3443 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3444 break; 3445 } 3446 error = hammer_ip_next(&cursor); 3447 } 3448 3449 /* 3450 * If all is ok we have to get the inode so we can adjust nlinks. 3451 * To avoid a deadlock with the flusher we must release the inode 3452 * lock on the directory when acquiring the inode for the entry. 3453 * 3454 * If the target is a directory, it must be empty. 3455 */ 3456 if (error == 0) { 3457 hammer_unlock(&cursor.ip->lock); 3458 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3459 hmp->asof, 3460 cursor.data->entry.localization, 3461 0, &error); 3462 hammer_lock_sh(&cursor.ip->lock); 3463 if (error == ENOENT) { 3464 kprintf("HAMMER: WARNING: Removing " 3465 "dirent w/missing inode \"%s\"\n" 3466 "\tobj_id = %016llx\n", 3467 ncp->nc_name, 3468 (long long)cursor.data->entry.obj_id); 3469 error = 0; 3470 } 3471 3472 /* 3473 * If isdir >= 0 we validate that the entry is or is not a 3474 * directory. If isdir < 0 we don't care. 3475 */ 3476 if (error == 0 && isdir >= 0 && ip) { 3477 if (isdir && 3478 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3479 error = ENOTDIR; 3480 } else if (isdir == 0 && 3481 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3482 error = EISDIR; 3483 } 3484 } 3485 3486 /* 3487 * If we are trying to remove a directory the directory must 3488 * be empty. 3489 * 3490 * The check directory code can loop and deadlock/retry. Our 3491 * own cursor's node locks must be released to avoid a 3-way 3492 * deadlock with the flusher if the check directory code 3493 * blocks. 3494 * 3495 * If any changes whatsoever have been made to the cursor 3496 * set EDEADLK and retry. 3497 * 3498 * WARNING: See warnings in hammer_unlock_cursor() 3499 * function. 3500 */ 3501 if (error == 0 && ip && ip->ino_data.obj_type == 3502 HAMMER_OBJTYPE_DIRECTORY) { 3503 hammer_unlock_cursor(&cursor); 3504 error = hammer_ip_check_directory_empty(trans, ip); 3505 hammer_lock_cursor(&cursor); 3506 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3507 kprintf("HAMMER: Warning: avoided deadlock " 3508 "on rmdir '%s'\n", 3509 ncp->nc_name); 3510 error = EDEADLK; 3511 } 3512 } 3513 3514 /* 3515 * Delete the directory entry. 3516 * 3517 * WARNING: hammer_ip_del_directory() may have to terminate 3518 * the cursor to avoid a deadlock. It is ok to call 3519 * hammer_done_cursor() twice. 3520 */ 3521 if (error == 0) { 3522 error = hammer_ip_del_directory(trans, &cursor, 3523 dip, ip); 3524 } 3525 hammer_done_cursor(&cursor); 3526 if (error == 0) { 3527 /* 3528 * Tell the namecache that we are now unlinked. 3529 */ 3530 cache_unlink(nch); 3531 3532 /* 3533 * NOTE: ip->vp, if non-NULL, cannot be directly 3534 * referenced without formally acquiring the 3535 * vp since the vp might have zero refs on it, 3536 * or in the middle of a reclaim, etc. 3537 * 3538 * NOTE: The cache_setunresolved() can rip the vp 3539 * out from under us since the vp may not have 3540 * any refs, in which case ip->vp will be NULL 3541 * from the outset. 3542 */ 3543 while (ip && ip->vp) { 3544 struct vnode *vp; 3545 3546 error = hammer_get_vnode(ip, &vp); 3547 if (error == 0 && vp) { 3548 vn_unlock(vp); 3549 hammer_knote(ip->vp, NOTE_DELETE); 3550 #if 0 3551 /* 3552 * Don't do this, it can deadlock 3553 * on concurrent rm's of hardlinks. 3554 * Shouldn't be needed any more. 3555 */ 3556 cache_inval_vp(ip->vp, CINV_DESTROY); 3557 #endif 3558 vrele(vp); 3559 break; 3560 } 3561 kprintf("Debug: HAMMER ip/vp race1 avoided\n"); 3562 } 3563 } 3564 if (ip) 3565 hammer_rel_inode(ip, 0); 3566 } else { 3567 hammer_done_cursor(&cursor); 3568 } 3569 if (error == EDEADLK) 3570 goto retry; 3571 3572 return (error); 3573 } 3574 3575 /************************************************************************ 3576 * FIFO AND SPECFS OPS * 3577 ************************************************************************ 3578 * 3579 */ 3580 static int 3581 hammer_vop_fifoclose (struct vop_close_args *ap) 3582 { 3583 /* XXX update itimes */ 3584 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3585 } 3586 3587 static int 3588 hammer_vop_fiforead (struct vop_read_args *ap) 3589 { 3590 int error; 3591 3592 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3593 /* XXX update access time */ 3594 return (error); 3595 } 3596 3597 static int 3598 hammer_vop_fifowrite (struct vop_write_args *ap) 3599 { 3600 int error; 3601 3602 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3603 /* XXX update access time */ 3604 return (error); 3605 } 3606 3607 static 3608 int 3609 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3610 { 3611 int error; 3612 3613 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3614 if (error) 3615 error = hammer_vop_kqfilter(ap); 3616 return(error); 3617 } 3618 3619 /************************************************************************ 3620 * KQFILTER OPS * 3621 ************************************************************************ 3622 * 3623 */ 3624 static void filt_hammerdetach(struct knote *kn); 3625 static int filt_hammerread(struct knote *kn, long hint); 3626 static int filt_hammerwrite(struct knote *kn, long hint); 3627 static int filt_hammervnode(struct knote *kn, long hint); 3628 3629 static struct filterops hammerread_filtops = 3630 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3631 NULL, filt_hammerdetach, filt_hammerread }; 3632 static struct filterops hammerwrite_filtops = 3633 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3634 NULL, filt_hammerdetach, filt_hammerwrite }; 3635 static struct filterops hammervnode_filtops = 3636 { FILTEROP_ISFD | FILTEROP_MPSAFE, 3637 NULL, filt_hammerdetach, filt_hammervnode }; 3638 3639 static 3640 int 3641 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3642 { 3643 struct vnode *vp = ap->a_vp; 3644 struct knote *kn = ap->a_kn; 3645 3646 switch (kn->kn_filter) { 3647 case EVFILT_READ: 3648 kn->kn_fop = &hammerread_filtops; 3649 break; 3650 case EVFILT_WRITE: 3651 kn->kn_fop = &hammerwrite_filtops; 3652 break; 3653 case EVFILT_VNODE: 3654 kn->kn_fop = &hammervnode_filtops; 3655 break; 3656 default: 3657 return (EOPNOTSUPP); 3658 } 3659 3660 kn->kn_hook = (caddr_t)vp; 3661 3662 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3663 3664 return(0); 3665 } 3666 3667 static void 3668 filt_hammerdetach(struct knote *kn) 3669 { 3670 struct vnode *vp = (void *)kn->kn_hook; 3671 3672 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3673 } 3674 3675 static int 3676 filt_hammerread(struct knote *kn, long hint) 3677 { 3678 struct vnode *vp = (void *)kn->kn_hook; 3679 hammer_inode_t ip = VTOI(vp); 3680 hammer_mount_t hmp = ip->hmp; 3681 off_t off; 3682 3683 if (hint == NOTE_REVOKE) { 3684 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3685 return(1); 3686 } 3687 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3688 off = ip->ino_data.size - kn->kn_fp->f_offset; 3689 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3690 lwkt_reltoken(&hmp->fs_token); 3691 if (kn->kn_sfflags & NOTE_OLDAPI) 3692 return(1); 3693 return (kn->kn_data != 0); 3694 } 3695 3696 static int 3697 filt_hammerwrite(struct knote *kn, long hint) 3698 { 3699 if (hint == NOTE_REVOKE) 3700 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3701 kn->kn_data = 0; 3702 return (1); 3703 } 3704 3705 static int 3706 filt_hammervnode(struct knote *kn, long hint) 3707 { 3708 if (kn->kn_sfflags & hint) 3709 kn->kn_fflags |= hint; 3710 if (hint == NOTE_REVOKE) { 3711 kn->kn_flags |= (EV_EOF | EV_NODATA); 3712 return (1); 3713 } 3714 return (kn->kn_fflags != 0); 3715 } 3716 3717