1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/fcntl.h> 41 #include <sys/namecache.h> 42 #include <sys/vnode.h> 43 #include <sys/lockf.h> 44 #include <sys/event.h> 45 #include <sys/stat.h> 46 #include <sys/dirent.h> 47 #include <sys/file.h> 48 #include <vm/vm_extern.h> 49 #include <vm/swap_pager.h> 50 #include <vfs/fifofs/fifo.h> 51 52 #include "hammer.h" 53 54 /* 55 * USERFS VNOPS 56 */ 57 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 58 static int hammer_vop_fsync(struct vop_fsync_args *); 59 static int hammer_vop_read(struct vop_read_args *); 60 static int hammer_vop_write(struct vop_write_args *); 61 static int hammer_vop_access(struct vop_access_args *); 62 static int hammer_vop_advlock(struct vop_advlock_args *); 63 static int hammer_vop_close(struct vop_close_args *); 64 static int hammer_vop_ncreate(struct vop_ncreate_args *); 65 static int hammer_vop_getattr(struct vop_getattr_args *); 66 static int hammer_vop_nresolve(struct vop_nresolve_args *); 67 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 68 static int hammer_vop_nlink(struct vop_nlink_args *); 69 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 70 static int hammer_vop_nmknod(struct vop_nmknod_args *); 71 static int hammer_vop_open(struct vop_open_args *); 72 static int hammer_vop_print(struct vop_print_args *); 73 static int hammer_vop_readdir(struct vop_readdir_args *); 74 static int hammer_vop_readlink(struct vop_readlink_args *); 75 static int hammer_vop_nremove(struct vop_nremove_args *); 76 static int hammer_vop_nrename(struct vop_nrename_args *); 77 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 78 static int hammer_vop_markatime(struct vop_markatime_args *); 79 static int hammer_vop_setattr(struct vop_setattr_args *); 80 static int hammer_vop_strategy(struct vop_strategy_args *); 
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

/*
 * FIFO wrappers, referenced only from hammer_fifo_vops below.
 */
static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

/*
 * Main vnode operations vector.  Every operation HAMMER implements itself
 * is listed explicitly; page in/out and pathconf use the vop_std*
 * helpers and .vop_default is vop_defaultop.
 *
 * NOTE(review): which vnode types are attached to this vector is decided
 * outside this part of the file - presumably regular files, directories
 * and symlinks; confirm against the vnode setup code.
 */
struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime =	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};

/*
 * Reduced operations vector: attribute, access, close, fsync and
 * inode life-cycle operations are HAMMER's own, while read and write are
 * routed to vop_stdnoread / vop_stdnowrite.
 *
 * NOTE(review): the name suggests this is used for device-special
 * vnodes - confirm against the vnode setup code.
 */
struct vop_ops hammer_spec_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		vop_stdnoread,
	.vop_write =		vop_stdnowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_close,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

/*
 * FIFO operations vector.  Unlisted operations go to fifo_vnoperate;
 * read, write, close and kqfilter go through the hammer_vop_fifo*
 * wrappers declared above.
 */
struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};

/*
 * Run KNOTE() with the given flags on the note list embedded in the
 * vnode's v_pollinfo.  A flags value of 0 means "nothing happened" and
 * is silently ignored, so callers can accumulate NOTE_* bits and call
 * this unconditionally.
 */
static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

#ifdef DEBUG_TRUNCATE
/* Debugging aid, only present in DEBUG_TRUNCATE builds. */
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			   struct vnode *dvp, struct ucred *cred,
			   int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

/*
 * Disabled generic dispatcher, kept for reference only (its prototype
 * near the top of the file is commented out as well).
 */
#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *	 a REDO log.
A sysctl is provided to relax HAMMER's fsync() 196 * operation. 197 * 198 * Ultimately the combination of a REDO log and use of fast storage 199 * to front-end cluster caches will make fsync fast, but it aint 200 * here yet. And, in anycase, we need real transactional 201 * all-or-nothing features which are not restricted to a single file. 202 */ 203 static 204 int 205 hammer_vop_fsync(struct vop_fsync_args *ap) 206 { 207 hammer_inode_t ip = VTOI(ap->a_vp); 208 hammer_mount_t hmp = ip->hmp; 209 int waitfor = ap->a_waitfor; 210 int mode; 211 212 lwkt_gettoken(&hmp->fs_token); 213 214 /* 215 * Fsync rule relaxation (default is either full synchronous flush 216 * or REDO semantics with synchronous flush). 217 */ 218 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 219 switch(hammer_fsync_mode) { 220 case 0: 221 mode0: 222 /* no REDO, full synchronous flush */ 223 goto skip; 224 case 1: 225 mode1: 226 /* no REDO, full asynchronous flush */ 227 if (waitfor == MNT_WAIT) 228 waitfor = MNT_NOWAIT; 229 goto skip; 230 case 2: 231 /* REDO semantics, synchronous flush */ 232 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 233 goto mode0; 234 mode = HAMMER_FLUSH_UNDOS_AUTO; 235 break; 236 case 3: 237 /* REDO semantics, relaxed asynchronous flush */ 238 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 239 goto mode1; 240 mode = HAMMER_FLUSH_UNDOS_RELAXED; 241 if (waitfor == MNT_WAIT) 242 waitfor = MNT_NOWAIT; 243 break; 244 case 4: 245 /* ignore the fsync() system call */ 246 lwkt_reltoken(&hmp->fs_token); 247 return(0); 248 default: 249 /* we have to do something */ 250 mode = HAMMER_FLUSH_UNDOS_RELAXED; 251 if (waitfor == MNT_WAIT) 252 waitfor = MNT_NOWAIT; 253 break; 254 } 255 256 /* 257 * Fast fsync only needs to flush the UNDO/REDO fifo if 258 * HAMMER_INODE_REDO is non-zero and the only modifications 259 * made to the file are write or write-extends. 
260 */ 261 if ((ip->flags & HAMMER_INODE_REDO) && 262 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0 263 ) { 264 ++hammer_count_fsyncs; 265 hammer_flusher_flush_undos(hmp, mode); 266 ip->redo_count = 0; 267 lwkt_reltoken(&hmp->fs_token); 268 return(0); 269 } 270 271 /* 272 * REDO is enabled by fsync(), the idea being we really only 273 * want to lay down REDO records when programs are using 274 * fsync() heavily. The first fsync() on the file starts 275 * the gravy train going and later fsync()s keep it hot by 276 * resetting the redo_count. 277 * 278 * We weren't running REDOs before now so we have to fall 279 * through and do a full fsync of what we have. 280 */ 281 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 282 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 283 ip->flags |= HAMMER_INODE_REDO; 284 ip->redo_count = 0; 285 } 286 } 287 skip: 288 289 /* 290 * Do a full flush sequence. 291 * 292 * Attempt to release the vnode while waiting for the inode to 293 * finish flushing. This can really mess up inactive->reclaim 294 * sequences so only do it if the vnode is active. 
295 */ 296 ++hammer_count_fsyncs; 297 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 298 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 299 if (waitfor == MNT_WAIT) { 300 if ((ap->a_vp->v_flag & VINACTIVE) == 0) 301 vn_unlock(ap->a_vp); 302 hammer_wait_inode(ip); 303 if ((ap->a_vp->v_flag & VINACTIVE) == 0) 304 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); 305 } 306 lwkt_reltoken(&hmp->fs_token); 307 return (ip->error); 308 } 309 310 /* 311 * hammer_vop_read { vp, uio, ioflag, cred } 312 * 313 * MPSAFE (for the cache safe does not require fs_token) 314 */ 315 static 316 int 317 hammer_vop_read(struct vop_read_args *ap) 318 { 319 struct hammer_transaction trans; 320 hammer_inode_t ip; 321 hammer_mount_t hmp; 322 off_t offset; 323 struct buf *bp; 324 struct uio *uio; 325 int error; 326 int n; 327 int seqcount; 328 int ioseqcount; 329 int blksize; 330 int bigread; 331 int got_fstoken; 332 size_t resid; 333 334 if (ap->a_vp->v_type != VREG) 335 return (EINVAL); 336 ip = VTOI(ap->a_vp); 337 hmp = ip->hmp; 338 error = 0; 339 got_fstoken = 0; 340 uio = ap->a_uio; 341 342 /* 343 * Attempt to shortcut directly to the VM object using lwbufs. 344 * This is much faster than instantiating buffer cache buffers. 345 */ 346 resid = uio->uio_resid; 347 error = vop_helper_read_shortcut(ap); 348 hammer_stats_file_read += resid - uio->uio_resid; 349 if (error) 350 return (error); 351 if (uio->uio_resid == 0) 352 goto finished; 353 354 /* 355 * Allow the UIO's size to override the sequential heuristic. 356 */ 357 blksize = hammer_blocksize(uio->uio_offset); 358 seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; 359 ioseqcount = (ap->a_ioflag >> 16); 360 if (seqcount < ioseqcount) 361 seqcount = ioseqcount; 362 363 /* 364 * If reading or writing a huge amount of data we have to break 365 * atomicy and allow the operation to be interrupted by a signal 366 * or it can DOS the machine. 
367 */ 368 bigread = (uio->uio_resid > 100 * 1024 * 1024); 369 370 /* 371 * Access the data typically in HAMMER_BUFSIZE blocks via the 372 * buffer cache, but HAMMER may use a variable block size based 373 * on the offset. 374 * 375 * XXX Temporary hack, delay the start transaction while we remain 376 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 377 * locked-shared. 378 */ 379 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 380 int64_t base_offset; 381 int64_t file_limit; 382 383 blksize = hammer_blocksize(uio->uio_offset); 384 offset = (int)uio->uio_offset & (blksize - 1); 385 base_offset = uio->uio_offset - offset; 386 387 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 388 break; 389 390 /* 391 * MPSAFE 392 */ 393 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 394 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 395 bp->b_flags &= ~B_AGE; 396 error = 0; 397 goto skip; 398 } 399 if (ap->a_ioflag & IO_NRDELAY) { 400 bqrelse(bp); 401 return (EWOULDBLOCK); 402 } 403 404 /* 405 * MPUNSAFE 406 */ 407 if (got_fstoken == 0) { 408 lwkt_gettoken(&hmp->fs_token); 409 got_fstoken = 1; 410 hammer_start_transaction(&trans, ip->hmp); 411 } 412 413 /* 414 * NOTE: A valid bp has already been acquired, but was not 415 * B_CACHE. 416 */ 417 if (hammer_cluster_enable) { 418 /* 419 * Use file_limit to prevent cluster_read() from 420 * creating buffers of the wrong block size past 421 * the demarc. 
422 */ 423 file_limit = ip->ino_data.size; 424 if (base_offset < HAMMER_XDEMARC && 425 file_limit > HAMMER_XDEMARC) { 426 file_limit = HAMMER_XDEMARC; 427 } 428 error = cluster_readx(ap->a_vp, 429 file_limit, base_offset, 430 blksize, uio->uio_resid, 431 seqcount * BKVASIZE, &bp); 432 } else { 433 error = breadnx(ap->a_vp, base_offset, blksize, 434 NULL, NULL, 0, &bp); 435 } 436 if (error) { 437 brelse(bp); 438 break; 439 } 440 skip: 441 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { 442 kprintf("doff %016jx read file %016jx@%016jx\n", 443 (intmax_t)bp->b_bio2.bio_offset, 444 (intmax_t)ip->obj_id, 445 (intmax_t)bp->b_loffset); 446 } 447 bp->b_flags &= ~B_IODEBUG; 448 if (blksize == HAMMER_XBUFSIZE) 449 bp->b_flags |= B_CLUSTEROK; 450 451 n = blksize - offset; 452 if (n > uio->uio_resid) 453 n = uio->uio_resid; 454 if (n > ip->ino_data.size - uio->uio_offset) 455 n = (int)(ip->ino_data.size - uio->uio_offset); 456 if (got_fstoken) 457 lwkt_reltoken(&hmp->fs_token); 458 459 /* 460 * Set B_AGE, data has a lower priority than meta-data. 461 * 462 * Use a hold/unlock/drop sequence to run the uiomove 463 * with the buffer unlocked, avoiding deadlocks against 464 * read()s on mmap()'d spaces. 465 */ 466 bp->b_flags |= B_AGE; 467 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio); 468 bqrelse(bp); 469 470 if (got_fstoken) 471 lwkt_gettoken(&hmp->fs_token); 472 473 if (error) 474 break; 475 hammer_stats_file_read += n; 476 } 477 478 finished: 479 480 /* 481 * Try to update the atime with just the inode lock for maximum 482 * concurrency. If we can't shortcut it we have to get the full 483 * blown transaction. 
484 */ 485 if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) { 486 lwkt_gettoken(&hmp->fs_token); 487 got_fstoken = 1; 488 hammer_start_transaction(&trans, ip->hmp); 489 } 490 491 if (got_fstoken) { 492 if ((ip->flags & HAMMER_INODE_RO) == 0 && 493 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 494 ip->ino_data.atime = trans.time; 495 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 496 } 497 hammer_done_transaction(&trans); 498 lwkt_reltoken(&hmp->fs_token); 499 } 500 return (error); 501 } 502 503 /* 504 * hammer_vop_write { vp, uio, ioflag, cred } 505 */ 506 static 507 int 508 hammer_vop_write(struct vop_write_args *ap) 509 { 510 struct hammer_transaction trans; 511 struct hammer_inode *ip; 512 hammer_mount_t hmp; 513 thread_t td; 514 struct uio *uio; 515 int offset; 516 off_t base_offset; 517 int64_t cluster_eof; 518 struct buf *bp; 519 int kflags; 520 int error; 521 int n; 522 int flags; 523 int seqcount; 524 int bigwrite; 525 526 if (ap->a_vp->v_type != VREG) 527 return (EINVAL); 528 ip = VTOI(ap->a_vp); 529 hmp = ip->hmp; 530 error = 0; 531 kflags = 0; 532 seqcount = ap->a_ioflag >> 16; 533 534 if (ip->flags & HAMMER_INODE_RO) 535 return (EROFS); 536 537 /* 538 * Create a transaction to cover the operations we perform. 539 */ 540 lwkt_gettoken(&hmp->fs_token); 541 hammer_start_transaction(&trans, hmp); 542 uio = ap->a_uio; 543 544 /* 545 * Check append mode 546 */ 547 if (ap->a_ioflag & IO_APPEND) 548 uio->uio_offset = ip->ino_data.size; 549 550 /* 551 * Check for illegal write offsets. Valid range is 0...2^63-1. 552 * 553 * NOTE: the base_off assignment is required to work around what 554 * I consider to be a GCC-4 optimization bug. 
555 */ 556 if (uio->uio_offset < 0) { 557 hammer_done_transaction(&trans); 558 lwkt_reltoken(&hmp->fs_token); 559 return (EFBIG); 560 } 561 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 562 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 563 hammer_done_transaction(&trans); 564 lwkt_reltoken(&hmp->fs_token); 565 return (EFBIG); 566 } 567 568 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 569 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 570 hammer_done_transaction(&trans); 571 lwkt_reltoken(&hmp->fs_token); 572 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 573 return (EFBIG); 574 } 575 576 /* 577 * If reading or writing a huge amount of data we have to break 578 * atomicy and allow the operation to be interrupted by a signal 579 * or it can DOS the machine. 580 * 581 * Preset redo_count so we stop generating REDOs earlier if the 582 * limit is exceeded. 583 */ 584 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 585 if ((ip->flags & HAMMER_INODE_REDO) && 586 ip->redo_count < hammer_limit_redo) { 587 ip->redo_count += uio->uio_resid; 588 } 589 590 /* 591 * Access the data typically in HAMMER_BUFSIZE blocks via the 592 * buffer cache, but HAMMER may use a variable block size based 593 * on the offset. 594 */ 595 while (uio->uio_resid > 0) { 596 int fixsize = 0; 597 int blksize; 598 int blkmask; 599 int trivial; 600 int endofblk; 601 off_t nsize; 602 603 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 604 break; 605 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 606 break; 607 608 blksize = hammer_blocksize(uio->uio_offset); 609 610 /* 611 * Do not allow HAMMER to blow out the buffer cache. Very 612 * large UIOs can lockout other processes due to bwillwrite() 613 * mechanics. 614 * 615 * The hammer inode is not locked during these operations. 
616 * The vnode is locked which can interfere with the pageout 617 * daemon for non-UIO_NOCOPY writes but should not interfere 618 * with the buffer cache. Even so, we cannot afford to 619 * allow the pageout daemon to build up too many dirty buffer 620 * cache buffers. 621 * 622 * Only call this if we aren't being recursively called from 623 * a virtual disk device (vn), else we may deadlock. 624 */ 625 if ((ap->a_ioflag & IO_RECURSE) == 0) 626 bwillwrite(blksize); 627 628 /* 629 * Control the number of pending records associated with 630 * this inode. If too many have accumulated start a 631 * flush. Try to maintain a pipeline with the flusher. 632 * 633 * NOTE: It is possible for other sources to grow the 634 * records but not necessarily issue another flush, 635 * so use a timeout and ensure that a re-flush occurs. 636 */ 637 if (ip->rsv_recs >= hammer_limit_inode_recs) { 638 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 639 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 640 ip->flags |= HAMMER_INODE_RECSW; 641 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 642 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 643 } 644 } 645 646 #if 0 647 /* 648 * Do not allow HAMMER to blow out system memory by 649 * accumulating too many records. Records are so well 650 * decoupled from the buffer cache that it is possible 651 * for userland to push data out to the media via 652 * direct-write, but build up the records queued to the 653 * backend faster then the backend can flush them out. 654 * HAMMER has hit its write limit but the frontend has 655 * no pushback to slow it down. 656 */ 657 if (hmp->rsv_recs > hammer_limit_recs / 2) { 658 /* 659 * Get the inode on the flush list 660 */ 661 if (ip->rsv_recs >= 64) 662 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 663 else if (ip->rsv_recs >= 16) 664 hammer_flush_inode(ip, 0); 665 666 /* 667 * Keep the flusher going if the system keeps 668 * queueing records. 
669 */ 670 delta = hmp->count_newrecords - 671 hmp->last_newrecords; 672 if (delta < 0 || delta > hammer_limit_recs / 2) { 673 hmp->last_newrecords = hmp->count_newrecords; 674 hammer_sync_hmp(hmp, MNT_NOWAIT); 675 } 676 677 /* 678 * If we have gotten behind start slowing 679 * down the writers. 680 */ 681 delta = (hmp->rsv_recs - hammer_limit_recs) * 682 hz / hammer_limit_recs; 683 if (delta > 0) 684 tsleep(&trans, 0, "hmrslo", delta); 685 } 686 #endif 687 688 /* 689 * Calculate the blocksize at the current offset and figure 690 * out how much we can actually write. 691 */ 692 blkmask = blksize - 1; 693 offset = (int)uio->uio_offset & blkmask; 694 base_offset = uio->uio_offset & ~(int64_t)blkmask; 695 n = blksize - offset; 696 if (n > uio->uio_resid) { 697 n = uio->uio_resid; 698 endofblk = 0; 699 } else { 700 endofblk = 1; 701 } 702 nsize = uio->uio_offset + n; 703 if (nsize > ip->ino_data.size) { 704 if (uio->uio_offset > ip->ino_data.size) 705 trivial = 0; 706 else 707 trivial = 1; 708 nvextendbuf(ap->a_vp, 709 ip->ino_data.size, 710 nsize, 711 hammer_blocksize(ip->ino_data.size), 712 hammer_blocksize(nsize), 713 hammer_blockoff(ip->ino_data.size), 714 hammer_blockoff(nsize), 715 trivial); 716 fixsize = 1; 717 kflags |= NOTE_EXTEND; 718 } 719 720 if (uio->uio_segflg == UIO_NOCOPY) { 721 /* 722 * Issuing a write with the same data backing the 723 * buffer. Instantiate the buffer to collect the 724 * backing vm pages, then read-in any missing bits. 725 * 726 * This case is used by vop_stdputpages(). 727 */ 728 bp = getblk(ap->a_vp, base_offset, 729 blksize, GETBLK_BHEAVY, 0); 730 if ((bp->b_flags & B_CACHE) == 0) { 731 bqrelse(bp); 732 error = bread(ap->a_vp, base_offset, 733 blksize, &bp); 734 } 735 } else if (offset == 0 && uio->uio_resid >= blksize) { 736 /* 737 * Even though we are entirely overwriting the buffer 738 * we may still have to zero it out to avoid a 739 * mmap/write visibility issue. 
740 */ 741 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 742 if ((bp->b_flags & B_CACHE) == 0) 743 vfs_bio_clrbuf(bp); 744 } else if (base_offset >= ip->ino_data.size) { 745 /* 746 * If the base offset of the buffer is beyond the 747 * file EOF, we don't have to issue a read. 748 */ 749 bp = getblk(ap->a_vp, base_offset, 750 blksize, GETBLK_BHEAVY, 0); 751 vfs_bio_clrbuf(bp); 752 } else { 753 /* 754 * Partial overwrite, read in any missing bits then 755 * replace the portion being written. 756 */ 757 error = bread(ap->a_vp, base_offset, blksize, &bp); 758 if (error == 0) 759 bheavy(bp); 760 } 761 if (error == 0) { 762 lwkt_reltoken(&hmp->fs_token); 763 error = uiomovebp(bp, bp->b_data + offset, n, uio); 764 lwkt_gettoken(&hmp->fs_token); 765 } 766 767 /* 768 * Generate REDO records if enabled and redo_count will not 769 * exceeded the limit. 770 * 771 * If redo_count exceeds the limit we stop generating records 772 * and clear HAMMER_INODE_REDO. This will cause the next 773 * fsync() to do a full meta-data sync instead of just an 774 * UNDO/REDO fifo update. 775 * 776 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 777 * will still be tracked. The tracks will be terminated 778 * when the related meta-data (including possible data 779 * modifications which are not tracked via REDO) is 780 * flushed. 781 */ 782 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 783 if (ip->redo_count < hammer_limit_redo) { 784 bp->b_flags |= B_VFSFLAG1; 785 error = hammer_generate_redo(&trans, ip, 786 base_offset + offset, 787 HAMMER_REDO_WRITE, 788 bp->b_data + offset, 789 (size_t)n); 790 } else { 791 ip->flags &= ~HAMMER_INODE_REDO; 792 } 793 } 794 795 /* 796 * If we screwed up we have to undo any VM size changes we 797 * made. 
798 */ 799 if (error) { 800 brelse(bp); 801 if (fixsize) { 802 nvtruncbuf(ap->a_vp, ip->ino_data.size, 803 hammer_blocksize(ip->ino_data.size), 804 hammer_blockoff(ip->ino_data.size), 805 0); 806 } 807 break; 808 } 809 kflags |= NOTE_WRITE; 810 hammer_stats_file_write += n; 811 if (blksize == HAMMER_XBUFSIZE) 812 bp->b_flags |= B_CLUSTEROK; 813 if (ip->ino_data.size < uio->uio_offset) { 814 ip->ino_data.size = uio->uio_offset; 815 flags = HAMMER_INODE_SDIRTY; 816 } else { 817 flags = 0; 818 } 819 ip->ino_data.mtime = trans.time; 820 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 821 hammer_modify_inode(&trans, ip, flags); 822 823 /* 824 * Once we dirty the buffer any cached zone-X offset 825 * becomes invalid. HAMMER NOTE: no-history mode cannot 826 * allow overwriting over the same data sector unless 827 * we provide UNDOs for the old data, which we don't. 828 */ 829 bp->b_bio2.bio_offset = NOOFFSET; 830 831 /* 832 * Final buffer disposition. 833 * 834 * Because meta-data updates are deferred, HAMMER is 835 * especially sensitive to excessive bdwrite()s because 836 * the I/O stream is not broken up by disk reads. So the 837 * buffer cache simply cannot keep up. 838 * 839 * WARNING! blksize is variable. cluster_write() is 840 * expected to not blow up if it encounters 841 * buffers that do not match the passed blksize. 842 * 843 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 844 * The ip->rsv_recs check should burst-flush the data. 845 * If we queue it immediately the buf could be left 846 * locked on the device queue for a very long time. 847 * 848 * However, failing to flush a dirty buffer out when 849 * issued from the pageout daemon can result in a low 850 * memory deadlock against bio_page_alloc(), so we 851 * have to bawrite() on IO_ASYNC as well. 852 * 853 * NOTE! To avoid degenerate stalls due to mismatched block 854 * sizes we only honor IO_DIRECT on the write which 855 * abuts the end of the buffer. 
However, we must 856 * honor IO_SYNC in case someone is silly enough to 857 * configure a HAMMER file as swap, or when HAMMER 858 * is serving NFS (for commits). Ick ick. 859 */ 860 bp->b_flags |= B_AGE; 861 if (blksize == HAMMER_XBUFSIZE) 862 bp->b_flags |= B_CLUSTEROK; 863 864 if (ap->a_ioflag & IO_SYNC) { 865 bwrite(bp); 866 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 867 bawrite(bp); 868 } else if (ap->a_ioflag & IO_ASYNC) { 869 bawrite(bp); 870 } else if (hammer_cluster_enable && 871 !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 872 if (base_offset < HAMMER_XDEMARC) 873 cluster_eof = hammer_blockdemarc(base_offset, 874 ip->ino_data.size); 875 else 876 cluster_eof = ip->ino_data.size; 877 cluster_write(bp, cluster_eof, blksize, seqcount); 878 } else { 879 bdwrite(bp); 880 } 881 } 882 hammer_done_transaction(&trans); 883 hammer_knote(ap->a_vp, kflags); 884 lwkt_reltoken(&hmp->fs_token); 885 return (error); 886 } 887 888 /* 889 * hammer_vop_access { vp, mode, cred } 890 * 891 * MPSAFE - does not require fs_token 892 */ 893 static 894 int 895 hammer_vop_access(struct vop_access_args *ap) 896 { 897 struct hammer_inode *ip = VTOI(ap->a_vp); 898 uid_t uid; 899 gid_t gid; 900 int error; 901 902 ++hammer_stats_file_iopsr; 903 uid = hammer_to_unix_xid(&ip->ino_data.uid); 904 gid = hammer_to_unix_xid(&ip->ino_data.gid); 905 906 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 907 ip->ino_data.uflags); 908 return (error); 909 } 910 911 /* 912 * hammer_vop_advlock { vp, id, op, fl, flags } 913 * 914 * MPSAFE - does not require fs_token 915 */ 916 static 917 int 918 hammer_vop_advlock(struct vop_advlock_args *ap) 919 { 920 hammer_inode_t ip = VTOI(ap->a_vp); 921 922 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 923 } 924 925 /* 926 * hammer_vop_close { vp, fflag } 927 * 928 * We can only sync-on-close for normal closes. XXX disabled for now. 
929 */ 930 static 931 int 932 hammer_vop_close(struct vop_close_args *ap) 933 { 934 #if 0 935 struct vnode *vp = ap->a_vp; 936 hammer_inode_t ip = VTOI(vp); 937 int waitfor; 938 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 939 if (vn_islocked(vp) == LK_EXCLUSIVE && 940 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 941 if (ip->flags & HAMMER_INODE_CLOSESYNC) 942 waitfor = MNT_WAIT; 943 else 944 waitfor = MNT_NOWAIT; 945 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 946 HAMMER_INODE_CLOSEASYNC); 947 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 948 } 949 } 950 #endif 951 return (vop_stdclose(ap)); 952 } 953 954 /* 955 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 956 * 957 * The operating system has already ensured that the directory entry 958 * does not exist and done all appropriate namespace locking. 959 */ 960 static 961 int 962 hammer_vop_ncreate(struct vop_ncreate_args *ap) 963 { 964 struct hammer_transaction trans; 965 struct hammer_inode *dip; 966 struct hammer_inode *nip; 967 struct nchandle *nch; 968 hammer_mount_t hmp; 969 int error; 970 971 nch = ap->a_nch; 972 dip = VTOI(ap->a_dvp); 973 hmp = dip->hmp; 974 975 if (dip->flags & HAMMER_INODE_RO) 976 return (EROFS); 977 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 978 return (error); 979 980 /* 981 * Create a transaction to cover the operations we perform. 982 */ 983 lwkt_gettoken(&hmp->fs_token); 984 hammer_start_transaction(&trans, hmp); 985 ++hammer_stats_file_iopsw; 986 987 /* 988 * Create a new filesystem object of the requested type. The 989 * returned inode will be referenced and shared-locked to prevent 990 * it from being moved to the flusher. 
991 */ 992 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 993 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 994 NULL, &nip); 995 if (error) { 996 hkprintf("hammer_create_inode error %d\n", error); 997 hammer_done_transaction(&trans); 998 *ap->a_vpp = NULL; 999 lwkt_reltoken(&hmp->fs_token); 1000 return (error); 1001 } 1002 1003 /* 1004 * Add the new filesystem object to the directory. This will also 1005 * bump the inode's link count. 1006 */ 1007 error = hammer_ip_add_directory(&trans, dip, 1008 nch->ncp->nc_name, nch->ncp->nc_nlen, 1009 nip); 1010 if (error) 1011 hkprintf("hammer_ip_add_directory error %d\n", error); 1012 1013 /* 1014 * Finish up. 1015 */ 1016 if (error) { 1017 hammer_rel_inode(nip, 0); 1018 hammer_done_transaction(&trans); 1019 *ap->a_vpp = NULL; 1020 } else { 1021 error = hammer_get_vnode(nip, ap->a_vpp); 1022 hammer_done_transaction(&trans); 1023 hammer_rel_inode(nip, 0); 1024 if (error == 0) { 1025 cache_setunresolved(ap->a_nch); 1026 cache_setvp(ap->a_nch, *ap->a_vpp); 1027 } 1028 hammer_knote(ap->a_dvp, NOTE_WRITE); 1029 } 1030 lwkt_reltoken(&hmp->fs_token); 1031 return (error); 1032 } 1033 1034 /* 1035 * hammer_vop_getattr { vp, vap } 1036 * 1037 * Retrieve an inode's attribute information. When accessing inodes 1038 * historically we fake the atime field to ensure consistent results. 1039 * The atime field is stored in the B-Tree element and allowed to be 1040 * updated without cycling the element. 1041 * 1042 * MPSAFE - does not require fs_token 1043 */ 1044 static 1045 int 1046 hammer_vop_getattr(struct vop_getattr_args *ap) 1047 { 1048 struct hammer_inode *ip = VTOI(ap->a_vp); 1049 struct vattr *vap = ap->a_vap; 1050 1051 /* 1052 * We want the fsid to be different when accessing a filesystem 1053 * with different as-of's so programs like diff don't think 1054 * the files are the same. 
1055 * 1056 * We also want the fsid to be the same when comparing snapshots, 1057 * or when comparing mirrors (which might be backed by different 1058 * physical devices). HAMMER fsids are based on the PFS's 1059 * shared_uuid field. 1060 * 1061 * XXX there is a chance of collision here. The va_fsid reported 1062 * by stat is different from the more involved fsid used in the 1063 * mount structure. 1064 */ 1065 ++hammer_stats_file_iopsr; 1066 hammer_lock_sh(&ip->lock); 1067 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 1068 (u_int32_t)(ip->obj_asof >> 32); 1069 1070 vap->va_fileid = ip->ino_leaf.base.obj_id; 1071 vap->va_mode = ip->ino_data.mode; 1072 vap->va_nlink = ip->ino_data.nlinks; 1073 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1074 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1075 vap->va_rmajor = 0; 1076 vap->va_rminor = 0; 1077 vap->va_size = ip->ino_data.size; 1078 1079 /* 1080 * Special case for @@PFS softlinks. The actual size of the 1081 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1082 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1083 */ 1084 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1085 ip->ino_data.size == 10 && 1086 ip->obj_asof == HAMMER_MAX_TID && 1087 ip->obj_localization == 0 && 1088 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1089 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1090 vap->va_size = 26; 1091 else 1092 vap->va_size = 10; 1093 } 1094 1095 /* 1096 * We must provide a consistent atime and mtime for snapshots 1097 * so people can do a 'tar cf - ... | md5' on them and get 1098 * consistent results. 
1099 */ 1100 if (ip->flags & HAMMER_INODE_RO) { 1101 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1102 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1103 } else { 1104 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1105 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1106 } 1107 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1108 vap->va_flags = ip->ino_data.uflags; 1109 vap->va_gen = 1; /* hammer inums are unique for all time */ 1110 vap->va_blocksize = HAMMER_BUFSIZE; 1111 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1112 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1113 ~HAMMER_XBUFMASK64; 1114 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 1115 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1116 ~HAMMER_BUFMASK64; 1117 } else { 1118 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1119 } 1120 1121 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1122 vap->va_filerev = 0; /* XXX */ 1123 vap->va_uid_uuid = ip->ino_data.uid; 1124 vap->va_gid_uuid = ip->ino_data.gid; 1125 vap->va_fsid_uuid = ip->hmp->fsid; 1126 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1127 VA_FSID_UUID_VALID; 1128 1129 switch (ip->ino_data.obj_type) { 1130 case HAMMER_OBJTYPE_CDEV: 1131 case HAMMER_OBJTYPE_BDEV: 1132 vap->va_rmajor = ip->ino_data.rmajor; 1133 vap->va_rminor = ip->ino_data.rminor; 1134 break; 1135 default: 1136 break; 1137 } 1138 hammer_unlock(&ip->lock); 1139 return(0); 1140 } 1141 1142 /* 1143 * hammer_vop_nresolve { nch, dvp, cred } 1144 * 1145 * Locate the requested directory entry. 
1146 */ 1147 static 1148 int 1149 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1150 { 1151 struct hammer_transaction trans; 1152 struct namecache *ncp; 1153 hammer_mount_t hmp; 1154 hammer_inode_t dip; 1155 hammer_inode_t ip; 1156 hammer_tid_t asof; 1157 struct hammer_cursor cursor; 1158 struct vnode *vp; 1159 int64_t namekey; 1160 int error; 1161 int i; 1162 int nlen; 1163 int flags; 1164 int ispfs; 1165 int64_t obj_id; 1166 u_int32_t localization; 1167 u_int32_t max_iterations; 1168 1169 /* 1170 * Misc initialization, plus handle as-of name extensions. Look for 1171 * the '@@' extension. Note that as-of files and directories cannot 1172 * be modified. 1173 */ 1174 dip = VTOI(ap->a_dvp); 1175 ncp = ap->a_nch->ncp; 1176 asof = dip->obj_asof; 1177 localization = dip->obj_localization; /* for code consistency */ 1178 nlen = ncp->nc_nlen; 1179 flags = dip->flags & HAMMER_INODE_RO; 1180 ispfs = 0; 1181 hmp = dip->hmp; 1182 1183 lwkt_gettoken(&hmp->fs_token); 1184 hammer_simple_transaction(&trans, hmp); 1185 ++hammer_stats_file_iopsr; 1186 1187 for (i = 0; i < nlen; ++i) { 1188 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1189 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1190 &ispfs, &asof, &localization); 1191 if (error != 0) { 1192 i = nlen; 1193 break; 1194 } 1195 if (asof != HAMMER_MAX_TID) 1196 flags |= HAMMER_INODE_RO; 1197 break; 1198 } 1199 } 1200 nlen = i; 1201 1202 /* 1203 * If this is a PFS softlink we dive into the PFS 1204 */ 1205 if (ispfs && nlen == 0) { 1206 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1207 asof, localization, 1208 flags, &error); 1209 if (error == 0) { 1210 error = hammer_get_vnode(ip, &vp); 1211 hammer_rel_inode(ip, 0); 1212 } else { 1213 vp = NULL; 1214 } 1215 if (error == 0) { 1216 vn_unlock(vp); 1217 cache_setvp(ap->a_nch, vp); 1218 vrele(vp); 1219 } 1220 goto done; 1221 } 1222 1223 /* 1224 * If there is no path component the time extension is relative to dip. 1225 * e.g. 
"fubar/@@<snapshot>" 1226 * 1227 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1228 * e.g. "fubar/.@@<snapshot>" 1229 * 1230 * ".." is handled by the kernel. We do not currently handle 1231 * "..@<snapshot>". 1232 */ 1233 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1234 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1235 asof, dip->obj_localization, 1236 flags, &error); 1237 if (error == 0) { 1238 error = hammer_get_vnode(ip, &vp); 1239 hammer_rel_inode(ip, 0); 1240 } else { 1241 vp = NULL; 1242 } 1243 if (error == 0) { 1244 vn_unlock(vp); 1245 cache_setvp(ap->a_nch, vp); 1246 vrele(vp); 1247 } 1248 goto done; 1249 } 1250 1251 /* 1252 * Calculate the namekey and setup the key range for the scan. This 1253 * works kinda like a chained hash table where the lower 32 bits 1254 * of the namekey synthesize the chain. 1255 * 1256 * The key range is inclusive of both key_beg and key_end. 1257 */ 1258 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 1259 &max_iterations); 1260 1261 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1262 cursor.key_beg.localization = dip->obj_localization + 1263 hammer_dir_localization(dip); 1264 cursor.key_beg.obj_id = dip->obj_id; 1265 cursor.key_beg.key = namekey; 1266 cursor.key_beg.create_tid = 0; 1267 cursor.key_beg.delete_tid = 0; 1268 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1269 cursor.key_beg.obj_type = 0; 1270 1271 cursor.key_end = cursor.key_beg; 1272 cursor.key_end.key += max_iterations; 1273 cursor.asof = asof; 1274 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1275 1276 /* 1277 * Scan all matching records (the chain), locate the one matching 1278 * the requested path component. 1279 * 1280 * The hammer_ip_*() functions merge in-memory records with on-disk 1281 * records for the purposes of the search. 
1282 */ 1283 obj_id = 0; 1284 localization = HAMMER_DEF_LOCALIZATION; 1285 1286 if (error == 0) { 1287 error = hammer_ip_first(&cursor); 1288 while (error == 0) { 1289 error = hammer_ip_resolve_data(&cursor); 1290 if (error) 1291 break; 1292 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1293 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1294 obj_id = cursor.data->entry.obj_id; 1295 localization = cursor.data->entry.localization; 1296 break; 1297 } 1298 error = hammer_ip_next(&cursor); 1299 } 1300 } 1301 hammer_done_cursor(&cursor); 1302 1303 /* 1304 * Lookup the obj_id. This should always succeed. If it does not 1305 * the filesystem may be damaged and we return a dummy inode. 1306 */ 1307 if (error == 0) { 1308 ip = hammer_get_inode(&trans, dip, obj_id, 1309 asof, localization, 1310 flags, &error); 1311 if (error == ENOENT) { 1312 kprintf("HAMMER: WARNING: Missing " 1313 "inode for dirent \"%s\"\n" 1314 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n", 1315 ncp->nc_name, 1316 (long long)obj_id, (long long)asof, 1317 localization); 1318 error = 0; 1319 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1320 asof, localization, 1321 flags, &error); 1322 } 1323 if (error == 0) { 1324 error = hammer_get_vnode(ip, &vp); 1325 hammer_rel_inode(ip, 0); 1326 } else { 1327 vp = NULL; 1328 } 1329 if (error == 0) { 1330 vn_unlock(vp); 1331 cache_setvp(ap->a_nch, vp); 1332 vrele(vp); 1333 } 1334 } else if (error == ENOENT) { 1335 cache_setvp(ap->a_nch, NULL); 1336 } 1337 done: 1338 hammer_done_transaction(&trans); 1339 lwkt_reltoken(&hmp->fs_token); 1340 return (error); 1341 } 1342 1343 /* 1344 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1345 * 1346 * Locate the parent directory of a directory vnode. 1347 * 1348 * dvp is referenced but not locked. *vpp must be returned referenced and 1349 * locked. 
A parent_obj_id of 0 does not necessarily indicate that we are 1350 * at the root, instead it could indicate that the directory we were in was 1351 * removed. 1352 * 1353 * NOTE: as-of sequences are not linked into the directory structure. If 1354 * we are at the root with a different asof then the mount point, reload 1355 * the same directory with the mount point's asof. I'm not sure what this 1356 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1357 * get confused, but it hasn't been tested. 1358 */ 1359 static 1360 int 1361 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1362 { 1363 struct hammer_transaction trans; 1364 struct hammer_inode *dip; 1365 struct hammer_inode *ip; 1366 hammer_mount_t hmp; 1367 int64_t parent_obj_id; 1368 u_int32_t parent_obj_localization; 1369 hammer_tid_t asof; 1370 int error; 1371 1372 dip = VTOI(ap->a_dvp); 1373 asof = dip->obj_asof; 1374 hmp = dip->hmp; 1375 1376 /* 1377 * Whos are parent? This could be the root of a pseudo-filesystem 1378 * whos parent is in another localization domain. 
1379 */ 1380 lwkt_gettoken(&hmp->fs_token); 1381 parent_obj_id = dip->ino_data.parent_obj_id; 1382 if (dip->obj_id == HAMMER_OBJID_ROOT) 1383 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1384 else 1385 parent_obj_localization = dip->obj_localization; 1386 1387 if (parent_obj_id == 0) { 1388 if (dip->obj_id == HAMMER_OBJID_ROOT && 1389 asof != hmp->asof) { 1390 parent_obj_id = dip->obj_id; 1391 asof = hmp->asof; 1392 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1393 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1394 (long long)dip->obj_asof); 1395 } else { 1396 *ap->a_vpp = NULL; 1397 lwkt_reltoken(&hmp->fs_token); 1398 return ENOENT; 1399 } 1400 } 1401 1402 hammer_simple_transaction(&trans, hmp); 1403 ++hammer_stats_file_iopsr; 1404 1405 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1406 asof, parent_obj_localization, 1407 dip->flags, &error); 1408 if (ip) { 1409 error = hammer_get_vnode(ip, ap->a_vpp); 1410 hammer_rel_inode(ip, 0); 1411 } else { 1412 *ap->a_vpp = NULL; 1413 } 1414 hammer_done_transaction(&trans); 1415 lwkt_reltoken(&hmp->fs_token); 1416 return (error); 1417 } 1418 1419 /* 1420 * hammer_vop_nlink { nch, dvp, vp, cred } 1421 */ 1422 static 1423 int 1424 hammer_vop_nlink(struct vop_nlink_args *ap) 1425 { 1426 struct hammer_transaction trans; 1427 struct hammer_inode *dip; 1428 struct hammer_inode *ip; 1429 struct nchandle *nch; 1430 hammer_mount_t hmp; 1431 int error; 1432 1433 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1434 return(EXDEV); 1435 1436 nch = ap->a_nch; 1437 dip = VTOI(ap->a_dvp); 1438 ip = VTOI(ap->a_vp); 1439 hmp = dip->hmp; 1440 1441 if (dip->obj_localization != ip->obj_localization) 1442 return(EXDEV); 1443 1444 if (dip->flags & HAMMER_INODE_RO) 1445 return (EROFS); 1446 if (ip->flags & HAMMER_INODE_RO) 1447 return (EROFS); 1448 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1449 return (error); 1450 1451 /* 1452 * Create a transaction to cover the operations we perform. 
1453 */ 1454 lwkt_gettoken(&hmp->fs_token); 1455 hammer_start_transaction(&trans, hmp); 1456 ++hammer_stats_file_iopsw; 1457 1458 /* 1459 * Add the filesystem object to the directory. Note that neither 1460 * dip nor ip are referenced or locked, but their vnodes are 1461 * referenced. This function will bump the inode's link count. 1462 */ 1463 error = hammer_ip_add_directory(&trans, dip, 1464 nch->ncp->nc_name, nch->ncp->nc_nlen, 1465 ip); 1466 1467 /* 1468 * Finish up. 1469 */ 1470 if (error == 0) { 1471 cache_setunresolved(nch); 1472 cache_setvp(nch, ap->a_vp); 1473 } 1474 hammer_done_transaction(&trans); 1475 hammer_knote(ap->a_vp, NOTE_LINK); 1476 hammer_knote(ap->a_dvp, NOTE_WRITE); 1477 lwkt_reltoken(&hmp->fs_token); 1478 return (error); 1479 } 1480 1481 /* 1482 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1483 * 1484 * The operating system has already ensured that the directory entry 1485 * does not exist and done all appropriate namespace locking. 1486 */ 1487 static 1488 int 1489 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1490 { 1491 struct hammer_transaction trans; 1492 struct hammer_inode *dip; 1493 struct hammer_inode *nip; 1494 struct nchandle *nch; 1495 hammer_mount_t hmp; 1496 int error; 1497 1498 nch = ap->a_nch; 1499 dip = VTOI(ap->a_dvp); 1500 hmp = dip->hmp; 1501 1502 if (dip->flags & HAMMER_INODE_RO) 1503 return (EROFS); 1504 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1505 return (error); 1506 1507 /* 1508 * Create a transaction to cover the operations we perform. 1509 */ 1510 lwkt_gettoken(&hmp->fs_token); 1511 hammer_start_transaction(&trans, hmp); 1512 ++hammer_stats_file_iopsw; 1513 1514 /* 1515 * Create a new filesystem object of the requested type. The 1516 * returned inode will be referenced but not locked. 
1517 */ 1518 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1519 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1520 NULL, &nip); 1521 if (error) { 1522 hkprintf("hammer_mkdir error %d\n", error); 1523 hammer_done_transaction(&trans); 1524 *ap->a_vpp = NULL; 1525 lwkt_reltoken(&hmp->fs_token); 1526 return (error); 1527 } 1528 /* 1529 * Add the new filesystem object to the directory. This will also 1530 * bump the inode's link count. 1531 */ 1532 error = hammer_ip_add_directory(&trans, dip, 1533 nch->ncp->nc_name, nch->ncp->nc_nlen, 1534 nip); 1535 if (error) 1536 hkprintf("hammer_mkdir (add) error %d\n", error); 1537 1538 /* 1539 * Finish up. 1540 */ 1541 if (error) { 1542 hammer_rel_inode(nip, 0); 1543 *ap->a_vpp = NULL; 1544 } else { 1545 error = hammer_get_vnode(nip, ap->a_vpp); 1546 hammer_rel_inode(nip, 0); 1547 if (error == 0) { 1548 cache_setunresolved(ap->a_nch); 1549 cache_setvp(ap->a_nch, *ap->a_vpp); 1550 } 1551 } 1552 hammer_done_transaction(&trans); 1553 if (error == 0) 1554 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1555 lwkt_reltoken(&hmp->fs_token); 1556 return (error); 1557 } 1558 1559 /* 1560 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1561 * 1562 * The operating system has already ensured that the directory entry 1563 * does not exist and done all appropriate namespace locking. 1564 */ 1565 static 1566 int 1567 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1568 { 1569 struct hammer_transaction trans; 1570 struct hammer_inode *dip; 1571 struct hammer_inode *nip; 1572 struct nchandle *nch; 1573 hammer_mount_t hmp; 1574 int error; 1575 1576 nch = ap->a_nch; 1577 dip = VTOI(ap->a_dvp); 1578 hmp = dip->hmp; 1579 1580 if (dip->flags & HAMMER_INODE_RO) 1581 return (EROFS); 1582 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1583 return (error); 1584 1585 /* 1586 * Create a transaction to cover the operations we perform. 
1587 */ 1588 lwkt_gettoken(&hmp->fs_token); 1589 hammer_start_transaction(&trans, hmp); 1590 ++hammer_stats_file_iopsw; 1591 1592 /* 1593 * Create a new filesystem object of the requested type. The 1594 * returned inode will be referenced but not locked. 1595 * 1596 * If mknod specifies a directory a pseudo-fs is created. 1597 */ 1598 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1599 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1600 NULL, &nip); 1601 if (error) { 1602 hammer_done_transaction(&trans); 1603 *ap->a_vpp = NULL; 1604 lwkt_reltoken(&hmp->fs_token); 1605 return (error); 1606 } 1607 1608 /* 1609 * Add the new filesystem object to the directory. This will also 1610 * bump the inode's link count. 1611 */ 1612 error = hammer_ip_add_directory(&trans, dip, 1613 nch->ncp->nc_name, nch->ncp->nc_nlen, 1614 nip); 1615 1616 /* 1617 * Finish up. 1618 */ 1619 if (error) { 1620 hammer_rel_inode(nip, 0); 1621 *ap->a_vpp = NULL; 1622 } else { 1623 error = hammer_get_vnode(nip, ap->a_vpp); 1624 hammer_rel_inode(nip, 0); 1625 if (error == 0) { 1626 cache_setunresolved(ap->a_nch); 1627 cache_setvp(ap->a_nch, *ap->a_vpp); 1628 } 1629 } 1630 hammer_done_transaction(&trans); 1631 if (error == 0) 1632 hammer_knote(ap->a_dvp, NOTE_WRITE); 1633 lwkt_reltoken(&hmp->fs_token); 1634 return (error); 1635 } 1636 1637 /* 1638 * hammer_vop_open { vp, mode, cred, fp } 1639 * 1640 * MPSAFE (does not require fs_token) 1641 */ 1642 static 1643 int 1644 hammer_vop_open(struct vop_open_args *ap) 1645 { 1646 hammer_inode_t ip; 1647 1648 ++hammer_stats_file_iopsr; 1649 ip = VTOI(ap->a_vp); 1650 1651 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1652 return (EROFS); 1653 return(vop_stdopen(ap)); 1654 } 1655 1656 /* 1657 * hammer_vop_print { vp } 1658 */ 1659 static 1660 int 1661 hammer_vop_print(struct vop_print_args *ap) 1662 { 1663 return EOPNOTSUPP; 1664 } 1665 1666 /* 1667 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1668 */ 1669 
static 1670 int 1671 hammer_vop_readdir(struct vop_readdir_args *ap) 1672 { 1673 struct hammer_transaction trans; 1674 struct hammer_cursor cursor; 1675 struct hammer_inode *ip; 1676 hammer_mount_t hmp; 1677 struct uio *uio; 1678 hammer_base_elm_t base; 1679 int error; 1680 int cookie_index; 1681 int ncookies; 1682 off_t *cookies; 1683 off_t saveoff; 1684 int r; 1685 int dtype; 1686 1687 ++hammer_stats_file_iopsr; 1688 ip = VTOI(ap->a_vp); 1689 uio = ap->a_uio; 1690 saveoff = uio->uio_offset; 1691 hmp = ip->hmp; 1692 1693 if (ap->a_ncookies) { 1694 ncookies = uio->uio_resid / 16 + 1; 1695 if (ncookies > 1024) 1696 ncookies = 1024; 1697 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1698 cookie_index = 0; 1699 } else { 1700 ncookies = -1; 1701 cookies = NULL; 1702 cookie_index = 0; 1703 } 1704 1705 lwkt_gettoken(&hmp->fs_token); 1706 hammer_simple_transaction(&trans, hmp); 1707 1708 /* 1709 * Handle artificial entries 1710 * 1711 * It should be noted that the minimum value for a directory 1712 * hash key on-media is 0x0000000100000000, so we can use anything 1713 * less then that to represent our 'special' key space. 1714 */ 1715 error = 0; 1716 if (saveoff == 0) { 1717 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1718 if (r) 1719 goto done; 1720 if (cookies) 1721 cookies[cookie_index] = saveoff; 1722 ++saveoff; 1723 ++cookie_index; 1724 if (cookie_index == ncookies) 1725 goto done; 1726 } 1727 if (saveoff == 1) { 1728 if (ip->ino_data.parent_obj_id) { 1729 r = vop_write_dirent(&error, uio, 1730 ip->ino_data.parent_obj_id, 1731 DT_DIR, 2, ".."); 1732 } else { 1733 r = vop_write_dirent(&error, uio, 1734 ip->obj_id, DT_DIR, 2, ".."); 1735 } 1736 if (r) 1737 goto done; 1738 if (cookies) 1739 cookies[cookie_index] = saveoff; 1740 ++saveoff; 1741 ++cookie_index; 1742 if (cookie_index == ncookies) 1743 goto done; 1744 } 1745 1746 /* 1747 * Key range (begin and end inclusive) to scan. 
Directory keys 1748 * directly translate to a 64 bit 'seek' position. 1749 */ 1750 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1751 cursor.key_beg.localization = ip->obj_localization + 1752 hammer_dir_localization(ip); 1753 cursor.key_beg.obj_id = ip->obj_id; 1754 cursor.key_beg.create_tid = 0; 1755 cursor.key_beg.delete_tid = 0; 1756 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1757 cursor.key_beg.obj_type = 0; 1758 cursor.key_beg.key = saveoff; 1759 1760 cursor.key_end = cursor.key_beg; 1761 cursor.key_end.key = HAMMER_MAX_KEY; 1762 cursor.asof = ip->obj_asof; 1763 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1764 1765 error = hammer_ip_first(&cursor); 1766 1767 while (error == 0) { 1768 error = hammer_ip_resolve_data(&cursor); 1769 if (error) 1770 break; 1771 base = &cursor.leaf->base; 1772 saveoff = base->key; 1773 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1774 1775 if (base->obj_id != ip->obj_id) 1776 panic("readdir: bad record at %p", cursor.node); 1777 1778 /* 1779 * Convert pseudo-filesystems into softlinks 1780 */ 1781 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1782 r = vop_write_dirent( 1783 &error, uio, cursor.data->entry.obj_id, 1784 dtype, 1785 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1786 (void *)cursor.data->entry.name); 1787 if (r) 1788 break; 1789 ++saveoff; 1790 if (cookies) 1791 cookies[cookie_index] = base->key; 1792 ++cookie_index; 1793 if (cookie_index == ncookies) 1794 break; 1795 error = hammer_ip_next(&cursor); 1796 } 1797 hammer_done_cursor(&cursor); 1798 1799 done: 1800 hammer_done_transaction(&trans); 1801 1802 if (ap->a_eofflag) 1803 *ap->a_eofflag = (error == ENOENT); 1804 uio->uio_offset = saveoff; 1805 if (error && cookie_index == 0) { 1806 if (error == ENOENT) 1807 error = 0; 1808 if (cookies) { 1809 kfree(cookies, M_TEMP); 1810 *ap->a_ncookies = 0; 1811 *ap->a_cookies = NULL; 1812 } 1813 } else { 1814 if (error == ENOENT) 1815 error = 0; 1816 if (cookies) { 
1817 *ap->a_ncookies = cookie_index; 1818 *ap->a_cookies = cookies; 1819 } 1820 } 1821 lwkt_reltoken(&hmp->fs_token); 1822 return(error); 1823 } 1824 1825 /* 1826 * hammer_vop_readlink { vp, uio, cred } 1827 */ 1828 static 1829 int 1830 hammer_vop_readlink(struct vop_readlink_args *ap) 1831 { 1832 struct hammer_transaction trans; 1833 struct hammer_cursor cursor; 1834 struct hammer_inode *ip; 1835 hammer_mount_t hmp; 1836 char buf[32]; 1837 u_int32_t localization; 1838 hammer_pseudofs_inmem_t pfsm; 1839 int error; 1840 1841 ip = VTOI(ap->a_vp); 1842 hmp = ip->hmp; 1843 1844 lwkt_gettoken(&hmp->fs_token); 1845 1846 /* 1847 * Shortcut if the symlink data was stuffed into ino_data. 1848 * 1849 * Also expand special "@@PFS%05d" softlinks (expansion only 1850 * occurs for non-historical (current) accesses made from the 1851 * primary filesystem). 1852 */ 1853 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1854 char *ptr; 1855 int bytes; 1856 1857 ptr = ip->ino_data.ext.symlink; 1858 bytes = (int)ip->ino_data.size; 1859 if (bytes == 10 && 1860 ip->obj_asof == HAMMER_MAX_TID && 1861 ip->obj_localization == 0 && 1862 strncmp(ptr, "@@PFS", 5) == 0) { 1863 hammer_simple_transaction(&trans, hmp); 1864 bcopy(ptr + 5, buf, 5); 1865 buf[5] = 0; 1866 localization = strtoul(buf, NULL, 10) << 16; 1867 pfsm = hammer_load_pseudofs(&trans, localization, 1868 &error); 1869 if (error == 0) { 1870 if (pfsm->pfsd.mirror_flags & 1871 HAMMER_PFSD_SLAVE) { 1872 /* vap->va_size == 26 */ 1873 ksnprintf(buf, sizeof(buf), 1874 "@@0x%016llx:%05d", 1875 (long long)pfsm->pfsd.sync_end_tid, 1876 localization >> 16); 1877 } else { 1878 /* vap->va_size == 10 */ 1879 ksnprintf(buf, sizeof(buf), 1880 "@@-1:%05d", 1881 localization >> 16); 1882 #if 0 1883 ksnprintf(buf, sizeof(buf), 1884 "@@0x%016llx:%05d", 1885 (long long)HAMMER_MAX_TID, 1886 localization >> 16); 1887 #endif 1888 } 1889 ptr = buf; 1890 bytes = strlen(buf); 1891 } 1892 if (pfsm) 1893 hammer_rel_pseudofs(hmp, pfsm); 1894 
hammer_done_transaction(&trans); 1895 } 1896 error = uiomove(ptr, bytes, ap->a_uio); 1897 lwkt_reltoken(&hmp->fs_token); 1898 return(error); 1899 } 1900 1901 /* 1902 * Long version 1903 */ 1904 hammer_simple_transaction(&trans, hmp); 1905 ++hammer_stats_file_iopsr; 1906 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1907 1908 /* 1909 * Key range (begin and end inclusive) to scan. Directory keys 1910 * directly translate to a 64 bit 'seek' position. 1911 */ 1912 cursor.key_beg.localization = ip->obj_localization + 1913 HAMMER_LOCALIZE_MISC; 1914 cursor.key_beg.obj_id = ip->obj_id; 1915 cursor.key_beg.create_tid = 0; 1916 cursor.key_beg.delete_tid = 0; 1917 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1918 cursor.key_beg.obj_type = 0; 1919 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1920 cursor.asof = ip->obj_asof; 1921 cursor.flags |= HAMMER_CURSOR_ASOF; 1922 1923 error = hammer_ip_lookup(&cursor); 1924 if (error == 0) { 1925 error = hammer_ip_resolve_data(&cursor); 1926 if (error == 0) { 1927 KKASSERT(cursor.leaf->data_len >= 1928 HAMMER_SYMLINK_NAME_OFF); 1929 error = uiomove(cursor.data->symlink.name, 1930 cursor.leaf->data_len - 1931 HAMMER_SYMLINK_NAME_OFF, 1932 ap->a_uio); 1933 } 1934 } 1935 hammer_done_cursor(&cursor); 1936 hammer_done_transaction(&trans); 1937 lwkt_reltoken(&hmp->fs_token); 1938 return(error); 1939 } 1940 1941 /* 1942 * hammer_vop_nremove { nch, dvp, cred } 1943 */ 1944 static 1945 int 1946 hammer_vop_nremove(struct vop_nremove_args *ap) 1947 { 1948 struct hammer_transaction trans; 1949 struct hammer_inode *dip; 1950 hammer_mount_t hmp; 1951 int error; 1952 1953 dip = VTOI(ap->a_dvp); 1954 hmp = dip->hmp; 1955 1956 if (hammer_nohistory(dip) == 0 && 1957 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1958 return (error); 1959 } 1960 1961 lwkt_gettoken(&hmp->fs_token); 1962 hammer_start_transaction(&trans, hmp); 1963 ++hammer_stats_file_iopsw; 1964 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 
0); 1965 hammer_done_transaction(&trans); 1966 if (error == 0) 1967 hammer_knote(ap->a_dvp, NOTE_WRITE); 1968 lwkt_reltoken(&hmp->fs_token); 1969 return (error); 1970 } 1971 1972 /* 1973 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1974 */ 1975 static 1976 int 1977 hammer_vop_nrename(struct vop_nrename_args *ap) 1978 { 1979 struct hammer_transaction trans; 1980 struct namecache *fncp; 1981 struct namecache *tncp; 1982 struct hammer_inode *fdip; 1983 struct hammer_inode *tdip; 1984 struct hammer_inode *ip; 1985 hammer_mount_t hmp; 1986 struct hammer_cursor cursor; 1987 int64_t namekey; 1988 u_int32_t max_iterations; 1989 int nlen, error; 1990 1991 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1992 return(EXDEV); 1993 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1994 return(EXDEV); 1995 1996 fdip = VTOI(ap->a_fdvp); 1997 tdip = VTOI(ap->a_tdvp); 1998 fncp = ap->a_fnch->ncp; 1999 tncp = ap->a_tnch->ncp; 2000 ip = VTOI(fncp->nc_vp); 2001 KKASSERT(ip != NULL); 2002 2003 hmp = ip->hmp; 2004 2005 if (fdip->obj_localization != tdip->obj_localization) 2006 return(EXDEV); 2007 if (fdip->obj_localization != ip->obj_localization) 2008 return(EXDEV); 2009 2010 if (fdip->flags & HAMMER_INODE_RO) 2011 return (EROFS); 2012 if (tdip->flags & HAMMER_INODE_RO) 2013 return (EROFS); 2014 if (ip->flags & HAMMER_INODE_RO) 2015 return (EROFS); 2016 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2017 return (error); 2018 2019 lwkt_gettoken(&hmp->fs_token); 2020 hammer_start_transaction(&trans, hmp); 2021 ++hammer_stats_file_iopsw; 2022 2023 /* 2024 * Remove tncp from the target directory and then link ip as 2025 * tncp. XXX pass trans to dounlink 2026 * 2027 * Force the inode sync-time to match the transaction so it is 2028 * in-sync with the creation of the target directory entry. 
2029 */ 2030 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 2031 ap->a_cred, 0, -1); 2032 if (error == 0 || error == ENOENT) { 2033 error = hammer_ip_add_directory(&trans, tdip, 2034 tncp->nc_name, tncp->nc_nlen, 2035 ip); 2036 if (error == 0) { 2037 ip->ino_data.parent_obj_id = tdip->obj_id; 2038 ip->ino_data.ctime = trans.time; 2039 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 2040 } 2041 } 2042 if (error) 2043 goto failed; /* XXX */ 2044 2045 /* 2046 * Locate the record in the originating directory and remove it. 2047 * 2048 * Calculate the namekey and setup the key range for the scan. This 2049 * works kinda like a chained hash table where the lower 32 bits 2050 * of the namekey synthesize the chain. 2051 * 2052 * The key range is inclusive of both key_beg and key_end. 2053 */ 2054 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 2055 &max_iterations); 2056 retry: 2057 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 2058 cursor.key_beg.localization = fdip->obj_localization + 2059 hammer_dir_localization(fdip); 2060 cursor.key_beg.obj_id = fdip->obj_id; 2061 cursor.key_beg.key = namekey; 2062 cursor.key_beg.create_tid = 0; 2063 cursor.key_beg.delete_tid = 0; 2064 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2065 cursor.key_beg.obj_type = 0; 2066 2067 cursor.key_end = cursor.key_beg; 2068 cursor.key_end.key += max_iterations; 2069 cursor.asof = fdip->obj_asof; 2070 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2071 2072 /* 2073 * Scan all matching records (the chain), locate the one matching 2074 * the requested path component. 2075 * 2076 * The hammer_ip_*() functions merge in-memory records with on-disk 2077 * records for the purposes of the search. 
2078 */ 2079 error = hammer_ip_first(&cursor); 2080 while (error == 0) { 2081 if (hammer_ip_resolve_data(&cursor) != 0) 2082 break; 2083 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2084 KKASSERT(nlen > 0); 2085 if (fncp->nc_nlen == nlen && 2086 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2087 break; 2088 } 2089 error = hammer_ip_next(&cursor); 2090 } 2091 2092 /* 2093 * If all is ok we have to get the inode so we can adjust nlinks. 2094 * 2095 * WARNING: hammer_ip_del_directory() may have to terminate the 2096 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2097 * twice. 2098 */ 2099 if (error == 0) 2100 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 2101 2102 /* 2103 * XXX A deadlock here will break rename's atomicy for the purposes 2104 * of crash recovery. 2105 */ 2106 if (error == EDEADLK) { 2107 hammer_done_cursor(&cursor); 2108 goto retry; 2109 } 2110 2111 /* 2112 * Cleanup and tell the kernel that the rename succeeded. 2113 * 2114 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2115 * without formally acquiring the vp since the vp might 2116 * have zero refs on it, or in the middle of a reclaim, 2117 * etc. 
*/
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		while (ip->vp) {
			struct vnode *vp;

			error = hammer_get_vnode(ip, &vp);
			if (error == 0 && vp) {
				vn_unlock(vp);
				/*
				 * NOTE(review): the knote is posted on
				 * ip->vp rather than on the vp that was
				 * just acquired -- presumably the same
				 * vnode, confirm.
				 */
				hammer_knote(ip->vp, NOTE_RENAME);
				vrele(vp);
				break;
			}
			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
		}
	}

failed:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 *
 * Remove the directory entry named by ap->a_nch from the directory
 * ap->a_dvp by calling hammer_dounlink() with flags 0 and isdir 1.
 *
 * Unless hammer_nohistory(dip) is non-zero, hammer_checkspace() is
 * consulted first and its error is returned before any token or
 * transaction is taken.
 *
 * On success NOTE_WRITE | NOTE_LINK is posted on the directory vnode.
 * Returns 0 or the error from hammer_checkspace()/hammer_dounlink().
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_markatime { vp, cred }
 *
 * Set the inode's atime to the time of a freshly started transaction
 * and flag the inode with HAMMER_INODE_ATIME.
 *
 * Returns EROFS if the mount has MNT_RDONLY set or the inode has
 * HAMMER_INODE_RO set.  Returns 0 without touching the inode when the
 * mount has MNT_NOATIME set.  Otherwise returns 0 after posting
 * NOTE_ATTRIB on the vnode.
 */
static
int
hammer_vop_markatime(struct vop_markatime_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;

	ip = VTOI(ap->a_vp);
	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	hmp = ip->hmp;
	if (hmp->mp->mnt_flag & MNT_NOATIME)
		return (0);
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	ip->ino_data.atime = trans.time;
	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_ATTRIB);
	lwkt_reltoken(&hmp->fs_token);
	return (0);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 *
 * Apply the attributes in ap->a_vap that are not VNOVAL.  They are
 * examined in this order: flags, uid/gid, size, atime, mtime, mode.
 *
 * - A va_flags request is handled exclusively: every path through that
 *   branch jumps to done, so no other attribute is applied in the same
 *   call.
 * - If the inode's uflags contain IMMUTABLE or APPEND, any non-flags
 *   request fails with EPERM.
 * - A size change is only supported for VREG and VDATABASE vnodes, any
 *   other vnode type yields EINVAL.
 *
 * modflags accumulates the HAMMER_INODE_* bits handed to
 * hammer_modify_inode() (only called when error is 0).  kflags
 * accumulates the NOTE_* bits handed to hammer_knote(), which is called
 * whether or not an error occurred.
 *
 * Returns EROFS for a read-only mount or inode, the hammer_checkspace()
 * error when history is retained and space is short, or the error of
 * the last helper that ran.
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct vattr *vap;
	hammer_mount_t hmp;
	int modflags;
	int error;
	int truncating;
	int blksize;
	int kflags;
#if 0
	int64_t aligned_size;
#endif
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;
	hmp = ip->hmp;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			/*
			 * This test is redundant as written: both arms
			 * reach done with error == 0.
			 */
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			/* posted even when nothing actually changed */
			kflags |= NOTE_ATTRIB;
		}
	}
	/*
	 * The while() is used as an if(): the trailing break after the
	 * switch guarantees at most one pass.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			/* cannot be true here, the loop condition excludes it */
			if (vap->va_size == ip->ino_data.size)
				break;

			/*
			 * Log the operation if in fast-fsync mode or if
			 * there are unterminated redo write records present.
			 *
			 * The second check is needed so the recovery code
			 * properly truncates write redos even if nominal
			 * REDO operations are turned off due to excessive
			 * writes, because the related records might be
			 * destroyed and never lay down a TERM_WRITE.
			 *
			 * NOTE(review): the error returned here is not
			 * checked before the truncation proceeds.
			 */
			if ((ip->flags & HAMMER_INODE_REDO) ||
			    (ip->flags & HAMMER_INODE_RDIRTY)) {
				error = hammer_generate_redo(&trans, ip,
							     vap->va_size,
							     HAMMER_REDO_TRUNC,
							     NULL, 0);
			}
			blksize = hammer_blocksize(vap->va_size);

			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			if (vap->va_size < ip->ino_data.size) {
				nvtruncbuf(ap->a_vp, vap->va_size,
					   blksize,
					   hammer_blockoff(vap->va_size),
					   0);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				nvextendbuf(ap->a_vp,
					    ip->ino_data.size,
					    vap->va_size,
					    hammer_blocksize(ip->ino_data.size),
					    hammer_blocksize(vap->va_size),
					    hammer_blockoff(ip->ino_data.size),
					    hammer_blockoff(vap->va_size),
					    0);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			/* XXX safe to use SDIRTY instead of DDIRTY here? */
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;

			/*
			 * On-media truncation is cached in the inode until
			 * the inode is synchronized.  We must immediately
			 * handle any frontend records.
			 *
			 * trunc_off only ever moves downward while
			 * HAMMER_INODE_TRUNCATED is set.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate1 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate2 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate3 %016llx (ignored)\n",
						(long long)vap->va_size);
#endif
				}
			}

#if 0
			/*
			 * When truncating, nvtruncbuf() may have cleaned out
			 * a portion of the last block on-disk in the buffer
			 * cache.  We must clean out any frontend records
			 * for blocks beyond the new last block.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				aligned_size -= blksize;
				hammer_ip_frontend_trunc(ip, aligned_size);
			}
#endif
			break;
		case VDATABASE:
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			ip->ino_data.ctime = trans.time;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(&trans, ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 */
static
2458 int 2459 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2460 { 2461 struct hammer_transaction trans; 2462 struct hammer_inode *dip; 2463 struct hammer_inode *nip; 2464 hammer_record_t record; 2465 struct nchandle *nch; 2466 hammer_mount_t hmp; 2467 int error; 2468 int bytes; 2469 2470 ap->a_vap->va_type = VLNK; 2471 2472 nch = ap->a_nch; 2473 dip = VTOI(ap->a_dvp); 2474 hmp = dip->hmp; 2475 2476 if (dip->flags & HAMMER_INODE_RO) 2477 return (EROFS); 2478 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2479 return (error); 2480 2481 /* 2482 * Create a transaction to cover the operations we perform. 2483 */ 2484 lwkt_gettoken(&hmp->fs_token); 2485 hammer_start_transaction(&trans, hmp); 2486 ++hammer_stats_file_iopsw; 2487 2488 /* 2489 * Create a new filesystem object of the requested type. The 2490 * returned inode will be referenced but not locked. 2491 */ 2492 2493 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2494 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2495 NULL, &nip); 2496 if (error) { 2497 hammer_done_transaction(&trans); 2498 *ap->a_vpp = NULL; 2499 lwkt_reltoken(&hmp->fs_token); 2500 return (error); 2501 } 2502 2503 /* 2504 * Add a record representing the symlink. symlink stores the link 2505 * as pure data, not a string, and is no \0 terminated. 
2506 */ 2507 if (error == 0) { 2508 bytes = strlen(ap->a_target); 2509 2510 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2511 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2512 } else { 2513 record = hammer_alloc_mem_record(nip, bytes); 2514 record->type = HAMMER_MEM_RECORD_GENERAL; 2515 2516 record->leaf.base.localization = nip->obj_localization + 2517 HAMMER_LOCALIZE_MISC; 2518 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2519 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2520 record->leaf.data_len = bytes; 2521 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2522 bcopy(ap->a_target, record->data->symlink.name, bytes); 2523 error = hammer_ip_add_record(&trans, record); 2524 } 2525 2526 /* 2527 * Set the file size to the length of the link. 2528 */ 2529 if (error == 0) { 2530 nip->ino_data.size = bytes; 2531 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2532 } 2533 } 2534 if (error == 0) 2535 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2536 nch->ncp->nc_nlen, nip); 2537 2538 /* 2539 * Finish up. 
2540 */ 2541 if (error) { 2542 hammer_rel_inode(nip, 0); 2543 *ap->a_vpp = NULL; 2544 } else { 2545 error = hammer_get_vnode(nip, ap->a_vpp); 2546 hammer_rel_inode(nip, 0); 2547 if (error == 0) { 2548 cache_setunresolved(ap->a_nch); 2549 cache_setvp(ap->a_nch, *ap->a_vpp); 2550 hammer_knote(ap->a_dvp, NOTE_WRITE); 2551 } 2552 } 2553 hammer_done_transaction(&trans); 2554 lwkt_reltoken(&hmp->fs_token); 2555 return (error); 2556 } 2557 2558 /* 2559 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2560 */ 2561 static 2562 int 2563 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2564 { 2565 struct hammer_transaction trans; 2566 struct hammer_inode *dip; 2567 hammer_mount_t hmp; 2568 int error; 2569 2570 dip = VTOI(ap->a_dvp); 2571 hmp = dip->hmp; 2572 2573 if (hammer_nohistory(dip) == 0 && 2574 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2575 return (error); 2576 } 2577 2578 lwkt_gettoken(&hmp->fs_token); 2579 hammer_start_transaction(&trans, hmp); 2580 ++hammer_stats_file_iopsw; 2581 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2582 ap->a_cred, ap->a_flags, -1); 2583 hammer_done_transaction(&trans); 2584 lwkt_reltoken(&hmp->fs_token); 2585 2586 return (error); 2587 } 2588 2589 /* 2590 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2591 */ 2592 static 2593 int 2594 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2595 { 2596 struct hammer_inode *ip = ap->a_vp->v_data; 2597 hammer_mount_t hmp = ip->hmp; 2598 int error; 2599 2600 ++hammer_stats_file_iopsr; 2601 lwkt_gettoken(&hmp->fs_token); 2602 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2603 ap->a_fflag, ap->a_cred); 2604 lwkt_reltoken(&hmp->fs_token); 2605 return (error); 2606 } 2607 2608 static 2609 int 2610 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2611 { 2612 static const struct mountctl_opt extraopt[] = { 2613 { HMNT_NOHISTORY, "nohistory" }, 2614 { HMNT_MASTERID, "master" }, 2615 { 0, NULL} 2616 2617 }; 2618 struct hammer_mount *hmp; 2619 struct mount *mp; 2620 
int usedbytes; 2621 int error; 2622 2623 error = 0; 2624 usedbytes = 0; 2625 mp = ap->a_head.a_ops->head.vv_mount; 2626 KKASSERT(mp->mnt_data != NULL); 2627 hmp = (struct hammer_mount *)mp->mnt_data; 2628 2629 lwkt_gettoken(&hmp->fs_token); 2630 2631 switch(ap->a_op) { 2632 case MOUNTCTL_SET_EXPORT: 2633 if (ap->a_ctllen != sizeof(struct export_args)) 2634 error = EINVAL; 2635 else 2636 error = hammer_vfs_export(mp, ap->a_op, 2637 (const struct export_args *)ap->a_ctl); 2638 break; 2639 case MOUNTCTL_MOUNTFLAGS: 2640 { 2641 /* 2642 * Call standard mountctl VOP function 2643 * so we get user mount flags. 2644 */ 2645 error = vop_stdmountctl(ap); 2646 if (error) 2647 break; 2648 2649 usedbytes = *ap->a_res; 2650 2651 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2652 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2653 ap->a_buf, 2654 ap->a_buflen - usedbytes, 2655 &error); 2656 } 2657 2658 *ap->a_res += usedbytes; 2659 break; 2660 } 2661 default: 2662 error = vop_stdmountctl(ap); 2663 break; 2664 } 2665 lwkt_reltoken(&hmp->fs_token); 2666 return(error); 2667 } 2668 2669 /* 2670 * hammer_vop_strategy { vp, bio } 2671 * 2672 * Strategy call, used for regular file read & write only. Note that the 2673 * bp may represent a cluster. 2674 * 2675 * To simplify operation and allow better optimizations in the future, 2676 * this code does not make any assumptions with regards to buffer alignment 2677 * or size. 
*/
/*
 * Dispatches on bp->b_cmd: BUF_CMD_READ goes to
 * hammer_vop_strategy_read() and BUF_CMD_WRITE goes to
 * hammer_vop_strategy_write().  Any other command fails the bio with
 * EINVAL (b_error set, B_ERROR flagged, biodone() called).
 *
 * Returns the error produced by the selected path.
 */
static
int
hammer_vop_strategy(struct vop_strategy_args *ap)
{
	struct buf *bp;
	int error;

	bp = ap->a_bio->bio_buf;

	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer_vop_strategy_read(ap);
		break;
	case BUF_CMD_WRITE:
		error = hammer_vop_strategy_write(ap);
		break;
	default:
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		break;
	}

	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */

	return (error);
}

/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * Three ways out of this function exist:
 *
 *  - the early shortcuts, taken when the pushed bio already carries a
 *    zone-2 (large-data) offset, return straight from
 *    hammer_io_direct_read() / hammer_io_indirect_read();
 *  - a record that covers the whole buffer and is suitably aligned is
 *    handed to the direct/indirect read helpers and control jumps to
 *    done, skipping the local biodone();
 *  - otherwise data is copied record by record into bp->b_data and the
 *    bio is completed here with biodone().
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;
	int roff;
	int n;
	int isdedupable;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 *
	 * In cases where a lot of data duplication is present it may be
	 * more beneficial to drop through and double-buffer through the
	 * device.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		if (hammer_double_buffer == 0) {
			lwkt_gettoken(&hmp->fs_token);
			error = hammer_io_direct_read(hmp, nbio, NULL);
			lwkt_reltoken(&hmp->fs_token);
			return (error);
		}

		/*
		 * Try to shortcut requests for double_buffer mode too.
		 * Since this mode runs through the device buffer cache
		 * only compatible buffer sizes (meaning those generated
		 * by normal filesystem buffers) are legal.
		 *
		 * NOTE(review): unlike the direct-read shortcut above,
		 * this call is made without taking fs_token -- confirm
		 * that is intended.
		 */
		if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
			error = hammer_io_indirect_read(hmp, nbio, NULL);
			return (error);
		}
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		/* clamp the end key if the addition wrapped */
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	/*
	 * Set NOSWAPCACHE for cursor data extraction if double buffering
	 * is disabled or (if the file is not marked cacheable via chflags
	 * and vm.swapcache_use_chflags is enabled).
	 */
	if (hammer_double_buffer == 0 ||
	    ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
	     vm_swapcache_use_chflags)) {
		cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
	}

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record versus our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)/* ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zeroed past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 *
		 * NOTE: disk_offset is only valid if the cursor data is
		 *	 on-disk.
		 */
		disk_offset = cursor.leaf->data_offset + roff;
		isdedupable = (boff == 0 && n == bp->b_bufsize &&
			       hammer_cursor_ondisk(&cursor) &&
			       ((int)disk_offset & HAMMER_BUFMASK) == 0);

		if (isdedupable && hammer_double_buffer == 0) {
			/*
			 * Direct read case
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
			if (hammer_live_dedup && error == 0)
				hammer_dedup_cache_add(ip, cursor.leaf);
			goto done;
		} else if (isdedupable) {
			/*
			 * Async I/O case for reading from backing store
			 * and copying the data to the filesystem buffer.
			 * live-dedup has to verify the data anyway if it
			 * gets a hit later so we can just add the entry
			 * now.
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
			error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
			goto done;
		} else if (n) {
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				/*
				 * isdedupable is always 0 in this arm (the
				 * two arms above consumed the non-zero
				 * case), so this call never happens.
				 */
				if (hammer_live_dedup && isdedupable)
					hammer_dedup_cache_add(ip, cursor.leaf);
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * We have to be sure that the only elements added to the
		 * dedup cache are those which are already on-media.
		 */
		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
			hammer_dedup_cache_add(ip, cursor.leaf);

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Cleanup
	 */
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	/*
	 * Cache the b-tree node for the last data read in cache[1].
	 *
	 * If we hit the file EOF then also cache the node in the
	 * governing directory's cache[3], it will be used to initialize
	 * the inode's cache[1] for any inodes looked up via the directory.
	 *
	 * This doesn't reduce disk accesses since the B-Tree chain is
	 * likely cached, but it does reduce cpu overhead when looking
	 * up file offsets for cpdup/tar/cpio style iterations.
	 */
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	if (ran_end >= ip->ino_data.size) {
		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
					ip->obj_asof, ip->obj_localization);
		if (dip) {
			/*
			 * NOTE(review): cursor.node is not re-checked for
			 * NULL here as it is above -- presumably
			 * hammer_cache_node() tolerates NULL, confirm.
			 */
			hammer_cache_node(&dip->cache[3], cursor.node);
			hammer_rel_inode(dip, 0);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * BMAP operation - used to support cluster_read() only.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 *
 * This routine may return EOPNOTSUPP if the operation is not supported for
 * the specified offset.  The contents of the pointer arguments do not
 * need to be initialized in that case.
 *
 * If a disk address is available and properly aligned return 0 with
 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
 * to the run-length relative to that offset.  Callers may assume that
 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
 * large, so return EOPNOTSUPP if it is not sufficiently large.
*/
/*
 * Implementation outline: scan the data records around ap->a_loffset,
 * tracking the contiguous run [base_offset, last_offset) in file space
 * and the matching [base_disk_offset, last_disk_offset) in disk space.
 * Only on-disk records contribute to the run.  The cursor, transaction
 * and fs token are released before the result is evaluated.
 */
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int64_t base_offset;
	int64_t base_disk_offset;
	int64_t last_offset;
	hammer_off_t last_disk_offset;
	hammer_off_t disk_offset;
	int rec_len;
	int error;
	int blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n",
		(long long)ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 *
	 * When a backward run is requested the scan starts up to MAXPHYS
	 * bytes before a_loffset, clamped at 0.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	/* clamp the end key if the addition wrapped */
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 *
		 * (The error test below cannot fire: error is 0 on loop
		 * entry and is not modified before this point.)
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;

			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
		}

		error = hammer_ip_next(&cursor);
	}

#if 0
	kprintf("BMAP %016llx: %016llx - %016llx\n",
		(long long)ap->a_loffset,
		(long long)base_offset,
		(long long)last_offset);
	kprintf("BMAP %16s: %016llx - %016llx\n", "",
		(long long)base_disk_offset,
		(long long)last_disk_offset);
#endif

	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n",
			(long long)ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}

/*
 * Write to a regular file.  Because this is a strategy call the OS is
 * trying to actually get data onto the media.
3291 */ 3292 static 3293 int 3294 hammer_vop_strategy_write(struct vop_strategy_args *ap) 3295 { 3296 hammer_record_t record; 3297 hammer_mount_t hmp; 3298 hammer_inode_t ip; 3299 struct bio *bio; 3300 struct buf *bp; 3301 int blksize; 3302 int bytes; 3303 int error; 3304 3305 bio = ap->a_bio; 3306 bp = bio->bio_buf; 3307 ip = ap->a_vp->v_data; 3308 hmp = ip->hmp; 3309 3310 blksize = hammer_blocksize(bio->bio_offset); 3311 KKASSERT(bp->b_bufsize == blksize); 3312 3313 if (ip->flags & HAMMER_INODE_RO) { 3314 bp->b_error = EROFS; 3315 bp->b_flags |= B_ERROR; 3316 biodone(ap->a_bio); 3317 return(EROFS); 3318 } 3319 3320 lwkt_gettoken(&hmp->fs_token); 3321 3322 /* 3323 * Disallow swapcache operation on the vnode buffer if double 3324 * buffering is enabled, the swapcache will get the data via 3325 * the block device buffer. 3326 */ 3327 if (hammer_double_buffer) 3328 bp->b_flags |= B_NOTMETA; 3329 3330 /* 3331 * Interlock with inode destruction (no in-kernel or directory 3332 * topology visibility). If we queue new IO while trying to 3333 * destroy the inode we can deadlock the vtrunc call in 3334 * hammer_inode_unloadable_check(). 3335 * 3336 * Besides, there's no point flushing a bp associated with an 3337 * inode that is being destroyed on-media and has no kernel 3338 * references. 3339 */ 3340 if ((ip->flags | ip->sync_flags) & 3341 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 3342 bp->b_resid = 0; 3343 biodone(ap->a_bio); 3344 lwkt_reltoken(&hmp->fs_token); 3345 return(0); 3346 } 3347 3348 /* 3349 * Reserve space and issue a direct-write from the front-end. 3350 * NOTE: The direct_io code will hammer_bread/bcopy smaller 3351 * allocations. 3352 * 3353 * An in-memory record will be installed to reference the storage 3354 * until the flusher can get to it. 3355 * 3356 * Since we own the high level bio the front-end will not try to 3357 * do a direct-read until the write completes. 
3358 * 3359 * NOTE: The only time we do not reserve a full-sized buffers 3360 * worth of data is if the file is small. We do not try to 3361 * allocate a fragment (from the small-data zone) at the end of 3362 * an otherwise large file as this can lead to wildly separated 3363 * data. 3364 */ 3365 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 3366 KKASSERT(bio->bio_offset < ip->ino_data.size); 3367 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2) 3368 bytes = bp->b_bufsize; 3369 else 3370 bytes = ((int)ip->ino_data.size + 15) & ~15; 3371 3372 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 3373 bytes, &error); 3374 3375 /* 3376 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated 3377 * in hammer_vop_write(). We must flag the record so the proper 3378 * REDO_TERM_WRITE entry is generated during the flush. 3379 */ 3380 if (record) { 3381 if (bp->b_flags & B_VFSFLAG1) { 3382 record->flags |= HAMMER_RECF_REDO; 3383 bp->b_flags &= ~B_VFSFLAG1; 3384 } 3385 if (record->flags & HAMMER_RECF_DEDUPED) { 3386 bp->b_resid = 0; 3387 hammer_ip_replace_bulk(hmp, record); 3388 biodone(ap->a_bio); 3389 } else { 3390 hammer_io_direct_write(hmp, bio, record); 3391 } 3392 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 3393 hammer_flush_inode(ip, 0); 3394 } else { 3395 bp->b_bio2.bio_offset = NOOFFSET; 3396 bp->b_error = error; 3397 bp->b_flags |= B_ERROR; 3398 biodone(ap->a_bio); 3399 } 3400 lwkt_reltoken(&hmp->fs_token); 3401 return(error); 3402 } 3403 3404 /* 3405 * dounlink - disconnect a directory entry 3406 * 3407 * XXX whiteout support not really in yet 3408 */ 3409 static int 3410 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 3411 struct vnode *dvp, struct ucred *cred, 3412 int flags, int isdir) 3413 { 3414 struct namecache *ncp; 3415 hammer_inode_t dip; 3416 hammer_inode_t ip; 3417 hammer_mount_t hmp; 3418 struct hammer_cursor cursor; 3419 int64_t namekey; 3420 u_int32_t max_iterations; 3421 int nlen, 
error; 3422 3423 /* 3424 * Calculate the namekey and setup the key range for the scan. This 3425 * works kinda like a chained hash table where the lower 32 bits 3426 * of the namekey synthesize the chain. 3427 * 3428 * The key range is inclusive of both key_beg and key_end. 3429 */ 3430 dip = VTOI(dvp); 3431 ncp = nch->ncp; 3432 hmp = dip->hmp; 3433 3434 if (dip->flags & HAMMER_INODE_RO) 3435 return (EROFS); 3436 3437 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3438 &max_iterations); 3439 retry: 3440 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3441 cursor.key_beg.localization = dip->obj_localization + 3442 hammer_dir_localization(dip); 3443 cursor.key_beg.obj_id = dip->obj_id; 3444 cursor.key_beg.key = namekey; 3445 cursor.key_beg.create_tid = 0; 3446 cursor.key_beg.delete_tid = 0; 3447 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3448 cursor.key_beg.obj_type = 0; 3449 3450 cursor.key_end = cursor.key_beg; 3451 cursor.key_end.key += max_iterations; 3452 cursor.asof = dip->obj_asof; 3453 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3454 3455 /* 3456 * Scan all matching records (the chain), locate the one matching 3457 * the requested path component. info->last_error contains the 3458 * error code on search termination and could be 0, ENOENT, or 3459 * something else. 3460 * 3461 * The hammer_ip_*() functions merge in-memory records with on-disk 3462 * records for the purposes of the search. 3463 */ 3464 error = hammer_ip_first(&cursor); 3465 3466 while (error == 0) { 3467 error = hammer_ip_resolve_data(&cursor); 3468 if (error) 3469 break; 3470 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3471 KKASSERT(nlen > 0); 3472 if (ncp->nc_nlen == nlen && 3473 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3474 break; 3475 } 3476 error = hammer_ip_next(&cursor); 3477 } 3478 3479 /* 3480 * If all is ok we have to get the inode so we can adjust nlinks. 
3481 * To avoid a deadlock with the flusher we must release the inode 3482 * lock on the directory when acquiring the inode for the entry. 3483 * 3484 * If the target is a directory, it must be empty. 3485 */ 3486 if (error == 0) { 3487 hammer_unlock(&cursor.ip->lock); 3488 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3489 hmp->asof, 3490 cursor.data->entry.localization, 3491 0, &error); 3492 hammer_lock_sh(&cursor.ip->lock); 3493 if (error == ENOENT) { 3494 kprintf("HAMMER: WARNING: Removing " 3495 "dirent w/missing inode \"%s\"\n" 3496 "\tobj_id = %016llx\n", 3497 ncp->nc_name, 3498 (long long)cursor.data->entry.obj_id); 3499 error = 0; 3500 } 3501 3502 /* 3503 * If isdir >= 0 we validate that the entry is or is not a 3504 * directory. If isdir < 0 we don't care. 3505 */ 3506 if (error == 0 && isdir >= 0 && ip) { 3507 if (isdir && 3508 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3509 error = ENOTDIR; 3510 } else if (isdir == 0 && 3511 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3512 error = EISDIR; 3513 } 3514 } 3515 3516 /* 3517 * If we are trying to remove a directory the directory must 3518 * be empty. 3519 * 3520 * The check directory code can loop and deadlock/retry. Our 3521 * own cursor's node locks must be released to avoid a 3-way 3522 * deadlock with the flusher if the check directory code 3523 * blocks. 3524 * 3525 * If any changes whatsoever have been made to the cursor 3526 * set EDEADLK and retry. 3527 * 3528 * WARNING: See warnings in hammer_unlock_cursor() 3529 * function. 3530 */ 3531 if (error == 0 && ip && ip->ino_data.obj_type == 3532 HAMMER_OBJTYPE_DIRECTORY) { 3533 hammer_unlock_cursor(&cursor); 3534 error = hammer_ip_check_directory_empty(trans, ip); 3535 hammer_lock_cursor(&cursor); 3536 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3537 kprintf("HAMMER: Warning: avoided deadlock " 3538 "on rmdir '%s'\n", 3539 ncp->nc_name); 3540 error = EDEADLK; 3541 } 3542 } 3543 3544 /* 3545 * Delete the directory entry. 
3546 * 3547 * WARNING: hammer_ip_del_directory() may have to terminate 3548 * the cursor to avoid a deadlock. It is ok to call 3549 * hammer_done_cursor() twice. 3550 */ 3551 if (error == 0) { 3552 error = hammer_ip_del_directory(trans, &cursor, 3553 dip, ip); 3554 } 3555 hammer_done_cursor(&cursor); 3556 if (error == 0) { 3557 /* 3558 * Tell the namecache that we are now unlinked. 3559 */ 3560 cache_unlink(nch); 3561 3562 /* 3563 * NOTE: ip->vp, if non-NULL, cannot be directly 3564 * referenced without formally acquiring the 3565 * vp since the vp might have zero refs on it, 3566 * or in the middle of a reclaim, etc. 3567 * 3568 * NOTE: The cache_setunresolved() can rip the vp 3569 * out from under us since the vp may not have 3570 * any refs, in which case ip->vp will be NULL 3571 * from the outset. 3572 */ 3573 while (ip && ip->vp) { 3574 struct vnode *vp; 3575 3576 error = hammer_get_vnode(ip, &vp); 3577 if (error == 0 && vp) { 3578 vn_unlock(vp); 3579 hammer_knote(ip->vp, NOTE_DELETE); 3580 cache_inval_vp(ip->vp, CINV_DESTROY); 3581 vrele(vp); 3582 break; 3583 } 3584 kprintf("Debug: HAMMER ip/vp race1 avoided\n"); 3585 } 3586 } 3587 if (ip) 3588 hammer_rel_inode(ip, 0); 3589 } else { 3590 hammer_done_cursor(&cursor); 3591 } 3592 if (error == EDEADLK) 3593 goto retry; 3594 3595 return (error); 3596 } 3597 3598 /************************************************************************ 3599 * FIFO AND SPECFS OPS * 3600 ************************************************************************ 3601 * 3602 */ 3603 static int 3604 hammer_vop_fifoclose (struct vop_close_args *ap) 3605 { 3606 /* XXX update itimes */ 3607 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3608 } 3609 3610 static int 3611 hammer_vop_fiforead (struct vop_read_args *ap) 3612 { 3613 int error; 3614 3615 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3616 /* XXX update access time */ 3617 return (error); 3618 } 3619 3620 static int 3621 hammer_vop_fifowrite (struct vop_write_args *ap) 3622 { 3623 
int error; 3624 3625 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3626 /* XXX update access time */ 3627 return (error); 3628 } 3629 3630 static 3631 int 3632 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3633 { 3634 int error; 3635 3636 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3637 if (error) 3638 error = hammer_vop_kqfilter(ap); 3639 return(error); 3640 } 3641 3642 /************************************************************************ 3643 * KQFILTER OPS * 3644 ************************************************************************ 3645 * 3646 */ 3647 static void filt_hammerdetach(struct knote *kn); 3648 static int filt_hammerread(struct knote *kn, long hint); 3649 static int filt_hammerwrite(struct knote *kn, long hint); 3650 static int filt_hammervnode(struct knote *kn, long hint); 3651 3652 static struct filterops hammerread_filtops = 3653 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread }; 3654 static struct filterops hammerwrite_filtops = 3655 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite }; 3656 static struct filterops hammervnode_filtops = 3657 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode }; 3658 3659 static 3660 int 3661 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3662 { 3663 struct vnode *vp = ap->a_vp; 3664 struct knote *kn = ap->a_kn; 3665 3666 switch (kn->kn_filter) { 3667 case EVFILT_READ: 3668 kn->kn_fop = &hammerread_filtops; 3669 break; 3670 case EVFILT_WRITE: 3671 kn->kn_fop = &hammerwrite_filtops; 3672 break; 3673 case EVFILT_VNODE: 3674 kn->kn_fop = &hammervnode_filtops; 3675 break; 3676 default: 3677 return (EOPNOTSUPP); 3678 } 3679 3680 kn->kn_hook = (caddr_t)vp; 3681 3682 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3683 3684 return(0); 3685 } 3686 3687 static void 3688 filt_hammerdetach(struct knote *kn) 3689 { 3690 struct vnode *vp = (void *)kn->kn_hook; 3691 3692 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3693 } 3694 3695 static int 3696 
filt_hammerread(struct knote *kn, long hint) 3697 { 3698 struct vnode *vp = (void *)kn->kn_hook; 3699 hammer_inode_t ip = VTOI(vp); 3700 hammer_mount_t hmp = ip->hmp; 3701 off_t off; 3702 3703 if (hint == NOTE_REVOKE) { 3704 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3705 return(1); 3706 } 3707 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3708 off = ip->ino_data.size - kn->kn_fp->f_offset; 3709 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3710 lwkt_reltoken(&hmp->fs_token); 3711 if (kn->kn_sfflags & NOTE_OLDAPI) 3712 return(1); 3713 return (kn->kn_data != 0); 3714 } 3715 3716 static int 3717 filt_hammerwrite(struct knote *kn, long hint) 3718 { 3719 if (hint == NOTE_REVOKE) 3720 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3721 kn->kn_data = 0; 3722 return (1); 3723 } 3724 3725 static int 3726 filt_hammervnode(struct knote *kn, long hint) 3727 { 3728 if (kn->kn_sfflags & hint) 3729 kn->kn_fflags |= hint; 3730 if (hint == NOTE_REVOKE) { 3731 kn->kn_flags |= (EV_EOF | EV_NODATA); 3732 return (1); 3733 } 3734 return (kn->kn_fflags != 0); 3735 } 3736 3737