1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/fcntl.h> 41 #include <sys/namecache.h> 42 #include <sys/vnode.h> 43 #include <sys/lockf.h> 44 #include <sys/event.h> 45 #include <sys/stat.h> 46 #include <sys/dirent.h> 47 #include <sys/file.h> 48 #include <vm/vm_extern.h> 49 #include <vm/swap_pager.h> 50 #include <vfs/fifofs/fifo.h> 51 52 #include "hammer.h" 53 54 /* 55 * USERFS VNOPS 56 */ 57 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 58 static int hammer_vop_fsync(struct vop_fsync_args *); 59 static int hammer_vop_read(struct vop_read_args *); 60 static int hammer_vop_write(struct vop_write_args *); 61 static int hammer_vop_access(struct vop_access_args *); 62 static int hammer_vop_advlock(struct vop_advlock_args *); 63 static int hammer_vop_close(struct vop_close_args *); 64 static int hammer_vop_ncreate(struct vop_ncreate_args *); 65 static int hammer_vop_getattr(struct vop_getattr_args *); 66 static int hammer_vop_nresolve(struct vop_nresolve_args *); 67 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 68 static int hammer_vop_nlink(struct vop_nlink_args *); 69 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 70 static int hammer_vop_nmknod(struct vop_nmknod_args *); 71 static int hammer_vop_open(struct vop_open_args *); 72 static int hammer_vop_print(struct vop_print_args *); 73 static int hammer_vop_readdir(struct vop_readdir_args *); 74 static int hammer_vop_readlink(struct vop_readlink_args *); 75 static int hammer_vop_nremove(struct vop_nremove_args *); 76 static int hammer_vop_nrename(struct vop_nrename_args *); 77 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 78 static int hammer_vop_markatime(struct vop_markatime_args *); 79 static int hammer_vop_setattr(struct vop_setattr_args *); 80 static int hammer_vop_strategy(struct vop_strategy_args *); 81 static int hammer_vop_bmap(struct vop_bmap_args *ap); 82 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 83 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 84 static int hammer_vop_ioctl(struct vop_ioctl_args *); 85 static int hammer_vop_mountctl(struct vop_mountctl_args *); 86 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 87 88 static int hammer_vop_fifoclose (struct vop_close_args *); 89 static int hammer_vop_fiforead (struct vop_read_args *); 90 static int hammer_vop_fifowrite (struct vop_write_args *); 91 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 92 93 struct vop_ops hammer_vnode_vops = { 94 .vop_default = vop_defaultop, 95 .vop_fsync = hammer_vop_fsync, 96 .vop_getpages = vop_stdgetpages, 97 .vop_putpages = vop_stdputpages, 98 .vop_read = hammer_vop_read, 99 .vop_write = hammer_vop_write, 100 .vop_access = hammer_vop_access, 101 .vop_advlock = hammer_vop_advlock, 102 .vop_close = hammer_vop_close, 103 .vop_ncreate = hammer_vop_ncreate, 104 .vop_getattr = hammer_vop_getattr, 105 .vop_inactive = hammer_vop_inactive, 106 .vop_reclaim = hammer_vop_reclaim, 107 .vop_nresolve = hammer_vop_nresolve, 108 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 109 .vop_nlink = hammer_vop_nlink, 110 .vop_nmkdir = hammer_vop_nmkdir, 111 .vop_nmknod = hammer_vop_nmknod, 112 .vop_open = hammer_vop_open, 113 .vop_pathconf = vop_stdpathconf, 114 .vop_print = hammer_vop_print, 115 .vop_readdir = hammer_vop_readdir, 116 .vop_readlink = hammer_vop_readlink, 117 .vop_nremove = hammer_vop_nremove, 118 .vop_nrename = hammer_vop_nrename, 119 .vop_nrmdir = hammer_vop_nrmdir, 120 .vop_markatime = hammer_vop_markatime, 121 .vop_setattr = hammer_vop_setattr, 122 .vop_bmap = hammer_vop_bmap, 123 .vop_strategy = hammer_vop_strategy, 124 .vop_nsymlink = hammer_vop_nsymlink, 125 .vop_nwhiteout = hammer_vop_nwhiteout, 126 .vop_ioctl = hammer_vop_ioctl, 127 .vop_mountctl = hammer_vop_mountctl, 128 .vop_kqfilter = hammer_vop_kqfilter 129 }; 130 131 struct vop_ops hammer_spec_vops = { 132 .vop_default = vop_defaultop, 133 .vop_fsync = hammer_vop_fsync, 134 .vop_read = vop_stdnoread, 135 .vop_write = vop_stdnowrite, 136 .vop_access = hammer_vop_access, 137 .vop_close = hammer_vop_close, 138 .vop_markatime = hammer_vop_markatime, 139 .vop_getattr = hammer_vop_getattr, 140 .vop_inactive = hammer_vop_inactive, 141 .vop_reclaim = hammer_vop_reclaim, 142 .vop_setattr = hammer_vop_setattr 143 }; 144 145 struct vop_ops hammer_fifo_vops = { 146 .vop_default = fifo_vnoperate, 147 .vop_fsync = hammer_vop_fsync, 148 .vop_read = hammer_vop_fiforead, 149 .vop_write = hammer_vop_fifowrite, 150 .vop_access = hammer_vop_access, 151 .vop_close = hammer_vop_fifoclose, 152 .vop_markatime = hammer_vop_markatime, 153 .vop_getattr = hammer_vop_getattr, 154 .vop_inactive = hammer_vop_inactive, 155 .vop_reclaim = hammer_vop_reclaim, 156 .vop_setattr = hammer_vop_setattr, 157 .vop_kqfilter = hammer_vop_fifokqfilter 158 }; 159 160 static __inline 161 void 162 hammer_knote(struct vnode *vp, int flags) 163 { 164 if (flags) 165 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 166 } 167 168 #ifdef DEBUG_TRUNCATE 169 struct hammer_inode *HammerTruncIp; 170 #endif 171 172 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 173 struct vnode *dvp, struct ucred *cred, 174 int flags, int isdir); 175 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 176 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 177 178 #if 0 179 static 180 int 181 hammer_vop_vnoperate(struct vop_generic_args *) 182 { 183 return (VOCALL(&hammer_vnode_vops, ap)); 184 } 185 #endif 186 187 /* 188 * hammer_vop_fsync { vp, waitfor } 189 * 190 * fsync() an inode to disk and wait for it to be completely committed 191 * such that the information would not be undone if a crash occured after 192 * return. 193 * 194 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 195 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 196 * operation. 197 * 198 * Ultimately the combination of a REDO log and use of fast storage 199 * to front-end cluster caches will make fsync fast, but it aint 200 * here yet. And, in anycase, we need real transactional 201 * all-or-nothing features which are not restricted to a single file. 202 */ 203 static 204 int 205 hammer_vop_fsync(struct vop_fsync_args *ap) 206 { 207 hammer_inode_t ip = VTOI(ap->a_vp); 208 hammer_mount_t hmp = ip->hmp; 209 int waitfor = ap->a_waitfor; 210 int mode; 211 212 lwkt_gettoken(&hmp->fs_token); 213 214 /* 215 * Fsync rule relaxation (default is either full synchronous flush 216 * or REDO semantics with synchronous flush). 217 */ 218 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 219 switch(hammer_fsync_mode) { 220 case 0: 221 mode0: 222 /* no REDO, full synchronous flush */ 223 goto skip; 224 case 1: 225 mode1: 226 /* no REDO, full asynchronous flush */ 227 if (waitfor == MNT_WAIT) 228 waitfor = MNT_NOWAIT; 229 goto skip; 230 case 2: 231 /* REDO semantics, synchronous flush */ 232 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 233 goto mode0; 234 mode = HAMMER_FLUSH_UNDOS_AUTO; 235 break; 236 case 3: 237 /* REDO semantics, relaxed asynchronous flush */ 238 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 239 goto mode1; 240 mode = HAMMER_FLUSH_UNDOS_RELAXED; 241 if (waitfor == MNT_WAIT) 242 waitfor = MNT_NOWAIT; 243 break; 244 case 4: 245 /* ignore the fsync() system call */ 246 lwkt_reltoken(&hmp->fs_token); 247 return(0); 248 default: 249 /* we have to do something */ 250 mode = HAMMER_FLUSH_UNDOS_RELAXED; 251 if (waitfor == MNT_WAIT) 252 waitfor = MNT_NOWAIT; 253 break; 254 } 255 256 /* 257 * Fast fsync only needs to flush the UNDO/REDO fifo if 258 * HAMMER_INODE_REDO is non-zero and the only modifications 259 * made to the file are write or write-extends. 260 */ 261 if ((ip->flags & HAMMER_INODE_REDO) && 262 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0 263 ) { 264 ++hammer_count_fsyncs; 265 hammer_flusher_flush_undos(hmp, mode); 266 ip->redo_count = 0; 267 lwkt_reltoken(&hmp->fs_token); 268 return(0); 269 } 270 271 /* 272 * REDO is enabled by fsync(), the idea being we really only 273 * want to lay down REDO records when programs are using 274 * fsync() heavily. The first fsync() on the file starts 275 * the gravy train going and later fsync()s keep it hot by 276 * resetting the redo_count. 277 * 278 * We weren't running REDOs before now so we have to fall 279 * through and do a full fsync of what we have. 280 */ 281 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 282 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 283 ip->flags |= HAMMER_INODE_REDO; 284 ip->redo_count = 0; 285 } 286 } 287 skip: 288 289 /* 290 * Do a full flush sequence. 291 */ 292 ++hammer_count_fsyncs; 293 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 294 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 295 if (waitfor == MNT_WAIT) { 296 vn_unlock(ap->a_vp); 297 hammer_wait_inode(ip); 298 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); 299 } 300 lwkt_reltoken(&hmp->fs_token); 301 return (ip->error); 302 } 303 304 /* 305 * hammer_vop_read { vp, uio, ioflag, cred } 306 * 307 * MPSAFE (for the cache safe does not require fs_token) 308 */ 309 static 310 int 311 hammer_vop_read(struct vop_read_args *ap) 312 { 313 struct hammer_transaction trans; 314 hammer_inode_t ip; 315 hammer_mount_t hmp; 316 off_t offset; 317 struct buf *bp; 318 struct uio *uio; 319 int error; 320 int n; 321 int seqcount; 322 int ioseqcount; 323 int blksize; 324 int bigread; 325 int got_fstoken; 326 327 if (ap->a_vp->v_type != VREG) 328 return (EINVAL); 329 ip = VTOI(ap->a_vp); 330 hmp = ip->hmp; 331 error = 0; 332 uio = ap->a_uio; 333 334 /* 335 * Allow the UIO's size to override the sequential heuristic. 336 */ 337 blksize = hammer_blocksize(uio->uio_offset); 338 seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; 339 ioseqcount = (ap->a_ioflag >> 16); 340 if (seqcount < ioseqcount) 341 seqcount = ioseqcount; 342 343 /* 344 * If reading or writing a huge amount of data we have to break 345 * atomicy and allow the operation to be interrupted by a signal 346 * or it can DOS the machine. 347 */ 348 bigread = (uio->uio_resid > 100 * 1024 * 1024); 349 got_fstoken = 0; 350 351 /* 352 * Access the data typically in HAMMER_BUFSIZE blocks via the 353 * buffer cache, but HAMMER may use a variable block size based 354 * on the offset. 355 * 356 * XXX Temporary hack, delay the start transaction while we remain 357 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 358 * locked-shared. 359 */ 360 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 361 int64_t base_offset; 362 int64_t file_limit; 363 364 blksize = hammer_blocksize(uio->uio_offset); 365 offset = (int)uio->uio_offset & (blksize - 1); 366 base_offset = uio->uio_offset - offset; 367 368 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 369 break; 370 371 /* 372 * MPSAFE 373 */ 374 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 375 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 376 bp->b_flags &= ~B_AGE; 377 error = 0; 378 goto skip; 379 } 380 if (ap->a_ioflag & IO_NRDELAY) { 381 bqrelse(bp); 382 return (EWOULDBLOCK); 383 } 384 385 /* 386 * MPUNSAFE 387 */ 388 if (got_fstoken == 0) { 389 lwkt_gettoken(&hmp->fs_token); 390 got_fstoken = 1; 391 hammer_start_transaction(&trans, ip->hmp); 392 } 393 394 /* 395 * NOTE: A valid bp has already been acquired, but was not 396 * B_CACHE. 397 */ 398 if (hammer_cluster_enable) { 399 /* 400 * Use file_limit to prevent cluster_read() from 401 * creating buffers of the wrong block size past 402 * the demarc. 403 */ 404 file_limit = ip->ino_data.size; 405 if (base_offset < HAMMER_XDEMARC && 406 file_limit > HAMMER_XDEMARC) { 407 file_limit = HAMMER_XDEMARC; 408 } 409 error = cluster_readx(ap->a_vp, 410 file_limit, base_offset, 411 blksize, uio->uio_resid, 412 seqcount * BKVASIZE, &bp); 413 } else { 414 error = breadnx(ap->a_vp, base_offset, blksize, 415 NULL, NULL, 0, &bp); 416 } 417 if (error) { 418 brelse(bp); 419 break; 420 } 421 skip: 422 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { 423 kprintf("doff %016jx read file %016jx@%016jx\n", 424 (intmax_t)bp->b_bio2.bio_offset, 425 (intmax_t)ip->obj_id, 426 (intmax_t)bp->b_loffset); 427 } 428 bp->b_flags &= ~B_IODEBUG; 429 430 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 431 n = blksize - offset; 432 if (n > uio->uio_resid) 433 n = uio->uio_resid; 434 if (n > ip->ino_data.size - uio->uio_offset) 435 n = (int)(ip->ino_data.size - uio->uio_offset); 436 if (got_fstoken) 437 lwkt_reltoken(&hmp->fs_token); 438 439 /* 440 * Set B_AGE, data has a lower priority than meta-data. 441 * 442 * Use a hold/unlock/drop sequence to run the uiomove 443 * with the buffer unlocked, avoiding deadlocks against 444 * read()s on mmap()'d spaces. 445 */ 446 bp->b_flags |= B_AGE; 447 bqhold(bp); 448 bqrelse(bp); 449 error = uiomove((char *)bp->b_data + offset, n, uio); 450 bqdrop(bp); 451 452 if (got_fstoken) 453 lwkt_gettoken(&hmp->fs_token); 454 455 if (error) 456 break; 457 hammer_stats_file_read += n; 458 } 459 460 /* 461 * Try to update the atime with just the inode lock for maximum 462 * concurrency. If we can't shortcut it we have to get the full 463 * blown transaction. 464 */ 465 if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) { 466 lwkt_gettoken(&hmp->fs_token); 467 got_fstoken = 1; 468 hammer_start_transaction(&trans, ip->hmp); 469 } 470 471 if (got_fstoken) { 472 if ((ip->flags & HAMMER_INODE_RO) == 0 && 473 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 474 ip->ino_data.atime = trans.time; 475 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 476 } 477 hammer_done_transaction(&trans); 478 lwkt_reltoken(&hmp->fs_token); 479 } 480 return (error); 481 } 482 483 /* 484 * hammer_vop_write { vp, uio, ioflag, cred } 485 */ 486 static 487 int 488 hammer_vop_write(struct vop_write_args *ap) 489 { 490 struct hammer_transaction trans; 491 struct hammer_inode *ip; 492 hammer_mount_t hmp; 493 thread_t td; 494 struct uio *uio; 495 int offset; 496 off_t base_offset; 497 struct buf *bp; 498 int kflags; 499 int error; 500 int n; 501 int flags; 502 int seqcount; 503 int bigwrite; 504 505 if (ap->a_vp->v_type != VREG) 506 return (EINVAL); 507 ip = VTOI(ap->a_vp); 508 hmp = ip->hmp; 509 error = 0; 510 kflags = 0; 511 seqcount = ap->a_ioflag >> 16; 512 513 if (ip->flags & HAMMER_INODE_RO) 514 return (EROFS); 515 516 /* 517 * Create a transaction to cover the operations we perform. 518 */ 519 lwkt_gettoken(&hmp->fs_token); 520 hammer_start_transaction(&trans, hmp); 521 uio = ap->a_uio; 522 523 /* 524 * Check append mode 525 */ 526 if (ap->a_ioflag & IO_APPEND) 527 uio->uio_offset = ip->ino_data.size; 528 529 /* 530 * Check for illegal write offsets. Valid range is 0...2^63-1. 531 * 532 * NOTE: the base_off assignment is required to work around what 533 * I consider to be a GCC-4 optimization bug. 534 */ 535 if (uio->uio_offset < 0) { 536 hammer_done_transaction(&trans); 537 lwkt_reltoken(&hmp->fs_token); 538 return (EFBIG); 539 } 540 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 541 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 542 hammer_done_transaction(&trans); 543 lwkt_reltoken(&hmp->fs_token); 544 return (EFBIG); 545 } 546 547 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 548 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 549 hammer_done_transaction(&trans); 550 lwkt_reltoken(&hmp->fs_token); 551 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 552 return (EFBIG); 553 } 554 555 /* 556 * If reading or writing a huge amount of data we have to break 557 * atomicy and allow the operation to be interrupted by a signal 558 * or it can DOS the machine. 559 * 560 * Preset redo_count so we stop generating REDOs earlier if the 561 * limit is exceeded. 562 */ 563 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 564 if ((ip->flags & HAMMER_INODE_REDO) && 565 ip->redo_count < hammer_limit_redo) { 566 ip->redo_count += uio->uio_resid; 567 } 568 569 /* 570 * Access the data typically in HAMMER_BUFSIZE blocks via the 571 * buffer cache, but HAMMER may use a variable block size based 572 * on the offset. 573 */ 574 while (uio->uio_resid > 0) { 575 int fixsize = 0; 576 int blksize; 577 int blkmask; 578 int trivial; 579 int endofblk; 580 off_t nsize; 581 582 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 583 break; 584 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 585 break; 586 587 blksize = hammer_blocksize(uio->uio_offset); 588 589 /* 590 * Do not allow HAMMER to blow out the buffer cache. Very 591 * large UIOs can lockout other processes due to bwillwrite() 592 * mechanics. 593 * 594 * The hammer inode is not locked during these operations. 595 * The vnode is locked which can interfere with the pageout 596 * daemon for non-UIO_NOCOPY writes but should not interfere 597 * with the buffer cache. Even so, we cannot afford to 598 * allow the pageout daemon to build up too many dirty buffer 599 * cache buffers. 600 * 601 * Only call this if we aren't being recursively called from 602 * a virtual disk device (vn), else we may deadlock. 603 */ 604 if ((ap->a_ioflag & IO_RECURSE) == 0) 605 bwillwrite(blksize); 606 607 /* 608 * Control the number of pending records associated with 609 * this inode. If too many have accumulated start a 610 * flush. Try to maintain a pipeline with the flusher. 611 * 612 * NOTE: It is possible for other sources to grow the 613 * records but not necessarily issue another flush, 614 * so use a timeout and ensure that a re-flush occurs. 615 */ 616 if (ip->rsv_recs >= hammer_limit_inode_recs) { 617 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 618 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 619 ip->flags |= HAMMER_INODE_RECSW; 620 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 621 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 622 } 623 } 624 625 #if 0 626 /* 627 * Do not allow HAMMER to blow out system memory by 628 * accumulating too many records. Records are so well 629 * decoupled from the buffer cache that it is possible 630 * for userland to push data out to the media via 631 * direct-write, but build up the records queued to the 632 * backend faster then the backend can flush them out. 633 * HAMMER has hit its write limit but the frontend has 634 * no pushback to slow it down. 635 */ 636 if (hmp->rsv_recs > hammer_limit_recs / 2) { 637 /* 638 * Get the inode on the flush list 639 */ 640 if (ip->rsv_recs >= 64) 641 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 642 else if (ip->rsv_recs >= 16) 643 hammer_flush_inode(ip, 0); 644 645 /* 646 * Keep the flusher going if the system keeps 647 * queueing records. 648 */ 649 delta = hmp->count_newrecords - 650 hmp->last_newrecords; 651 if (delta < 0 || delta > hammer_limit_recs / 2) { 652 hmp->last_newrecords = hmp->count_newrecords; 653 hammer_sync_hmp(hmp, MNT_NOWAIT); 654 } 655 656 /* 657 * If we have gotten behind start slowing 658 * down the writers. 659 */ 660 delta = (hmp->rsv_recs - hammer_limit_recs) * 661 hz / hammer_limit_recs; 662 if (delta > 0) 663 tsleep(&trans, 0, "hmrslo", delta); 664 } 665 #endif 666 667 /* 668 * Calculate the blocksize at the current offset and figure 669 * out how much we can actually write. 670 */ 671 blkmask = blksize - 1; 672 offset = (int)uio->uio_offset & blkmask; 673 base_offset = uio->uio_offset & ~(int64_t)blkmask; 674 n = blksize - offset; 675 if (n > uio->uio_resid) { 676 n = uio->uio_resid; 677 endofblk = 0; 678 } else { 679 endofblk = 1; 680 } 681 nsize = uio->uio_offset + n; 682 if (nsize > ip->ino_data.size) { 683 if (uio->uio_offset > ip->ino_data.size) 684 trivial = 0; 685 else 686 trivial = 1; 687 nvextendbuf(ap->a_vp, 688 ip->ino_data.size, 689 nsize, 690 hammer_blocksize(ip->ino_data.size), 691 hammer_blocksize(nsize), 692 hammer_blockoff(ip->ino_data.size), 693 hammer_blockoff(nsize), 694 trivial); 695 fixsize = 1; 696 kflags |= NOTE_EXTEND; 697 } 698 699 if (uio->uio_segflg == UIO_NOCOPY) { 700 /* 701 * Issuing a write with the same data backing the 702 * buffer. Instantiate the buffer to collect the 703 * backing vm pages, then read-in any missing bits. 704 * 705 * This case is used by vop_stdputpages(). 706 */ 707 bp = getblk(ap->a_vp, base_offset, 708 blksize, GETBLK_BHEAVY, 0); 709 if ((bp->b_flags & B_CACHE) == 0) { 710 bqrelse(bp); 711 error = bread(ap->a_vp, base_offset, 712 blksize, &bp); 713 } 714 } else if (offset == 0 && uio->uio_resid >= blksize) { 715 /* 716 * Even though we are entirely overwriting the buffer 717 * we may still have to zero it out to avoid a 718 * mmap/write visibility issue. 719 */ 720 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 721 if ((bp->b_flags & B_CACHE) == 0) 722 vfs_bio_clrbuf(bp); 723 } else if (base_offset >= ip->ino_data.size) { 724 /* 725 * If the base offset of the buffer is beyond the 726 * file EOF, we don't have to issue a read. 727 */ 728 bp = getblk(ap->a_vp, base_offset, 729 blksize, GETBLK_BHEAVY, 0); 730 vfs_bio_clrbuf(bp); 731 } else { 732 /* 733 * Partial overwrite, read in any missing bits then 734 * replace the portion being written. 735 */ 736 error = bread(ap->a_vp, base_offset, blksize, &bp); 737 if (error == 0) 738 bheavy(bp); 739 } 740 if (error == 0) { 741 lwkt_reltoken(&hmp->fs_token); 742 error = uiomove(bp->b_data + offset, n, uio); 743 lwkt_gettoken(&hmp->fs_token); 744 } 745 746 /* 747 * Generate REDO records if enabled and redo_count will not 748 * exceeded the limit. 749 * 750 * If redo_count exceeds the limit we stop generating records 751 * and clear HAMMER_INODE_REDO. This will cause the next 752 * fsync() to do a full meta-data sync instead of just an 753 * UNDO/REDO fifo update. 754 * 755 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 756 * will still be tracked. The tracks will be terminated 757 * when the related meta-data (including possible data 758 * modifications which are not tracked via REDO) is 759 * flushed. 760 */ 761 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 762 if (ip->redo_count < hammer_limit_redo) { 763 bp->b_flags |= B_VFSFLAG1; 764 error = hammer_generate_redo(&trans, ip, 765 base_offset + offset, 766 HAMMER_REDO_WRITE, 767 bp->b_data + offset, 768 (size_t)n); 769 } else { 770 ip->flags &= ~HAMMER_INODE_REDO; 771 } 772 } 773 774 /* 775 * If we screwed up we have to undo any VM size changes we 776 * made. 777 */ 778 if (error) { 779 brelse(bp); 780 if (fixsize) { 781 nvtruncbuf(ap->a_vp, ip->ino_data.size, 782 hammer_blocksize(ip->ino_data.size), 783 hammer_blockoff(ip->ino_data.size)); 784 } 785 break; 786 } 787 kflags |= NOTE_WRITE; 788 hammer_stats_file_write += n; 789 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 790 if (ip->ino_data.size < uio->uio_offset) { 791 ip->ino_data.size = uio->uio_offset; 792 flags = HAMMER_INODE_SDIRTY; 793 } else { 794 flags = 0; 795 } 796 ip->ino_data.mtime = trans.time; 797 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 798 hammer_modify_inode(&trans, ip, flags); 799 800 /* 801 * Once we dirty the buffer any cached zone-X offset 802 * becomes invalid. HAMMER NOTE: no-history mode cannot 803 * allow overwriting over the same data sector unless 804 * we provide UNDOs for the old data, which we don't. 805 */ 806 bp->b_bio2.bio_offset = NOOFFSET; 807 808 /* 809 * Final buffer disposition. 810 * 811 * Because meta-data updates are deferred, HAMMER is 812 * especially sensitive to excessive bdwrite()s because 813 * the I/O stream is not broken up by disk reads. So the 814 * buffer cache simply cannot keep up. 815 * 816 * WARNING! blksize is variable. cluster_write() is 817 * expected to not blow up if it encounters 818 * buffers that do not match the passed blksize. 819 * 820 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 821 * The ip->rsv_recs check should burst-flush the data. 822 * If we queue it immediately the buf could be left 823 * locked on the device queue for a very long time. 824 * 825 * NOTE! To avoid degenerate stalls due to mismatched block 826 * sizes we only honor IO_DIRECT on the write which 827 * abuts the end of the buffer. However, we must 828 * honor IO_SYNC in case someone is silly enough to 829 * configure a HAMMER file as swap, or when HAMMER 830 * is serving NFS (for commits). Ick ick. 831 */ 832 bp->b_flags |= B_AGE; 833 if (ap->a_ioflag & IO_SYNC) { 834 bwrite(bp); 835 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 836 bawrite(bp); 837 } else { 838 #if 0 839 if (offset + n == blksize) { 840 if (hammer_cluster_enable == 0 || 841 (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 842 bawrite(bp); 843 } else { 844 cluster_write(bp, ip->ino_data.size, 845 blksize, seqcount); 846 } 847 } else { 848 #endif 849 bdwrite(bp); 850 } 851 } 852 hammer_done_transaction(&trans); 853 hammer_knote(ap->a_vp, kflags); 854 lwkt_reltoken(&hmp->fs_token); 855 return (error); 856 } 857 858 /* 859 * hammer_vop_access { vp, mode, cred } 860 * 861 * MPSAFE - does not require fs_token 862 */ 863 static 864 int 865 hammer_vop_access(struct vop_access_args *ap) 866 { 867 struct hammer_inode *ip = VTOI(ap->a_vp); 868 uid_t uid; 869 gid_t gid; 870 int error; 871 872 ++hammer_stats_file_iopsr; 873 uid = hammer_to_unix_xid(&ip->ino_data.uid); 874 gid = hammer_to_unix_xid(&ip->ino_data.gid); 875 876 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 877 ip->ino_data.uflags); 878 return (error); 879 } 880 881 /* 882 * hammer_vop_advlock { vp, id, op, fl, flags } 883 * 884 * MPSAFE - does not require fs_token 885 */ 886 static 887 int 888 hammer_vop_advlock(struct vop_advlock_args *ap) 889 { 890 hammer_inode_t ip = VTOI(ap->a_vp); 891 892 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 893 } 894 895 /* 896 * hammer_vop_close { vp, fflag } 897 * 898 * We can only sync-on-close for normal closes. XXX disabled for now. 899 */ 900 static 901 int 902 hammer_vop_close(struct vop_close_args *ap) 903 { 904 #if 0 905 struct vnode *vp = ap->a_vp; 906 hammer_inode_t ip = VTOI(vp); 907 int waitfor; 908 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 909 if (vn_islocked(vp) == LK_EXCLUSIVE && 910 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 911 if (ip->flags & HAMMER_INODE_CLOSESYNC) 912 waitfor = MNT_WAIT; 913 else 914 waitfor = MNT_NOWAIT; 915 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 916 HAMMER_INODE_CLOSEASYNC); 917 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 918 } 919 } 920 #endif 921 return (vop_stdclose(ap)); 922 } 923 924 /* 925 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 926 * 927 * The operating system has already ensured that the directory entry 928 * does not exist and done all appropriate namespace locking. 929 */ 930 static 931 int 932 hammer_vop_ncreate(struct vop_ncreate_args *ap) 933 { 934 struct hammer_transaction trans; 935 struct hammer_inode *dip; 936 struct hammer_inode *nip; 937 struct nchandle *nch; 938 hammer_mount_t hmp; 939 int error; 940 941 nch = ap->a_nch; 942 dip = VTOI(ap->a_dvp); 943 hmp = dip->hmp; 944 945 if (dip->flags & HAMMER_INODE_RO) 946 return (EROFS); 947 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 948 return (error); 949 950 /* 951 * Create a transaction to cover the operations we perform. 952 */ 953 lwkt_gettoken(&hmp->fs_token); 954 hammer_start_transaction(&trans, hmp); 955 ++hammer_stats_file_iopsw; 956 957 /* 958 * Create a new filesystem object of the requested type. The 959 * returned inode will be referenced and shared-locked to prevent 960 * it from being moved to the flusher. 961 */ 962 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 963 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 964 NULL, &nip); 965 if (error) { 966 hkprintf("hammer_create_inode error %d\n", error); 967 hammer_done_transaction(&trans); 968 *ap->a_vpp = NULL; 969 lwkt_reltoken(&hmp->fs_token); 970 return (error); 971 } 972 973 /* 974 * Add the new filesystem object to the directory. This will also 975 * bump the inode's link count. 976 */ 977 error = hammer_ip_add_directory(&trans, dip, 978 nch->ncp->nc_name, nch->ncp->nc_nlen, 979 nip); 980 if (error) 981 hkprintf("hammer_ip_add_directory error %d\n", error); 982 983 /* 984 * Finish up. 985 */ 986 if (error) { 987 hammer_rel_inode(nip, 0); 988 hammer_done_transaction(&trans); 989 *ap->a_vpp = NULL; 990 } else { 991 error = hammer_get_vnode(nip, ap->a_vpp); 992 hammer_done_transaction(&trans); 993 hammer_rel_inode(nip, 0); 994 if (error == 0) { 995 cache_setunresolved(ap->a_nch); 996 cache_setvp(ap->a_nch, *ap->a_vpp); 997 } 998 hammer_knote(ap->a_dvp, NOTE_WRITE); 999 } 1000 lwkt_reltoken(&hmp->fs_token); 1001 return (error); 1002 } 1003 1004 /* 1005 * hammer_vop_getattr { vp, vap } 1006 * 1007 * Retrieve an inode's attribute information. When accessing inodes 1008 * historically we fake the atime field to ensure consistent results. 1009 * The atime field is stored in the B-Tree element and allowed to be 1010 * updated without cycling the element. 1011 * 1012 * MPSAFE - does not require fs_token 1013 */ 1014 static 1015 int 1016 hammer_vop_getattr(struct vop_getattr_args *ap) 1017 { 1018 struct hammer_inode *ip = VTOI(ap->a_vp); 1019 struct vattr *vap = ap->a_vap; 1020 1021 /* 1022 * We want the fsid to be different when accessing a filesystem 1023 * with different as-of's so programs like diff don't think 1024 * the files are the same. 1025 * 1026 * We also want the fsid to be the same when comparing snapshots, 1027 * or when comparing mirrors (which might be backed by different 1028 * physical devices). HAMMER fsids are based on the PFS's 1029 * shared_uuid field. 1030 * 1031 * XXX there is a chance of collision here. The va_fsid reported 1032 * by stat is different from the more involved fsid used in the 1033 * mount structure. 1034 */ 1035 ++hammer_stats_file_iopsr; 1036 hammer_lock_sh(&ip->lock); 1037 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 1038 (u_int32_t)(ip->obj_asof >> 32); 1039 1040 vap->va_fileid = ip->ino_leaf.base.obj_id; 1041 vap->va_mode = ip->ino_data.mode; 1042 vap->va_nlink = ip->ino_data.nlinks; 1043 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1044 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1045 vap->va_rmajor = 0; 1046 vap->va_rminor = 0; 1047 vap->va_size = ip->ino_data.size; 1048 1049 /* 1050 * Special case for @@PFS softlinks. The actual size of the 1051 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1052 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1053 */ 1054 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1055 ip->ino_data.size == 10 && 1056 ip->obj_asof == HAMMER_MAX_TID && 1057 ip->obj_localization == 0 && 1058 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1059 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1060 vap->va_size = 26; 1061 else 1062 vap->va_size = 10; 1063 } 1064 1065 /* 1066 * We must provide a consistent atime and mtime for snapshots 1067 * so people can do a 'tar cf - ... | md5' on them and get 1068 * consistent results. 1069 */ 1070 if (ip->flags & HAMMER_INODE_RO) { 1071 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1072 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1073 } else { 1074 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1075 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1076 } 1077 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1078 vap->va_flags = ip->ino_data.uflags; 1079 vap->va_gen = 1; /* hammer inums are unique for all time */ 1080 vap->va_blocksize = HAMMER_BUFSIZE; 1081 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1082 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1083 ~HAMMER_XBUFMASK64; 1084 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 1085 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1086 ~HAMMER_BUFMASK64; 1087 } else { 1088 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1089 } 1090 1091 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1092 vap->va_filerev = 0; /* XXX */ 1093 vap->va_uid_uuid = ip->ino_data.uid; 1094 vap->va_gid_uuid = ip->ino_data.gid; 1095 vap->va_fsid_uuid = ip->hmp->fsid; 1096 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1097 VA_FSID_UUID_VALID; 1098 1099 switch (ip->ino_data.obj_type) { 1100 case HAMMER_OBJTYPE_CDEV: 1101 case HAMMER_OBJTYPE_BDEV: 1102 vap->va_rmajor = ip->ino_data.rmajor; 1103 vap->va_rminor = ip->ino_data.rminor; 1104 break; 1105 default: 1106 break; 1107 } 1108 hammer_unlock(&ip->lock); 1109 return(0); 1110 } 1111 1112 /* 1113 * hammer_vop_nresolve { nch, dvp, cred } 1114 * 1115 * Locate the requested directory entry. 1116 */ 1117 static 1118 int 1119 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1120 { 1121 struct hammer_transaction trans; 1122 struct namecache *ncp; 1123 hammer_mount_t hmp; 1124 hammer_inode_t dip; 1125 hammer_inode_t ip; 1126 hammer_tid_t asof; 1127 struct hammer_cursor cursor; 1128 struct vnode *vp; 1129 int64_t namekey; 1130 int error; 1131 int i; 1132 int nlen; 1133 int flags; 1134 int ispfs; 1135 int64_t obj_id; 1136 u_int32_t localization; 1137 u_int32_t max_iterations; 1138 1139 /* 1140 * Misc initialization, plus handle as-of name extensions. Look for 1141 * the '@@' extension. Note that as-of files and directories cannot 1142 * be modified. 1143 */ 1144 dip = VTOI(ap->a_dvp); 1145 ncp = ap->a_nch->ncp; 1146 asof = dip->obj_asof; 1147 localization = dip->obj_localization; /* for code consistency */ 1148 nlen = ncp->nc_nlen; 1149 flags = dip->flags & HAMMER_INODE_RO; 1150 ispfs = 0; 1151 hmp = dip->hmp; 1152 1153 lwkt_gettoken(&hmp->fs_token); 1154 hammer_simple_transaction(&trans, hmp); 1155 ++hammer_stats_file_iopsr; 1156 1157 for (i = 0; i < nlen; ++i) { 1158 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1159 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1160 &ispfs, &asof, &localization); 1161 if (error != 0) { 1162 i = nlen; 1163 break; 1164 } 1165 if (asof != HAMMER_MAX_TID) 1166 flags |= HAMMER_INODE_RO; 1167 break; 1168 } 1169 } 1170 nlen = i; 1171 1172 /* 1173 * If this is a PFS softlink we dive into the PFS 1174 */ 1175 if (ispfs && nlen == 0) { 1176 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1177 asof, localization, 1178 flags, &error); 1179 if (error == 0) { 1180 error = hammer_get_vnode(ip, &vp); 1181 hammer_rel_inode(ip, 0); 1182 } else { 1183 vp = NULL; 1184 } 1185 if (error == 0) { 1186 vn_unlock(vp); 1187 cache_setvp(ap->a_nch, vp); 1188 vrele(vp); 1189 } 1190 goto done; 1191 } 1192 1193 /* 1194 * If there is no path component the time extension is relative to dip. 1195 * e.g. "fubar/@@<snapshot>" 1196 * 1197 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1198 * e.g. "fubar/.@@<snapshot>" 1199 * 1200 * ".." is handled by the kernel. We do not currently handle 1201 * "..@<snapshot>". 1202 */ 1203 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1204 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1205 asof, dip->obj_localization, 1206 flags, &error); 1207 if (error == 0) { 1208 error = hammer_get_vnode(ip, &vp); 1209 hammer_rel_inode(ip, 0); 1210 } else { 1211 vp = NULL; 1212 } 1213 if (error == 0) { 1214 vn_unlock(vp); 1215 cache_setvp(ap->a_nch, vp); 1216 vrele(vp); 1217 } 1218 goto done; 1219 } 1220 1221 /* 1222 * Calculate the namekey and setup the key range for the scan. This 1223 * works kinda like a chained hash table where the lower 32 bits 1224 * of the namekey synthesize the chain. 1225 * 1226 * The key range is inclusive of both key_beg and key_end. 1227 */ 1228 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 1229 &max_iterations); 1230 1231 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1232 cursor.key_beg.localization = dip->obj_localization + 1233 hammer_dir_localization(dip); 1234 cursor.key_beg.obj_id = dip->obj_id; 1235 cursor.key_beg.key = namekey; 1236 cursor.key_beg.create_tid = 0; 1237 cursor.key_beg.delete_tid = 0; 1238 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1239 cursor.key_beg.obj_type = 0; 1240 1241 cursor.key_end = cursor.key_beg; 1242 cursor.key_end.key += max_iterations; 1243 cursor.asof = asof; 1244 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1245 1246 /* 1247 * Scan all matching records (the chain), locate the one matching 1248 * the requested path component. 1249 * 1250 * The hammer_ip_*() functions merge in-memory records with on-disk 1251 * records for the purposes of the search. 1252 */ 1253 obj_id = 0; 1254 localization = HAMMER_DEF_LOCALIZATION; 1255 1256 if (error == 0) { 1257 error = hammer_ip_first(&cursor); 1258 while (error == 0) { 1259 error = hammer_ip_resolve_data(&cursor); 1260 if (error) 1261 break; 1262 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1263 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1264 obj_id = cursor.data->entry.obj_id; 1265 localization = cursor.data->entry.localization; 1266 break; 1267 } 1268 error = hammer_ip_next(&cursor); 1269 } 1270 } 1271 hammer_done_cursor(&cursor); 1272 1273 /* 1274 * Lookup the obj_id. This should always succeed. If it does not 1275 * the filesystem may be damaged and we return a dummy inode. 1276 */ 1277 if (error == 0) { 1278 ip = hammer_get_inode(&trans, dip, obj_id, 1279 asof, localization, 1280 flags, &error); 1281 if (error == ENOENT) { 1282 kprintf("HAMMER: WARNING: Missing " 1283 "inode for dirent \"%s\"\n" 1284 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n", 1285 ncp->nc_name, 1286 (long long)obj_id, (long long)asof, 1287 localization); 1288 error = 0; 1289 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1290 asof, localization, 1291 flags, &error); 1292 } 1293 if (error == 0) { 1294 error = hammer_get_vnode(ip, &vp); 1295 hammer_rel_inode(ip, 0); 1296 } else { 1297 vp = NULL; 1298 } 1299 if (error == 0) { 1300 vn_unlock(vp); 1301 cache_setvp(ap->a_nch, vp); 1302 vrele(vp); 1303 } 1304 } else if (error == ENOENT) { 1305 cache_setvp(ap->a_nch, NULL); 1306 } 1307 done: 1308 hammer_done_transaction(&trans); 1309 lwkt_reltoken(&hmp->fs_token); 1310 return (error); 1311 } 1312 1313 /* 1314 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1315 * 1316 * Locate the parent directory of a directory vnode. 1317 * 1318 * dvp is referenced but not locked. *vpp must be returned referenced and 1319 * locked. A parent_obj_id of 0 does not necessarily indicate that we are 1320 * at the root, instead it could indicate that the directory we were in was 1321 * removed. 1322 * 1323 * NOTE: as-of sequences are not linked into the directory structure. If 1324 * we are at the root with a different asof then the mount point, reload 1325 * the same directory with the mount point's asof. I'm not sure what this 1326 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1327 * get confused, but it hasn't been tested. 1328 */ 1329 static 1330 int 1331 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1332 { 1333 struct hammer_transaction trans; 1334 struct hammer_inode *dip; 1335 struct hammer_inode *ip; 1336 hammer_mount_t hmp; 1337 int64_t parent_obj_id; 1338 u_int32_t parent_obj_localization; 1339 hammer_tid_t asof; 1340 int error; 1341 1342 dip = VTOI(ap->a_dvp); 1343 asof = dip->obj_asof; 1344 hmp = dip->hmp; 1345 1346 /* 1347 * Whos are parent? This could be the root of a pseudo-filesystem 1348 * whos parent is in another localization domain. 1349 */ 1350 lwkt_gettoken(&hmp->fs_token); 1351 parent_obj_id = dip->ino_data.parent_obj_id; 1352 if (dip->obj_id == HAMMER_OBJID_ROOT) 1353 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1354 else 1355 parent_obj_localization = dip->obj_localization; 1356 1357 if (parent_obj_id == 0) { 1358 if (dip->obj_id == HAMMER_OBJID_ROOT && 1359 asof != hmp->asof) { 1360 parent_obj_id = dip->obj_id; 1361 asof = hmp->asof; 1362 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1363 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1364 (long long)dip->obj_asof); 1365 } else { 1366 *ap->a_vpp = NULL; 1367 lwkt_reltoken(&hmp->fs_token); 1368 return ENOENT; 1369 } 1370 } 1371 1372 hammer_simple_transaction(&trans, hmp); 1373 ++hammer_stats_file_iopsr; 1374 1375 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1376 asof, parent_obj_localization, 1377 dip->flags, &error); 1378 if (ip) { 1379 error = hammer_get_vnode(ip, ap->a_vpp); 1380 hammer_rel_inode(ip, 0); 1381 } else { 1382 *ap->a_vpp = NULL; 1383 } 1384 hammer_done_transaction(&trans); 1385 lwkt_reltoken(&hmp->fs_token); 1386 return (error); 1387 } 1388 1389 /* 1390 * hammer_vop_nlink { nch, dvp, vp, cred } 1391 */ 1392 static 1393 int 1394 hammer_vop_nlink(struct vop_nlink_args *ap) 1395 { 1396 struct hammer_transaction trans; 1397 struct hammer_inode *dip; 1398 struct hammer_inode *ip; 1399 struct nchandle *nch; 1400 hammer_mount_t hmp; 1401 int error; 1402 1403 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1404 return(EXDEV); 1405 1406 nch = ap->a_nch; 1407 dip = VTOI(ap->a_dvp); 1408 ip = VTOI(ap->a_vp); 1409 hmp = dip->hmp; 1410 1411 if (dip->obj_localization != ip->obj_localization) 1412 return(EXDEV); 1413 1414 if (dip->flags & HAMMER_INODE_RO) 1415 return (EROFS); 1416 if (ip->flags & HAMMER_INODE_RO) 1417 return (EROFS); 1418 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1419 return (error); 1420 1421 /* 1422 * Create a transaction to cover the operations we perform. 1423 */ 1424 lwkt_gettoken(&hmp->fs_token); 1425 hammer_start_transaction(&trans, hmp); 1426 ++hammer_stats_file_iopsw; 1427 1428 /* 1429 * Add the filesystem object to the directory. Note that neither 1430 * dip nor ip are referenced or locked, but their vnodes are 1431 * referenced. This function will bump the inode's link count. 1432 */ 1433 error = hammer_ip_add_directory(&trans, dip, 1434 nch->ncp->nc_name, nch->ncp->nc_nlen, 1435 ip); 1436 1437 /* 1438 * Finish up. 1439 */ 1440 if (error == 0) { 1441 cache_setunresolved(nch); 1442 cache_setvp(nch, ap->a_vp); 1443 } 1444 hammer_done_transaction(&trans); 1445 hammer_knote(ap->a_vp, NOTE_LINK); 1446 hammer_knote(ap->a_dvp, NOTE_WRITE); 1447 lwkt_reltoken(&hmp->fs_token); 1448 return (error); 1449 } 1450 1451 /* 1452 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1453 * 1454 * The operating system has already ensured that the directory entry 1455 * does not exist and done all appropriate namespace locking. 1456 */ 1457 static 1458 int 1459 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1460 { 1461 struct hammer_transaction trans; 1462 struct hammer_inode *dip; 1463 struct hammer_inode *nip; 1464 struct nchandle *nch; 1465 hammer_mount_t hmp; 1466 int error; 1467 1468 nch = ap->a_nch; 1469 dip = VTOI(ap->a_dvp); 1470 hmp = dip->hmp; 1471 1472 if (dip->flags & HAMMER_INODE_RO) 1473 return (EROFS); 1474 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1475 return (error); 1476 1477 /* 1478 * Create a transaction to cover the operations we perform. 1479 */ 1480 lwkt_gettoken(&hmp->fs_token); 1481 hammer_start_transaction(&trans, hmp); 1482 ++hammer_stats_file_iopsw; 1483 1484 /* 1485 * Create a new filesystem object of the requested type. The 1486 * returned inode will be referenced but not locked. 1487 */ 1488 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1489 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1490 NULL, &nip); 1491 if (error) { 1492 hkprintf("hammer_mkdir error %d\n", error); 1493 hammer_done_transaction(&trans); 1494 *ap->a_vpp = NULL; 1495 lwkt_reltoken(&hmp->fs_token); 1496 return (error); 1497 } 1498 /* 1499 * Add the new filesystem object to the directory. This will also 1500 * bump the inode's link count. 1501 */ 1502 error = hammer_ip_add_directory(&trans, dip, 1503 nch->ncp->nc_name, nch->ncp->nc_nlen, 1504 nip); 1505 if (error) 1506 hkprintf("hammer_mkdir (add) error %d\n", error); 1507 1508 /* 1509 * Finish up. 1510 */ 1511 if (error) { 1512 hammer_rel_inode(nip, 0); 1513 *ap->a_vpp = NULL; 1514 } else { 1515 error = hammer_get_vnode(nip, ap->a_vpp); 1516 hammer_rel_inode(nip, 0); 1517 if (error == 0) { 1518 cache_setunresolved(ap->a_nch); 1519 cache_setvp(ap->a_nch, *ap->a_vpp); 1520 } 1521 } 1522 hammer_done_transaction(&trans); 1523 if (error == 0) 1524 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1525 lwkt_reltoken(&hmp->fs_token); 1526 return (error); 1527 } 1528 1529 /* 1530 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1531 * 1532 * The operating system has already ensured that the directory entry 1533 * does not exist and done all appropriate namespace locking. 1534 */ 1535 static 1536 int 1537 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1538 { 1539 struct hammer_transaction trans; 1540 struct hammer_inode *dip; 1541 struct hammer_inode *nip; 1542 struct nchandle *nch; 1543 hammer_mount_t hmp; 1544 int error; 1545 1546 nch = ap->a_nch; 1547 dip = VTOI(ap->a_dvp); 1548 hmp = dip->hmp; 1549 1550 if (dip->flags & HAMMER_INODE_RO) 1551 return (EROFS); 1552 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1553 return (error); 1554 1555 /* 1556 * Create a transaction to cover the operations we perform. 1557 */ 1558 lwkt_gettoken(&hmp->fs_token); 1559 hammer_start_transaction(&trans, hmp); 1560 ++hammer_stats_file_iopsw; 1561 1562 /* 1563 * Create a new filesystem object of the requested type. The 1564 * returned inode will be referenced but not locked. 1565 * 1566 * If mknod specifies a directory a pseudo-fs is created. 1567 */ 1568 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1569 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1570 NULL, &nip); 1571 if (error) { 1572 hammer_done_transaction(&trans); 1573 *ap->a_vpp = NULL; 1574 lwkt_reltoken(&hmp->fs_token); 1575 return (error); 1576 } 1577 1578 /* 1579 * Add the new filesystem object to the directory. This will also 1580 * bump the inode's link count. 1581 */ 1582 error = hammer_ip_add_directory(&trans, dip, 1583 nch->ncp->nc_name, nch->ncp->nc_nlen, 1584 nip); 1585 1586 /* 1587 * Finish up. 1588 */ 1589 if (error) { 1590 hammer_rel_inode(nip, 0); 1591 *ap->a_vpp = NULL; 1592 } else { 1593 error = hammer_get_vnode(nip, ap->a_vpp); 1594 hammer_rel_inode(nip, 0); 1595 if (error == 0) { 1596 cache_setunresolved(ap->a_nch); 1597 cache_setvp(ap->a_nch, *ap->a_vpp); 1598 } 1599 } 1600 hammer_done_transaction(&trans); 1601 if (error == 0) 1602 hammer_knote(ap->a_dvp, NOTE_WRITE); 1603 lwkt_reltoken(&hmp->fs_token); 1604 return (error); 1605 } 1606 1607 /* 1608 * hammer_vop_open { vp, mode, cred, fp } 1609 * 1610 * MPSAFE (does not require fs_token) 1611 */ 1612 static 1613 int 1614 hammer_vop_open(struct vop_open_args *ap) 1615 { 1616 hammer_inode_t ip; 1617 1618 ++hammer_stats_file_iopsr; 1619 ip = VTOI(ap->a_vp); 1620 1621 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1622 return (EROFS); 1623 return(vop_stdopen(ap)); 1624 } 1625 1626 /* 1627 * hammer_vop_print { vp } 1628 */ 1629 static 1630 int 1631 hammer_vop_print(struct vop_print_args *ap) 1632 { 1633 return EOPNOTSUPP; 1634 } 1635 1636 /* 1637 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1638 */ 1639 static 1640 int 1641 hammer_vop_readdir(struct vop_readdir_args *ap) 1642 { 1643 struct hammer_transaction trans; 1644 struct hammer_cursor cursor; 1645 struct hammer_inode *ip; 1646 hammer_mount_t hmp; 1647 struct uio *uio; 1648 hammer_base_elm_t base; 1649 int error; 1650 int cookie_index; 1651 int ncookies; 1652 off_t *cookies; 1653 off_t saveoff; 1654 int r; 1655 int dtype; 1656 1657 ++hammer_stats_file_iopsr; 1658 ip = VTOI(ap->a_vp); 1659 uio = ap->a_uio; 1660 saveoff = uio->uio_offset; 1661 hmp = ip->hmp; 1662 1663 if (ap->a_ncookies) { 1664 ncookies = uio->uio_resid / 16 + 1; 1665 if (ncookies > 1024) 1666 ncookies = 1024; 1667 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1668 cookie_index = 0; 1669 } else { 1670 ncookies = -1; 1671 cookies = NULL; 1672 cookie_index = 0; 1673 } 1674 1675 lwkt_gettoken(&hmp->fs_token); 1676 hammer_simple_transaction(&trans, hmp); 1677 1678 /* 1679 * Handle artificial entries 1680 * 1681 * It should be noted that the minimum value for a directory 1682 * hash key on-media is 0x0000000100000000, so we can use anything 1683 * less then that to represent our 'special' key space. 1684 */ 1685 error = 0; 1686 if (saveoff == 0) { 1687 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1688 if (r) 1689 goto done; 1690 if (cookies) 1691 cookies[cookie_index] = saveoff; 1692 ++saveoff; 1693 ++cookie_index; 1694 if (cookie_index == ncookies) 1695 goto done; 1696 } 1697 if (saveoff == 1) { 1698 if (ip->ino_data.parent_obj_id) { 1699 r = vop_write_dirent(&error, uio, 1700 ip->ino_data.parent_obj_id, 1701 DT_DIR, 2, ".."); 1702 } else { 1703 r = vop_write_dirent(&error, uio, 1704 ip->obj_id, DT_DIR, 2, ".."); 1705 } 1706 if (r) 1707 goto done; 1708 if (cookies) 1709 cookies[cookie_index] = saveoff; 1710 ++saveoff; 1711 ++cookie_index; 1712 if (cookie_index == ncookies) 1713 goto done; 1714 } 1715 1716 /* 1717 * Key range (begin and end inclusive) to scan. Directory keys 1718 * directly translate to a 64 bit 'seek' position. 1719 */ 1720 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1721 cursor.key_beg.localization = ip->obj_localization + 1722 hammer_dir_localization(ip); 1723 cursor.key_beg.obj_id = ip->obj_id; 1724 cursor.key_beg.create_tid = 0; 1725 cursor.key_beg.delete_tid = 0; 1726 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1727 cursor.key_beg.obj_type = 0; 1728 cursor.key_beg.key = saveoff; 1729 1730 cursor.key_end = cursor.key_beg; 1731 cursor.key_end.key = HAMMER_MAX_KEY; 1732 cursor.asof = ip->obj_asof; 1733 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1734 1735 error = hammer_ip_first(&cursor); 1736 1737 while (error == 0) { 1738 error = hammer_ip_resolve_data(&cursor); 1739 if (error) 1740 break; 1741 base = &cursor.leaf->base; 1742 saveoff = base->key; 1743 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1744 1745 if (base->obj_id != ip->obj_id) 1746 panic("readdir: bad record at %p", cursor.node); 1747 1748 /* 1749 * Convert pseudo-filesystems into softlinks 1750 */ 1751 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1752 r = vop_write_dirent( 1753 &error, uio, cursor.data->entry.obj_id, 1754 dtype, 1755 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1756 (void *)cursor.data->entry.name); 1757 if (r) 1758 break; 1759 ++saveoff; 1760 if (cookies) 1761 cookies[cookie_index] = base->key; 1762 ++cookie_index; 1763 if (cookie_index == ncookies) 1764 break; 1765 error = hammer_ip_next(&cursor); 1766 } 1767 hammer_done_cursor(&cursor); 1768 1769 done: 1770 hammer_done_transaction(&trans); 1771 1772 if (ap->a_eofflag) 1773 *ap->a_eofflag = (error == ENOENT); 1774 uio->uio_offset = saveoff; 1775 if (error && cookie_index == 0) { 1776 if (error == ENOENT) 1777 error = 0; 1778 if (cookies) { 1779 kfree(cookies, M_TEMP); 1780 *ap->a_ncookies = 0; 1781 *ap->a_cookies = NULL; 1782 } 1783 } else { 1784 if (error == ENOENT) 1785 error = 0; 1786 if (cookies) { 1787 *ap->a_ncookies = cookie_index; 1788 *ap->a_cookies = cookies; 1789 } 1790 } 1791 lwkt_reltoken(&hmp->fs_token); 1792 return(error); 1793 } 1794 1795 /* 1796 * hammer_vop_readlink { vp, uio, cred } 1797 */ 1798 static 1799 int 1800 hammer_vop_readlink(struct vop_readlink_args *ap) 1801 { 1802 struct hammer_transaction trans; 1803 struct hammer_cursor cursor; 1804 struct hammer_inode *ip; 1805 hammer_mount_t hmp; 1806 char buf[32]; 1807 u_int32_t localization; 1808 hammer_pseudofs_inmem_t pfsm; 1809 int error; 1810 1811 ip = VTOI(ap->a_vp); 1812 hmp = ip->hmp; 1813 1814 lwkt_gettoken(&hmp->fs_token); 1815 1816 /* 1817 * Shortcut if the symlink data was stuffed into ino_data. 1818 * 1819 * Also expand special "@@PFS%05d" softlinks (expansion only 1820 * occurs for non-historical (current) accesses made from the 1821 * primary filesystem). 1822 */ 1823 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1824 char *ptr; 1825 int bytes; 1826 1827 ptr = ip->ino_data.ext.symlink; 1828 bytes = (int)ip->ino_data.size; 1829 if (bytes == 10 && 1830 ip->obj_asof == HAMMER_MAX_TID && 1831 ip->obj_localization == 0 && 1832 strncmp(ptr, "@@PFS", 5) == 0) { 1833 hammer_simple_transaction(&trans, hmp); 1834 bcopy(ptr + 5, buf, 5); 1835 buf[5] = 0; 1836 localization = strtoul(buf, NULL, 10) << 16; 1837 pfsm = hammer_load_pseudofs(&trans, localization, 1838 &error); 1839 if (error == 0) { 1840 if (pfsm->pfsd.mirror_flags & 1841 HAMMER_PFSD_SLAVE) { 1842 /* vap->va_size == 26 */ 1843 ksnprintf(buf, sizeof(buf), 1844 "@@0x%016llx:%05d", 1845 (long long)pfsm->pfsd.sync_end_tid, 1846 localization >> 16); 1847 } else { 1848 /* vap->va_size == 10 */ 1849 ksnprintf(buf, sizeof(buf), 1850 "@@-1:%05d", 1851 localization >> 16); 1852 #if 0 1853 ksnprintf(buf, sizeof(buf), 1854 "@@0x%016llx:%05d", 1855 (long long)HAMMER_MAX_TID, 1856 localization >> 16); 1857 #endif 1858 } 1859 ptr = buf; 1860 bytes = strlen(buf); 1861 } 1862 if (pfsm) 1863 hammer_rel_pseudofs(hmp, pfsm); 1864 hammer_done_transaction(&trans); 1865 } 1866 error = uiomove(ptr, bytes, ap->a_uio); 1867 lwkt_reltoken(&hmp->fs_token); 1868 return(error); 1869 } 1870 1871 /* 1872 * Long version 1873 */ 1874 hammer_simple_transaction(&trans, hmp); 1875 ++hammer_stats_file_iopsr; 1876 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1877 1878 /* 1879 * Key range (begin and end inclusive) to scan. Directory keys 1880 * directly translate to a 64 bit 'seek' position. 1881 */ 1882 cursor.key_beg.localization = ip->obj_localization + 1883 HAMMER_LOCALIZE_MISC; 1884 cursor.key_beg.obj_id = ip->obj_id; 1885 cursor.key_beg.create_tid = 0; 1886 cursor.key_beg.delete_tid = 0; 1887 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1888 cursor.key_beg.obj_type = 0; 1889 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1890 cursor.asof = ip->obj_asof; 1891 cursor.flags |= HAMMER_CURSOR_ASOF; 1892 1893 error = hammer_ip_lookup(&cursor); 1894 if (error == 0) { 1895 error = hammer_ip_resolve_data(&cursor); 1896 if (error == 0) { 1897 KKASSERT(cursor.leaf->data_len >= 1898 HAMMER_SYMLINK_NAME_OFF); 1899 error = uiomove(cursor.data->symlink.name, 1900 cursor.leaf->data_len - 1901 HAMMER_SYMLINK_NAME_OFF, 1902 ap->a_uio); 1903 } 1904 } 1905 hammer_done_cursor(&cursor); 1906 hammer_done_transaction(&trans); 1907 lwkt_reltoken(&hmp->fs_token); 1908 return(error); 1909 } 1910 1911 /* 1912 * hammer_vop_nremove { nch, dvp, cred } 1913 */ 1914 static 1915 int 1916 hammer_vop_nremove(struct vop_nremove_args *ap) 1917 { 1918 struct hammer_transaction trans; 1919 struct hammer_inode *dip; 1920 hammer_mount_t hmp; 1921 int error; 1922 1923 dip = VTOI(ap->a_dvp); 1924 hmp = dip->hmp; 1925 1926 if (hammer_nohistory(dip) == 0 && 1927 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1928 return (error); 1929 } 1930 1931 lwkt_gettoken(&hmp->fs_token); 1932 hammer_start_transaction(&trans, hmp); 1933 ++hammer_stats_file_iopsw; 1934 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1935 hammer_done_transaction(&trans); 1936 if (error == 0) 1937 hammer_knote(ap->a_dvp, NOTE_WRITE); 1938 lwkt_reltoken(&hmp->fs_token); 1939 return (error); 1940 } 1941 1942 /* 1943 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1944 */ 1945 static 1946 int 1947 hammer_vop_nrename(struct vop_nrename_args *ap) 1948 { 1949 struct hammer_transaction trans; 1950 struct namecache *fncp; 1951 struct namecache *tncp; 1952 struct hammer_inode *fdip; 1953 struct hammer_inode *tdip; 1954 struct hammer_inode *ip; 1955 hammer_mount_t hmp; 1956 struct hammer_cursor cursor; 1957 int64_t namekey; 1958 u_int32_t max_iterations; 1959 int nlen, error; 1960 1961 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1962 return(EXDEV); 1963 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1964 return(EXDEV); 1965 1966 fdip = VTOI(ap->a_fdvp); 1967 tdip = VTOI(ap->a_tdvp); 1968 fncp = ap->a_fnch->ncp; 1969 tncp = ap->a_tnch->ncp; 1970 ip = VTOI(fncp->nc_vp); 1971 KKASSERT(ip != NULL); 1972 1973 hmp = ip->hmp; 1974 1975 if (fdip->obj_localization != tdip->obj_localization) 1976 return(EXDEV); 1977 if (fdip->obj_localization != ip->obj_localization) 1978 return(EXDEV); 1979 1980 if (fdip->flags & HAMMER_INODE_RO) 1981 return (EROFS); 1982 if (tdip->flags & HAMMER_INODE_RO) 1983 return (EROFS); 1984 if (ip->flags & HAMMER_INODE_RO) 1985 return (EROFS); 1986 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1987 return (error); 1988 1989 lwkt_gettoken(&hmp->fs_token); 1990 hammer_start_transaction(&trans, hmp); 1991 ++hammer_stats_file_iopsw; 1992 1993 /* 1994 * Remove tncp from the target directory and then link ip as 1995 * tncp. XXX pass trans to dounlink 1996 * 1997 * Force the inode sync-time to match the transaction so it is 1998 * in-sync with the creation of the target directory entry. 1999 */ 2000 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 2001 ap->a_cred, 0, -1); 2002 if (error == 0 || error == ENOENT) { 2003 error = hammer_ip_add_directory(&trans, tdip, 2004 tncp->nc_name, tncp->nc_nlen, 2005 ip); 2006 if (error == 0) { 2007 ip->ino_data.parent_obj_id = tdip->obj_id; 2008 ip->ino_data.ctime = trans.time; 2009 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 2010 } 2011 } 2012 if (error) 2013 goto failed; /* XXX */ 2014 2015 /* 2016 * Locate the record in the originating directory and remove it. 2017 * 2018 * Calculate the namekey and setup the key range for the scan. This 2019 * works kinda like a chained hash table where the lower 32 bits 2020 * of the namekey synthesize the chain. 2021 * 2022 * The key range is inclusive of both key_beg and key_end. 2023 */ 2024 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 2025 &max_iterations); 2026 retry: 2027 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 2028 cursor.key_beg.localization = fdip->obj_localization + 2029 hammer_dir_localization(fdip); 2030 cursor.key_beg.obj_id = fdip->obj_id; 2031 cursor.key_beg.key = namekey; 2032 cursor.key_beg.create_tid = 0; 2033 cursor.key_beg.delete_tid = 0; 2034 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2035 cursor.key_beg.obj_type = 0; 2036 2037 cursor.key_end = cursor.key_beg; 2038 cursor.key_end.key += max_iterations; 2039 cursor.asof = fdip->obj_asof; 2040 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2041 2042 /* 2043 * Scan all matching records (the chain), locate the one matching 2044 * the requested path component. 2045 * 2046 * The hammer_ip_*() functions merge in-memory records with on-disk 2047 * records for the purposes of the search. 2048 */ 2049 error = hammer_ip_first(&cursor); 2050 while (error == 0) { 2051 if (hammer_ip_resolve_data(&cursor) != 0) 2052 break; 2053 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2054 KKASSERT(nlen > 0); 2055 if (fncp->nc_nlen == nlen && 2056 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2057 break; 2058 } 2059 error = hammer_ip_next(&cursor); 2060 } 2061 2062 /* 2063 * If all is ok we have to get the inode so we can adjust nlinks. 2064 * 2065 * WARNING: hammer_ip_del_directory() may have to terminate the 2066 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2067 * twice. 2068 */ 2069 if (error == 0) 2070 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 2071 2072 /* 2073 * XXX A deadlock here will break rename's atomicy for the purposes 2074 * of crash recovery. 2075 */ 2076 if (error == EDEADLK) { 2077 hammer_done_cursor(&cursor); 2078 goto retry; 2079 } 2080 2081 /* 2082 * Cleanup and tell the kernel that the rename succeeded. 2083 * 2084 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2085 * without formally acquiring the vp since the vp might 2086 * have zero refs on it, or in the middle of a reclaim, 2087 * etc. 2088 */ 2089 hammer_done_cursor(&cursor); 2090 if (error == 0) { 2091 cache_rename(ap->a_fnch, ap->a_tnch); 2092 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2093 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2094 while (ip->vp) { 2095 struct vnode *vp; 2096 2097 error = hammer_get_vnode(ip, &vp); 2098 if (error == 0 && vp) { 2099 vn_unlock(vp); 2100 hammer_knote(ip->vp, NOTE_RENAME); 2101 vrele(vp); 2102 break; 2103 } 2104 kprintf("Debug: HAMMER ip/vp race2 avoided\n"); 2105 } 2106 } 2107 2108 failed: 2109 hammer_done_transaction(&trans); 2110 lwkt_reltoken(&hmp->fs_token); 2111 return (error); 2112 } 2113 2114 /* 2115 * hammer_vop_nrmdir { nch, dvp, cred } 2116 */ 2117 static 2118 int 2119 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2120 { 2121 struct hammer_transaction trans; 2122 struct hammer_inode *dip; 2123 hammer_mount_t hmp; 2124 int error; 2125 2126 dip = VTOI(ap->a_dvp); 2127 hmp = dip->hmp; 2128 2129 if (hammer_nohistory(dip) == 0 && 2130 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2131 return (error); 2132 } 2133 2134 lwkt_gettoken(&hmp->fs_token); 2135 hammer_start_transaction(&trans, hmp); 2136 ++hammer_stats_file_iopsw; 2137 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2138 hammer_done_transaction(&trans); 2139 if (error == 0) 2140 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2141 lwkt_reltoken(&hmp->fs_token); 2142 return (error); 2143 } 2144 2145 /* 2146 * hammer_vop_markatime { vp, cred } 2147 */ 2148 static 2149 int 2150 hammer_vop_markatime(struct vop_markatime_args *ap) 2151 { 2152 struct hammer_transaction trans; 2153 struct hammer_inode *ip; 2154 hammer_mount_t hmp; 2155 2156 ip = VTOI(ap->a_vp); 2157 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2158 return (EROFS); 2159 if (ip->flags & HAMMER_INODE_RO) 2160 return (EROFS); 2161 hmp = ip->hmp; 2162 if (hmp->mp->mnt_flag & MNT_NOATIME) 2163 return (0); 2164 lwkt_gettoken(&hmp->fs_token); 2165 hammer_start_transaction(&trans, hmp); 2166 ++hammer_stats_file_iopsw; 2167 2168 ip->ino_data.atime = trans.time; 2169 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2170 hammer_done_transaction(&trans); 2171 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2172 lwkt_reltoken(&hmp->fs_token); 2173 return (0); 2174 } 2175 2176 /* 2177 * hammer_vop_setattr { vp, vap, cred } 2178 */ 2179 static 2180 int 2181 hammer_vop_setattr(struct vop_setattr_args *ap) 2182 { 2183 struct hammer_transaction trans; 2184 struct hammer_inode *ip; 2185 struct vattr *vap; 2186 hammer_mount_t hmp; 2187 int modflags; 2188 int error; 2189 int truncating; 2190 int blksize; 2191 int kflags; 2192 #if 0 2193 int64_t aligned_size; 2194 #endif 2195 u_int32_t flags; 2196 2197 vap = ap->a_vap; 2198 ip = ap->a_vp->v_data; 2199 modflags = 0; 2200 kflags = 0; 2201 hmp = ip->hmp; 2202 2203 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2204 return(EROFS); 2205 if (ip->flags & HAMMER_INODE_RO) 2206 return (EROFS); 2207 if (hammer_nohistory(ip) == 0 && 2208 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2209 return (error); 2210 } 2211 2212 lwkt_gettoken(&hmp->fs_token); 2213 hammer_start_transaction(&trans, hmp); 2214 ++hammer_stats_file_iopsw; 2215 error = 0; 2216 2217 if (vap->va_flags != VNOVAL) { 2218 flags = ip->ino_data.uflags; 2219 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2220 hammer_to_unix_xid(&ip->ino_data.uid), 2221 ap->a_cred); 2222 if (error == 0) { 2223 if (ip->ino_data.uflags != flags) { 2224 ip->ino_data.uflags = flags; 2225 ip->ino_data.ctime = trans.time; 2226 modflags |= HAMMER_INODE_DDIRTY; 2227 kflags |= NOTE_ATTRIB; 2228 } 2229 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2230 error = 0; 2231 goto done; 2232 } 2233 } 2234 goto done; 2235 } 2236 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2237 error = EPERM; 2238 goto done; 2239 } 2240 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2241 mode_t cur_mode = ip->ino_data.mode; 2242 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2243 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2244 uuid_t uuid_uid; 2245 uuid_t uuid_gid; 2246 2247 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2248 ap->a_cred, 2249 &cur_uid, &cur_gid, &cur_mode); 2250 if (error == 0) { 2251 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2252 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2253 if (bcmp(&uuid_uid, &ip->ino_data.uid, 2254 sizeof(uuid_uid)) || 2255 bcmp(&uuid_gid, &ip->ino_data.gid, 2256 sizeof(uuid_gid)) || 2257 ip->ino_data.mode != cur_mode 2258 ) { 2259 ip->ino_data.uid = uuid_uid; 2260 ip->ino_data.gid = uuid_gid; 2261 ip->ino_data.mode = cur_mode; 2262 ip->ino_data.ctime = trans.time; 2263 modflags |= HAMMER_INODE_DDIRTY; 2264 } 2265 kflags |= NOTE_ATTRIB; 2266 } 2267 } 2268 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2269 switch(ap->a_vp->v_type) { 2270 case VREG: 2271 if (vap->va_size == ip->ino_data.size) 2272 break; 2273 2274 /* 2275 * Log the operation if in fast-fsync mode or if 2276 * there are unterminated redo write records present. 2277 * 2278 * The second check is needed so the recovery code 2279 * properly truncates write redos even if nominal 2280 * REDO operations is turned off due to excessive 2281 * writes, because the related records might be 2282 * destroyed and never lay down a TERM_WRITE. 2283 */ 2284 if ((ip->flags & HAMMER_INODE_REDO) || 2285 (ip->flags & HAMMER_INODE_RDIRTY)) { 2286 error = hammer_generate_redo(&trans, ip, 2287 vap->va_size, 2288 HAMMER_REDO_TRUNC, 2289 NULL, 0); 2290 } 2291 blksize = hammer_blocksize(vap->va_size); 2292 2293 /* 2294 * XXX break atomicy, we can deadlock the backend 2295 * if we do not release the lock. Probably not a 2296 * big deal here. 2297 */ 2298 if (vap->va_size < ip->ino_data.size) { 2299 nvtruncbuf(ap->a_vp, vap->va_size, 2300 blksize, 2301 hammer_blockoff(vap->va_size)); 2302 truncating = 1; 2303 kflags |= NOTE_WRITE; 2304 } else { 2305 nvextendbuf(ap->a_vp, 2306 ip->ino_data.size, 2307 vap->va_size, 2308 hammer_blocksize(ip->ino_data.size), 2309 hammer_blocksize(vap->va_size), 2310 hammer_blockoff(ip->ino_data.size), 2311 hammer_blockoff(vap->va_size), 2312 0); 2313 truncating = 0; 2314 kflags |= NOTE_WRITE | NOTE_EXTEND; 2315 } 2316 ip->ino_data.size = vap->va_size; 2317 ip->ino_data.mtime = trans.time; 2318 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2319 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2320 2321 /* 2322 * On-media truncation is cached in the inode until 2323 * the inode is synchronized. We must immediately 2324 * handle any frontend records. 2325 */ 2326 if (truncating) { 2327 hammer_ip_frontend_trunc(ip, vap->va_size); 2328 #ifdef DEBUG_TRUNCATE 2329 if (HammerTruncIp == NULL) 2330 HammerTruncIp = ip; 2331 #endif 2332 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2333 ip->flags |= HAMMER_INODE_TRUNCATED; 2334 ip->trunc_off = vap->va_size; 2335 #ifdef DEBUG_TRUNCATE 2336 if (ip == HammerTruncIp) 2337 kprintf("truncate1 %016llx\n", 2338 (long long)ip->trunc_off); 2339 #endif 2340 } else if (ip->trunc_off > vap->va_size) { 2341 ip->trunc_off = vap->va_size; 2342 #ifdef DEBUG_TRUNCATE 2343 if (ip == HammerTruncIp) 2344 kprintf("truncate2 %016llx\n", 2345 (long long)ip->trunc_off); 2346 #endif 2347 } else { 2348 #ifdef DEBUG_TRUNCATE 2349 if (ip == HammerTruncIp) 2350 kprintf("truncate3 %016llx (ignored)\n", 2351 (long long)vap->va_size); 2352 #endif 2353 } 2354 } 2355 2356 #if 0 2357 /* 2358 * When truncating, nvtruncbuf() may have cleaned out 2359 * a portion of the last block on-disk in the buffer 2360 * cache. We must clean out any frontend records 2361 * for blocks beyond the new last block. 2362 */ 2363 aligned_size = (vap->va_size + (blksize - 1)) & 2364 ~(int64_t)(blksize - 1); 2365 if (truncating && vap->va_size < aligned_size) { 2366 aligned_size -= blksize; 2367 hammer_ip_frontend_trunc(ip, aligned_size); 2368 } 2369 #endif 2370 break; 2371 case VDATABASE: 2372 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2373 ip->flags |= HAMMER_INODE_TRUNCATED; 2374 ip->trunc_off = vap->va_size; 2375 } else if (ip->trunc_off > vap->va_size) { 2376 ip->trunc_off = vap->va_size; 2377 } 2378 hammer_ip_frontend_trunc(ip, vap->va_size); 2379 ip->ino_data.size = vap->va_size; 2380 ip->ino_data.mtime = trans.time; 2381 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2382 kflags |= NOTE_ATTRIB; 2383 break; 2384 default: 2385 error = EINVAL; 2386 goto done; 2387 } 2388 break; 2389 } 2390 if (vap->va_atime.tv_sec != VNOVAL) { 2391 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2392 modflags |= HAMMER_INODE_ATIME; 2393 kflags |= NOTE_ATTRIB; 2394 } 2395 if (vap->va_mtime.tv_sec != VNOVAL) { 2396 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2397 modflags |= HAMMER_INODE_MTIME; 2398 kflags |= NOTE_ATTRIB; 2399 } 2400 if (vap->va_mode != (mode_t)VNOVAL) { 2401 mode_t cur_mode = ip->ino_data.mode; 2402 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2403 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2404 2405 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2406 cur_uid, cur_gid, &cur_mode); 2407 if (error == 0 && ip->ino_data.mode != cur_mode) { 2408 ip->ino_data.mode = cur_mode; 2409 ip->ino_data.ctime = trans.time; 2410 modflags |= HAMMER_INODE_DDIRTY; 2411 kflags |= NOTE_ATTRIB; 2412 } 2413 } 2414 done: 2415 if (error == 0) 2416 hammer_modify_inode(&trans, ip, modflags); 2417 hammer_done_transaction(&trans); 2418 hammer_knote(ap->a_vp, kflags); 2419 lwkt_reltoken(&hmp->fs_token); 2420 return (error); 2421 } 2422 2423 /* 2424 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 2425 */ 2426 static 2427 int 2428 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2429 { 2430 struct hammer_transaction trans; 2431 struct hammer_inode *dip; 2432 struct hammer_inode *nip; 2433 hammer_record_t record; 2434 struct nchandle *nch; 2435 hammer_mount_t hmp; 2436 int error; 2437 int bytes; 2438 2439 ap->a_vap->va_type = VLNK; 2440 2441 nch = ap->a_nch; 2442 dip = VTOI(ap->a_dvp); 2443 hmp = dip->hmp; 2444 2445 if (dip->flags & HAMMER_INODE_RO) 2446 return (EROFS); 2447 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2448 return (error); 2449 2450 /* 2451 * Create a transaction to cover the operations we perform. 2452 */ 2453 lwkt_gettoken(&hmp->fs_token); 2454 hammer_start_transaction(&trans, hmp); 2455 ++hammer_stats_file_iopsw; 2456 2457 /* 2458 * Create a new filesystem object of the requested type. The 2459 * returned inode will be referenced but not locked. 2460 */ 2461 2462 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2463 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2464 NULL, &nip); 2465 if (error) { 2466 hammer_done_transaction(&trans); 2467 *ap->a_vpp = NULL; 2468 lwkt_reltoken(&hmp->fs_token); 2469 return (error); 2470 } 2471 2472 /* 2473 * Add a record representing the symlink. symlink stores the link 2474 * as pure data, not a string, and is no \0 terminated. 2475 */ 2476 if (error == 0) { 2477 bytes = strlen(ap->a_target); 2478 2479 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2480 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2481 } else { 2482 record = hammer_alloc_mem_record(nip, bytes); 2483 record->type = HAMMER_MEM_RECORD_GENERAL; 2484 2485 record->leaf.base.localization = nip->obj_localization + 2486 HAMMER_LOCALIZE_MISC; 2487 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2488 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2489 record->leaf.data_len = bytes; 2490 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2491 bcopy(ap->a_target, record->data->symlink.name, bytes); 2492 error = hammer_ip_add_record(&trans, record); 2493 } 2494 2495 /* 2496 * Set the file size to the length of the link. 2497 */ 2498 if (error == 0) { 2499 nip->ino_data.size = bytes; 2500 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2501 } 2502 } 2503 if (error == 0) 2504 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2505 nch->ncp->nc_nlen, nip); 2506 2507 /* 2508 * Finish up. 2509 */ 2510 if (error) { 2511 hammer_rel_inode(nip, 0); 2512 *ap->a_vpp = NULL; 2513 } else { 2514 error = hammer_get_vnode(nip, ap->a_vpp); 2515 hammer_rel_inode(nip, 0); 2516 if (error == 0) { 2517 cache_setunresolved(ap->a_nch); 2518 cache_setvp(ap->a_nch, *ap->a_vpp); 2519 hammer_knote(ap->a_dvp, NOTE_WRITE); 2520 } 2521 } 2522 hammer_done_transaction(&trans); 2523 lwkt_reltoken(&hmp->fs_token); 2524 return (error); 2525 } 2526 2527 /* 2528 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2529 */ 2530 static 2531 int 2532 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2533 { 2534 struct hammer_transaction trans; 2535 struct hammer_inode *dip; 2536 hammer_mount_t hmp; 2537 int error; 2538 2539 dip = VTOI(ap->a_dvp); 2540 hmp = dip->hmp; 2541 2542 if (hammer_nohistory(dip) == 0 && 2543 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2544 return (error); 2545 } 2546 2547 lwkt_gettoken(&hmp->fs_token); 2548 hammer_start_transaction(&trans, hmp); 2549 ++hammer_stats_file_iopsw; 2550 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2551 ap->a_cred, ap->a_flags, -1); 2552 hammer_done_transaction(&trans); 2553 lwkt_reltoken(&hmp->fs_token); 2554 2555 return (error); 2556 } 2557 2558 /* 2559 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2560 */ 2561 static 2562 int 2563 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2564 { 2565 struct hammer_inode *ip = ap->a_vp->v_data; 2566 hammer_mount_t hmp = ip->hmp; 2567 int error; 2568 2569 ++hammer_stats_file_iopsr; 2570 lwkt_gettoken(&hmp->fs_token); 2571 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2572 ap->a_fflag, ap->a_cred); 2573 lwkt_reltoken(&hmp->fs_token); 2574 return (error); 2575 } 2576 2577 static 2578 int 2579 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2580 { 2581 static const struct mountctl_opt extraopt[] = { 2582 { HMNT_NOHISTORY, "nohistory" }, 2583 { HMNT_MASTERID, "master" }, 2584 { 0, NULL} 2585 2586 }; 2587 struct hammer_mount *hmp; 2588 struct mount *mp; 2589 int usedbytes; 2590 int error; 2591 2592 error = 0; 2593 usedbytes = 0; 2594 mp = ap->a_head.a_ops->head.vv_mount; 2595 KKASSERT(mp->mnt_data != NULL); 2596 hmp = (struct hammer_mount *)mp->mnt_data; 2597 2598 lwkt_gettoken(&hmp->fs_token); 2599 2600 switch(ap->a_op) { 2601 case MOUNTCTL_SET_EXPORT: 2602 if (ap->a_ctllen != sizeof(struct export_args)) 2603 error = EINVAL; 2604 else 2605 error = hammer_vfs_export(mp, ap->a_op, 2606 (const struct export_args *)ap->a_ctl); 2607 break; 2608 case MOUNTCTL_MOUNTFLAGS: 2609 { 2610 /* 2611 * Call standard mountctl VOP function 2612 * so we get user mount flags. 2613 */ 2614 error = vop_stdmountctl(ap); 2615 if (error) 2616 break; 2617 2618 usedbytes = *ap->a_res; 2619 2620 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2621 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2622 ap->a_buf, 2623 ap->a_buflen - usedbytes, 2624 &error); 2625 } 2626 2627 *ap->a_res += usedbytes; 2628 break; 2629 } 2630 default: 2631 error = vop_stdmountctl(ap); 2632 break; 2633 } 2634 lwkt_reltoken(&hmp->fs_token); 2635 return(error); 2636 } 2637 2638 /* 2639 * hammer_vop_strategy { vp, bio } 2640 * 2641 * Strategy call, used for regular file read & write only. Note that the 2642 * bp may represent a cluster. 2643 * 2644 * To simplify operation and allow better optimizations in the future, 2645 * this code does not make any assumptions with regards to buffer alignment 2646 * or size. 2647 */ 2648 static 2649 int 2650 hammer_vop_strategy(struct vop_strategy_args *ap) 2651 { 2652 struct buf *bp; 2653 int error; 2654 2655 bp = ap->a_bio->bio_buf; 2656 2657 switch(bp->b_cmd) { 2658 case BUF_CMD_READ: 2659 error = hammer_vop_strategy_read(ap); 2660 break; 2661 case BUF_CMD_WRITE: 2662 error = hammer_vop_strategy_write(ap); 2663 break; 2664 default: 2665 bp->b_error = error = EINVAL; 2666 bp->b_flags |= B_ERROR; 2667 biodone(ap->a_bio); 2668 break; 2669 } 2670 2671 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2672 2673 return (error); 2674 } 2675 2676 /* 2677 * Read from a regular file. Iterate the related records and fill in the 2678 * BIO/BUF. Gaps are zero-filled. 2679 * 2680 * The support code in hammer_object.c should be used to deal with mixed 2681 * in-memory and on-disk records. 2682 * 2683 * NOTE: Can be called from the cluster code with an oversized buf. 2684 * 2685 * XXX atime update 2686 */ 2687 static 2688 int 2689 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2690 { 2691 struct hammer_transaction trans; 2692 struct hammer_inode *ip; 2693 struct hammer_inode *dip; 2694 hammer_mount_t hmp; 2695 struct hammer_cursor cursor; 2696 hammer_base_elm_t base; 2697 hammer_off_t disk_offset; 2698 struct bio *bio; 2699 struct bio *nbio; 2700 struct buf *bp; 2701 int64_t rec_offset; 2702 int64_t ran_end; 2703 int64_t tmp64; 2704 int error; 2705 int boff; 2706 int roff; 2707 int n; 2708 int isdedupable; 2709 2710 bio = ap->a_bio; 2711 bp = bio->bio_buf; 2712 ip = ap->a_vp->v_data; 2713 hmp = ip->hmp; 2714 2715 /* 2716 * The zone-2 disk offset may have been set by the cluster code via 2717 * a BMAP operation, or else should be NOOFFSET. 2718 * 2719 * Checking the high bits for a match against zone-2 should suffice. 2720 * 2721 * In cases where a lot of data duplication is present it may be 2722 * more beneficial to drop through and doubule-buffer through the 2723 * device. 2724 */ 2725 nbio = push_bio(bio); 2726 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2727 HAMMER_ZONE_LARGE_DATA) { 2728 if (hammer_double_buffer == 0) { 2729 lwkt_gettoken(&hmp->fs_token); 2730 error = hammer_io_direct_read(hmp, nbio, NULL); 2731 lwkt_reltoken(&hmp->fs_token); 2732 return (error); 2733 } 2734 2735 /* 2736 * Try to shortcut requests for double_buffer mode too. 2737 * Since this mode runs through the device buffer cache 2738 * only compatible buffer sizes (meaning those generated 2739 * by normal filesystem buffers) are legal. 2740 */ 2741 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) { 2742 error = hammer_io_indirect_read(hmp, nbio, NULL); 2743 return (error); 2744 } 2745 } 2746 2747 /* 2748 * Well, that sucked. Do it the hard way. If all the stars are 2749 * aligned we may still be able to issue a direct-read. 2750 */ 2751 lwkt_gettoken(&hmp->fs_token); 2752 hammer_simple_transaction(&trans, hmp); 2753 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2754 2755 /* 2756 * Key range (begin and end inclusive) to scan. Note that the key's 2757 * stored in the actual records represent BASE+LEN, not BASE. The 2758 * first record containing bio_offset will have a key > bio_offset. 2759 */ 2760 cursor.key_beg.localization = ip->obj_localization + 2761 HAMMER_LOCALIZE_MISC; 2762 cursor.key_beg.obj_id = ip->obj_id; 2763 cursor.key_beg.create_tid = 0; 2764 cursor.key_beg.delete_tid = 0; 2765 cursor.key_beg.obj_type = 0; 2766 cursor.key_beg.key = bio->bio_offset + 1; 2767 cursor.asof = ip->obj_asof; 2768 cursor.flags |= HAMMER_CURSOR_ASOF; 2769 2770 cursor.key_end = cursor.key_beg; 2771 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2772 #if 0 2773 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2774 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2775 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2776 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2777 } else 2778 #endif 2779 { 2780 ran_end = bio->bio_offset + bp->b_bufsize; 2781 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2782 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2783 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2784 if (tmp64 < ran_end) 2785 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2786 else 2787 cursor.key_end.key = ran_end + MAXPHYS + 1; 2788 } 2789 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2790 2791 /* 2792 * Set NOSWAPCACHE for cursor data extraction if double buffering 2793 * is disabled or (if the file is not marked cacheable via chflags 2794 * and vm.swapcache_use_chflags is enabled). 2795 */ 2796 if (hammer_double_buffer == 0 || 2797 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 && 2798 vm_swapcache_use_chflags)) { 2799 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE; 2800 } 2801 2802 error = hammer_ip_first(&cursor); 2803 boff = 0; 2804 2805 while (error == 0) { 2806 /* 2807 * Get the base file offset of the record. The key for 2808 * data records is (base + bytes) rather then (base). 2809 */ 2810 base = &cursor.leaf->base; 2811 rec_offset = base->key - cursor.leaf->data_len; 2812 2813 /* 2814 * Calculate the gap, if any, and zero-fill it. 2815 * 2816 * n is the offset of the start of the record verses our 2817 * current seek offset in the bio. 2818 */ 2819 n = (int)(rec_offset - (bio->bio_offset + boff)); 2820 if (n > 0) { 2821 if (n > bp->b_bufsize - boff) 2822 n = bp->b_bufsize - boff; 2823 bzero((char *)bp->b_data + boff, n); 2824 boff += n; 2825 n = 0; 2826 } 2827 2828 /* 2829 * Calculate the data offset in the record and the number 2830 * of bytes we can copy. 2831 * 2832 * There are two degenerate cases. First, boff may already 2833 * be at bp->b_bufsize. Secondly, the data offset within 2834 * the record may exceed the record's size. 2835 */ 2836 roff = -n; 2837 rec_offset += roff; 2838 n = cursor.leaf->data_len - roff; 2839 if (n <= 0) { 2840 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff); 2841 n = 0; 2842 } else if (n > bp->b_bufsize - boff) { 2843 n = bp->b_bufsize - boff; 2844 } 2845 2846 /* 2847 * Deal with cached truncations. This cool bit of code 2848 * allows truncate()/ftruncate() to avoid having to sync 2849 * the file. 2850 * 2851 * If the frontend is truncated then all backend records are 2852 * subject to the frontend's truncation. 2853 * 2854 * If the backend is truncated then backend records on-disk 2855 * (but not in-memory) are subject to the backend's 2856 * truncation. In-memory records owned by the backend 2857 * represent data written after the truncation point on the 2858 * backend and must not be truncated. 2859 * 2860 * Truncate operations deal with frontend buffer cache 2861 * buffers and frontend-owned in-memory records synchronously. 2862 */ 2863 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2864 if (hammer_cursor_ondisk(&cursor)/* || 2865 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2866 if (ip->trunc_off <= rec_offset) 2867 n = 0; 2868 else if (ip->trunc_off < rec_offset + n) 2869 n = (int)(ip->trunc_off - rec_offset); 2870 } 2871 } 2872 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2873 if (hammer_cursor_ondisk(&cursor)) { 2874 if (ip->sync_trunc_off <= rec_offset) 2875 n = 0; 2876 else if (ip->sync_trunc_off < rec_offset + n) 2877 n = (int)(ip->sync_trunc_off - rec_offset); 2878 } 2879 } 2880 2881 /* 2882 * Try to issue a direct read into our bio if possible, 2883 * otherwise resolve the element data into a hammer_buffer 2884 * and copy. 2885 * 2886 * The buffer on-disk should be zerod past any real 2887 * truncation point, but may not be for any synthesized 2888 * truncation point from above. 2889 * 2890 * NOTE: disk_offset is only valid if the cursor data is 2891 * on-disk. 2892 */ 2893 disk_offset = cursor.leaf->data_offset + roff; 2894 isdedupable = (boff == 0 && n == bp->b_bufsize && 2895 hammer_cursor_ondisk(&cursor) && 2896 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2897 2898 if (isdedupable && hammer_double_buffer == 0) { 2899 /* 2900 * Direct read case 2901 */ 2902 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2903 HAMMER_ZONE_LARGE_DATA); 2904 nbio->bio_offset = disk_offset; 2905 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2906 if (hammer_live_dedup && error == 0) 2907 hammer_dedup_cache_add(ip, cursor.leaf); 2908 goto done; 2909 } else if (isdedupable) { 2910 /* 2911 * Async I/O case for reading from backing store 2912 * and copying the data to the filesystem buffer. 2913 * live-dedup has to verify the data anyway if it 2914 * gets a hit later so we can just add the entry 2915 * now. 2916 */ 2917 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2918 HAMMER_ZONE_LARGE_DATA); 2919 nbio->bio_offset = disk_offset; 2920 if (hammer_live_dedup) 2921 hammer_dedup_cache_add(ip, cursor.leaf); 2922 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf); 2923 goto done; 2924 } else if (n) { 2925 error = hammer_ip_resolve_data(&cursor); 2926 if (error == 0) { 2927 if (hammer_live_dedup && isdedupable) 2928 hammer_dedup_cache_add(ip, cursor.leaf); 2929 bcopy((char *)cursor.data + roff, 2930 (char *)bp->b_data + boff, n); 2931 } 2932 } 2933 if (error) 2934 break; 2935 2936 /* 2937 * We have to be sure that the only elements added to the 2938 * dedup cache are those which are already on-media. 2939 */ 2940 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) 2941 hammer_dedup_cache_add(ip, cursor.leaf); 2942 2943 /* 2944 * Iterate until we have filled the request. 2945 */ 2946 boff += n; 2947 if (boff == bp->b_bufsize) 2948 break; 2949 error = hammer_ip_next(&cursor); 2950 } 2951 2952 /* 2953 * There may have been a gap after the last record 2954 */ 2955 if (error == ENOENT) 2956 error = 0; 2957 if (error == 0 && boff != bp->b_bufsize) { 2958 KKASSERT(boff < bp->b_bufsize); 2959 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2960 /* boff = bp->b_bufsize; */ 2961 } 2962 2963 /* 2964 * Disallow swapcache operation on the vnode buffer if double 2965 * buffering is enabled, the swapcache will get the data via 2966 * the block device buffer. 2967 */ 2968 if (hammer_double_buffer) 2969 bp->b_flags |= B_NOTMETA; 2970 2971 /* 2972 * Cleanup 2973 */ 2974 bp->b_resid = 0; 2975 bp->b_error = error; 2976 if (error) 2977 bp->b_flags |= B_ERROR; 2978 biodone(ap->a_bio); 2979 2980 done: 2981 /* 2982 * Cache the b-tree node for the last data read in cache[1]. 2983 * 2984 * If we hit the file EOF then also cache the node in the 2985 * governing director's cache[3], it will be used to initialize 2986 * the inode's cache[1] for any inodes looked up via the directory. 2987 * 2988 * This doesn't reduce disk accesses since the B-Tree chain is 2989 * likely cached, but it does reduce cpu overhead when looking 2990 * up file offsets for cpdup/tar/cpio style iterations. 2991 */ 2992 if (cursor.node) 2993 hammer_cache_node(&ip->cache[1], cursor.node); 2994 if (ran_end >= ip->ino_data.size) { 2995 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 2996 ip->obj_asof, ip->obj_localization); 2997 if (dip) { 2998 hammer_cache_node(&dip->cache[3], cursor.node); 2999 hammer_rel_inode(dip, 0); 3000 } 3001 } 3002 hammer_done_cursor(&cursor); 3003 hammer_done_transaction(&trans); 3004 lwkt_reltoken(&hmp->fs_token); 3005 return(error); 3006 } 3007 3008 /* 3009 * BMAP operation - used to support cluster_read() only. 3010 * 3011 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 3012 * 3013 * This routine may return EOPNOTSUPP if the opration is not supported for 3014 * the specified offset. The contents of the pointer arguments do not 3015 * need to be initialized in that case. 3016 * 3017 * If a disk address is available and properly aligned return 0 with 3018 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 3019 * to the run-length relative to that offset. Callers may assume that 3020 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 3021 * large, so return EOPNOTSUPP if it is not sufficiently large. 3022 */ 3023 static 3024 int 3025 hammer_vop_bmap(struct vop_bmap_args *ap) 3026 { 3027 struct hammer_transaction trans; 3028 struct hammer_inode *ip; 3029 hammer_mount_t hmp; 3030 struct hammer_cursor cursor; 3031 hammer_base_elm_t base; 3032 int64_t rec_offset; 3033 int64_t ran_end; 3034 int64_t tmp64; 3035 int64_t base_offset; 3036 int64_t base_disk_offset; 3037 int64_t last_offset; 3038 hammer_off_t last_disk_offset; 3039 hammer_off_t disk_offset; 3040 int rec_len; 3041 int error; 3042 int blksize; 3043 3044 ++hammer_stats_file_iopsr; 3045 ip = ap->a_vp->v_data; 3046 hmp = ip->hmp; 3047 3048 /* 3049 * We can only BMAP regular files. We can't BMAP database files, 3050 * directories, etc. 3051 */ 3052 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 3053 return(EOPNOTSUPP); 3054 3055 /* 3056 * bmap is typically called with runp/runb both NULL when used 3057 * for writing. We do not support BMAP for writing atm. 3058 */ 3059 if (ap->a_cmd != BUF_CMD_READ) 3060 return(EOPNOTSUPP); 3061 3062 /* 3063 * Scan the B-Tree to acquire blockmap addresses, then translate 3064 * to raw addresses. 3065 */ 3066 lwkt_gettoken(&hmp->fs_token); 3067 hammer_simple_transaction(&trans, hmp); 3068 #if 0 3069 kprintf("bmap_beg %016llx ip->cache %p\n", 3070 (long long)ap->a_loffset, ip->cache[1]); 3071 #endif 3072 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 3073 3074 /* 3075 * Key range (begin and end inclusive) to scan. Note that the key's 3076 * stored in the actual records represent BASE+LEN, not BASE. The 3077 * first record containing bio_offset will have a key > bio_offset. 3078 */ 3079 cursor.key_beg.localization = ip->obj_localization + 3080 HAMMER_LOCALIZE_MISC; 3081 cursor.key_beg.obj_id = ip->obj_id; 3082 cursor.key_beg.create_tid = 0; 3083 cursor.key_beg.delete_tid = 0; 3084 cursor.key_beg.obj_type = 0; 3085 if (ap->a_runb) 3086 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 3087 else 3088 cursor.key_beg.key = ap->a_loffset + 1; 3089 if (cursor.key_beg.key < 0) 3090 cursor.key_beg.key = 0; 3091 cursor.asof = ip->obj_asof; 3092 cursor.flags |= HAMMER_CURSOR_ASOF; 3093 3094 cursor.key_end = cursor.key_beg; 3095 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 3096 3097 ran_end = ap->a_loffset + MAXPHYS; 3098 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 3099 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 3100 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 3101 if (tmp64 < ran_end) 3102 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 3103 else 3104 cursor.key_end.key = ran_end + MAXPHYS + 1; 3105 3106 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 3107 3108 error = hammer_ip_first(&cursor); 3109 base_offset = last_offset = 0; 3110 base_disk_offset = last_disk_offset = 0; 3111 3112 while (error == 0) { 3113 /* 3114 * Get the base file offset of the record. The key for 3115 * data records is (base + bytes) rather then (base). 3116 * 3117 * NOTE: rec_offset + rec_len may exceed the end-of-file. 3118 * The extra bytes should be zero on-disk and the BMAP op 3119 * should still be ok. 3120 */ 3121 base = &cursor.leaf->base; 3122 rec_offset = base->key - cursor.leaf->data_len; 3123 rec_len = cursor.leaf->data_len; 3124 3125 /* 3126 * Incorporate any cached truncation. 3127 * 3128 * NOTE: Modifications to rec_len based on synthesized 3129 * truncation points remove the guarantee that any extended 3130 * data on disk is zero (since the truncations may not have 3131 * taken place on-media yet). 3132 */ 3133 if (ip->flags & HAMMER_INODE_TRUNCATED) { 3134 if (hammer_cursor_ondisk(&cursor) || 3135 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 3136 if (ip->trunc_off <= rec_offset) 3137 rec_len = 0; 3138 else if (ip->trunc_off < rec_offset + rec_len) 3139 rec_len = (int)(ip->trunc_off - rec_offset); 3140 } 3141 } 3142 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 3143 if (hammer_cursor_ondisk(&cursor)) { 3144 if (ip->sync_trunc_off <= rec_offset) 3145 rec_len = 0; 3146 else if (ip->sync_trunc_off < rec_offset + rec_len) 3147 rec_len = (int)(ip->sync_trunc_off - rec_offset); 3148 } 3149 } 3150 3151 /* 3152 * Accumulate information. If we have hit a discontiguous 3153 * block reset base_offset unless we are already beyond the 3154 * requested offset. If we are, that's it, we stop. 3155 */ 3156 if (error) 3157 break; 3158 if (hammer_cursor_ondisk(&cursor)) { 3159 disk_offset = cursor.leaf->data_offset; 3160 if (rec_offset != last_offset || 3161 disk_offset != last_disk_offset) { 3162 if (rec_offset > ap->a_loffset) 3163 break; 3164 base_offset = rec_offset; 3165 base_disk_offset = disk_offset; 3166 } 3167 last_offset = rec_offset + rec_len; 3168 last_disk_offset = disk_offset + rec_len; 3169 3170 if (hammer_live_dedup) 3171 hammer_dedup_cache_add(ip, cursor.leaf); 3172 } 3173 3174 error = hammer_ip_next(&cursor); 3175 } 3176 3177 #if 0 3178 kprintf("BMAP %016llx: %016llx - %016llx\n", 3179 (long long)ap->a_loffset, 3180 (long long)base_offset, 3181 (long long)last_offset); 3182 kprintf("BMAP %16s: %016llx - %016llx\n", "", 3183 (long long)base_disk_offset, 3184 (long long)last_disk_offset); 3185 #endif 3186 3187 if (cursor.node) { 3188 hammer_cache_node(&ip->cache[1], cursor.node); 3189 #if 0 3190 kprintf("bmap_end2 %016llx ip->cache %p\n", 3191 (long long)ap->a_loffset, ip->cache[1]); 3192 #endif 3193 } 3194 hammer_done_cursor(&cursor); 3195 hammer_done_transaction(&trans); 3196 lwkt_reltoken(&hmp->fs_token); 3197 3198 /* 3199 * If we couldn't find any records or the records we did find were 3200 * all behind the requested offset, return failure. A forward 3201 * truncation can leave a hole w/ no on-disk records. 3202 */ 3203 if (last_offset == 0 || last_offset < ap->a_loffset) 3204 return (EOPNOTSUPP); 3205 3206 /* 3207 * Figure out the block size at the requested offset and adjust 3208 * our limits so the cluster_read() does not create inappropriately 3209 * sized buffer cache buffers. 3210 */ 3211 blksize = hammer_blocksize(ap->a_loffset); 3212 if (hammer_blocksize(base_offset) != blksize) { 3213 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 3214 } 3215 if (last_offset != ap->a_loffset && 3216 hammer_blocksize(last_offset - 1) != blksize) { 3217 last_offset = hammer_blockdemarc(ap->a_loffset, 3218 last_offset - 1); 3219 } 3220 3221 /* 3222 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 3223 * from occuring. 3224 */ 3225 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 3226 3227 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 3228 /* 3229 * Only large-data zones can be direct-IOd 3230 */ 3231 error = EOPNOTSUPP; 3232 } else if ((disk_offset & HAMMER_BUFMASK) || 3233 (last_offset - ap->a_loffset) < blksize) { 3234 /* 3235 * doffsetp is not aligned or the forward run size does 3236 * not cover a whole buffer, disallow the direct I/O. 3237 */ 3238 error = EOPNOTSUPP; 3239 } else { 3240 /* 3241 * We're good. 3242 */ 3243 *ap->a_doffsetp = disk_offset; 3244 if (ap->a_runb) { 3245 *ap->a_runb = ap->a_loffset - base_offset; 3246 KKASSERT(*ap->a_runb >= 0); 3247 } 3248 if (ap->a_runp) { 3249 *ap->a_runp = last_offset - ap->a_loffset; 3250 KKASSERT(*ap->a_runp >= 0); 3251 } 3252 error = 0; 3253 } 3254 return(error); 3255 } 3256 3257 /* 3258 * Write to a regular file. Because this is a strategy call the OS is 3259 * trying to actually get data onto the media. 3260 */ 3261 static 3262 int 3263 hammer_vop_strategy_write(struct vop_strategy_args *ap) 3264 { 3265 hammer_record_t record; 3266 hammer_mount_t hmp; 3267 hammer_inode_t ip; 3268 struct bio *bio; 3269 struct buf *bp; 3270 int blksize; 3271 int bytes; 3272 int error; 3273 3274 bio = ap->a_bio; 3275 bp = bio->bio_buf; 3276 ip = ap->a_vp->v_data; 3277 hmp = ip->hmp; 3278 3279 blksize = hammer_blocksize(bio->bio_offset); 3280 KKASSERT(bp->b_bufsize == blksize); 3281 3282 if (ip->flags & HAMMER_INODE_RO) { 3283 bp->b_error = EROFS; 3284 bp->b_flags |= B_ERROR; 3285 biodone(ap->a_bio); 3286 return(EROFS); 3287 } 3288 3289 lwkt_gettoken(&hmp->fs_token); 3290 3291 /* 3292 * Disallow swapcache operation on the vnode buffer if double 3293 * buffering is enabled, the swapcache will get the data via 3294 * the block device buffer. 3295 */ 3296 if (hammer_double_buffer) 3297 bp->b_flags |= B_NOTMETA; 3298 3299 /* 3300 * Interlock with inode destruction (no in-kernel or directory 3301 * topology visibility). If we queue new IO while trying to 3302 * destroy the inode we can deadlock the vtrunc call in 3303 * hammer_inode_unloadable_check(). 3304 * 3305 * Besides, there's no point flushing a bp associated with an 3306 * inode that is being destroyed on-media and has no kernel 3307 * references. 3308 */ 3309 if ((ip->flags | ip->sync_flags) & 3310 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 3311 bp->b_resid = 0; 3312 biodone(ap->a_bio); 3313 lwkt_reltoken(&hmp->fs_token); 3314 return(0); 3315 } 3316 3317 /* 3318 * Reserve space and issue a direct-write from the front-end. 3319 * NOTE: The direct_io code will hammer_bread/bcopy smaller 3320 * allocations. 3321 * 3322 * An in-memory record will be installed to reference the storage 3323 * until the flusher can get to it. 3324 * 3325 * Since we own the high level bio the front-end will not try to 3326 * do a direct-read until the write completes. 3327 * 3328 * NOTE: The only time we do not reserve a full-sized buffers 3329 * worth of data is if the file is small. We do not try to 3330 * allocate a fragment (from the small-data zone) at the end of 3331 * an otherwise large file as this can lead to wildly separated 3332 * data. 3333 */ 3334 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 3335 KKASSERT(bio->bio_offset < ip->ino_data.size); 3336 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2) 3337 bytes = bp->b_bufsize; 3338 else 3339 bytes = ((int)ip->ino_data.size + 15) & ~15; 3340 3341 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 3342 bytes, &error); 3343 3344 /* 3345 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated 3346 * in hammer_vop_write(). We must flag the record so the proper 3347 * REDO_TERM_WRITE entry is generated during the flush. 3348 */ 3349 if (record) { 3350 if (bp->b_flags & B_VFSFLAG1) { 3351 record->flags |= HAMMER_RECF_REDO; 3352 bp->b_flags &= ~B_VFSFLAG1; 3353 } 3354 if (record->flags & HAMMER_RECF_DEDUPED) { 3355 bp->b_resid = 0; 3356 hammer_ip_replace_bulk(hmp, record); 3357 biodone(ap->a_bio); 3358 } else { 3359 hammer_io_direct_write(hmp, bio, record); 3360 } 3361 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 3362 hammer_flush_inode(ip, 0); 3363 } else { 3364 bp->b_bio2.bio_offset = NOOFFSET; 3365 bp->b_error = error; 3366 bp->b_flags |= B_ERROR; 3367 biodone(ap->a_bio); 3368 } 3369 lwkt_reltoken(&hmp->fs_token); 3370 return(error); 3371 } 3372 3373 /* 3374 * dounlink - disconnect a directory entry 3375 * 3376 * XXX whiteout support not really in yet 3377 */ 3378 static int 3379 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 3380 struct vnode *dvp, struct ucred *cred, 3381 int flags, int isdir) 3382 { 3383 struct namecache *ncp; 3384 hammer_inode_t dip; 3385 hammer_inode_t ip; 3386 hammer_mount_t hmp; 3387 struct hammer_cursor cursor; 3388 int64_t namekey; 3389 u_int32_t max_iterations; 3390 int nlen, error; 3391 3392 /* 3393 * Calculate the namekey and setup the key range for the scan. This 3394 * works kinda like a chained hash table where the lower 32 bits 3395 * of the namekey synthesize the chain. 3396 * 3397 * The key range is inclusive of both key_beg and key_end. 3398 */ 3399 dip = VTOI(dvp); 3400 ncp = nch->ncp; 3401 hmp = dip->hmp; 3402 3403 if (dip->flags & HAMMER_INODE_RO) 3404 return (EROFS); 3405 3406 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3407 &max_iterations); 3408 retry: 3409 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3410 cursor.key_beg.localization = dip->obj_localization + 3411 hammer_dir_localization(dip); 3412 cursor.key_beg.obj_id = dip->obj_id; 3413 cursor.key_beg.key = namekey; 3414 cursor.key_beg.create_tid = 0; 3415 cursor.key_beg.delete_tid = 0; 3416 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3417 cursor.key_beg.obj_type = 0; 3418 3419 cursor.key_end = cursor.key_beg; 3420 cursor.key_end.key += max_iterations; 3421 cursor.asof = dip->obj_asof; 3422 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3423 3424 /* 3425 * Scan all matching records (the chain), locate the one matching 3426 * the requested path component. info->last_error contains the 3427 * error code on search termination and could be 0, ENOENT, or 3428 * something else. 3429 * 3430 * The hammer_ip_*() functions merge in-memory records with on-disk 3431 * records for the purposes of the search. 3432 */ 3433 error = hammer_ip_first(&cursor); 3434 3435 while (error == 0) { 3436 error = hammer_ip_resolve_data(&cursor); 3437 if (error) 3438 break; 3439 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3440 KKASSERT(nlen > 0); 3441 if (ncp->nc_nlen == nlen && 3442 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3443 break; 3444 } 3445 error = hammer_ip_next(&cursor); 3446 } 3447 3448 /* 3449 * If all is ok we have to get the inode so we can adjust nlinks. 3450 * To avoid a deadlock with the flusher we must release the inode 3451 * lock on the directory when acquiring the inode for the entry. 3452 * 3453 * If the target is a directory, it must be empty. 3454 */ 3455 if (error == 0) { 3456 hammer_unlock(&cursor.ip->lock); 3457 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3458 hmp->asof, 3459 cursor.data->entry.localization, 3460 0, &error); 3461 hammer_lock_sh(&cursor.ip->lock); 3462 if (error == ENOENT) { 3463 kprintf("HAMMER: WARNING: Removing " 3464 "dirent w/missing inode \"%s\"\n" 3465 "\tobj_id = %016llx\n", 3466 ncp->nc_name, 3467 (long long)cursor.data->entry.obj_id); 3468 error = 0; 3469 } 3470 3471 /* 3472 * If isdir >= 0 we validate that the entry is or is not a 3473 * directory. If isdir < 0 we don't care. 3474 */ 3475 if (error == 0 && isdir >= 0 && ip) { 3476 if (isdir && 3477 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3478 error = ENOTDIR; 3479 } else if (isdir == 0 && 3480 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3481 error = EISDIR; 3482 } 3483 } 3484 3485 /* 3486 * If we are trying to remove a directory the directory must 3487 * be empty. 3488 * 3489 * The check directory code can loop and deadlock/retry. Our 3490 * own cursor's node locks must be released to avoid a 3-way 3491 * deadlock with the flusher if the check directory code 3492 * blocks. 3493 * 3494 * If any changes whatsoever have been made to the cursor 3495 * set EDEADLK and retry. 3496 * 3497 * WARNING: See warnings in hammer_unlock_cursor() 3498 * function. 3499 */ 3500 if (error == 0 && ip && ip->ino_data.obj_type == 3501 HAMMER_OBJTYPE_DIRECTORY) { 3502 hammer_unlock_cursor(&cursor); 3503 error = hammer_ip_check_directory_empty(trans, ip); 3504 hammer_lock_cursor(&cursor); 3505 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3506 kprintf("HAMMER: Warning: avoided deadlock " 3507 "on rmdir '%s'\n", 3508 ncp->nc_name); 3509 error = EDEADLK; 3510 } 3511 } 3512 3513 /* 3514 * Delete the directory entry. 3515 * 3516 * WARNING: hammer_ip_del_directory() may have to terminate 3517 * the cursor to avoid a deadlock. It is ok to call 3518 * hammer_done_cursor() twice. 3519 */ 3520 if (error == 0) { 3521 error = hammer_ip_del_directory(trans, &cursor, 3522 dip, ip); 3523 } 3524 hammer_done_cursor(&cursor); 3525 if (error == 0) { 3526 cache_setunresolved(nch); 3527 cache_setvp(nch, NULL); 3528 3529 /* 3530 * NOTE: ip->vp, if non-NULL, cannot be directly 3531 * referenced without formally acquiring the 3532 * vp since the vp might have zero refs on it, 3533 * or in the middle of a reclaim, etc. 3534 * 3535 * NOTE: The cache_setunresolved() can rip the vp 3536 * out from under us since the vp may not have 3537 * any refs, in which case ip->vp will be NULL 3538 * from the outset. 3539 */ 3540 while (ip && ip->vp) { 3541 struct vnode *vp; 3542 3543 error = hammer_get_vnode(ip, &vp); 3544 if (error == 0 && vp) { 3545 vn_unlock(vp); 3546 hammer_knote(ip->vp, NOTE_DELETE); 3547 cache_inval_vp(ip->vp, CINV_DESTROY); 3548 vrele(vp); 3549 break; 3550 } 3551 kprintf("Debug: HAMMER ip/vp race1 avoided\n"); 3552 } 3553 } 3554 if (ip) 3555 hammer_rel_inode(ip, 0); 3556 } else { 3557 hammer_done_cursor(&cursor); 3558 } 3559 if (error == EDEADLK) 3560 goto retry; 3561 3562 return (error); 3563 } 3564 3565 /************************************************************************ 3566 * FIFO AND SPECFS OPS * 3567 ************************************************************************ 3568 * 3569 */ 3570 static int 3571 hammer_vop_fifoclose (struct vop_close_args *ap) 3572 { 3573 /* XXX update itimes */ 3574 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3575 } 3576 3577 static int 3578 hammer_vop_fiforead (struct vop_read_args *ap) 3579 { 3580 int error; 3581 3582 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3583 /* XXX update access time */ 3584 return (error); 3585 } 3586 3587 static int 3588 hammer_vop_fifowrite (struct vop_write_args *ap) 3589 { 3590 int error; 3591 3592 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3593 /* XXX update access time */ 3594 return (error); 3595 } 3596 3597 static 3598 int 3599 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3600 { 3601 int error; 3602 3603 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3604 if (error) 3605 error = hammer_vop_kqfilter(ap); 3606 return(error); 3607 } 3608 3609 /************************************************************************ 3610 * KQFILTER OPS * 3611 ************************************************************************ 3612 * 3613 */ 3614 static void filt_hammerdetach(struct knote *kn); 3615 static int filt_hammerread(struct knote *kn, long hint); 3616 static int filt_hammerwrite(struct knote *kn, long hint); 3617 static int filt_hammervnode(struct knote *kn, long hint); 3618 3619 static struct filterops hammerread_filtops = 3620 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread }; 3621 static struct filterops hammerwrite_filtops = 3622 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite }; 3623 static struct filterops hammervnode_filtops = 3624 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode }; 3625 3626 static 3627 int 3628 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3629 { 3630 struct vnode *vp = ap->a_vp; 3631 struct knote *kn = ap->a_kn; 3632 3633 switch (kn->kn_filter) { 3634 case EVFILT_READ: 3635 kn->kn_fop = &hammerread_filtops; 3636 break; 3637 case EVFILT_WRITE: 3638 kn->kn_fop = &hammerwrite_filtops; 3639 break; 3640 case EVFILT_VNODE: 3641 kn->kn_fop = &hammervnode_filtops; 3642 break; 3643 default: 3644 return (EOPNOTSUPP); 3645 } 3646 3647 kn->kn_hook = (caddr_t)vp; 3648 3649 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3650 3651 return(0); 3652 } 3653 3654 static void 3655 filt_hammerdetach(struct knote *kn) 3656 { 3657 struct vnode *vp = (void *)kn->kn_hook; 3658 3659 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3660 } 3661 3662 static int 3663 filt_hammerread(struct knote *kn, long hint) 3664 { 3665 struct vnode *vp = (void *)kn->kn_hook; 3666 hammer_inode_t ip = VTOI(vp); 3667 hammer_mount_t hmp = ip->hmp; 3668 off_t off; 3669 3670 if (hint == NOTE_REVOKE) { 3671 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3672 return(1); 3673 } 3674 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3675 off = ip->ino_data.size - kn->kn_fp->f_offset; 3676 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3677 lwkt_reltoken(&hmp->fs_token); 3678 if (kn->kn_sfflags & NOTE_OLDAPI) 3679 return(1); 3680 return (kn->kn_data != 0); 3681 } 3682 3683 static int 3684 filt_hammerwrite(struct knote *kn, long hint) 3685 { 3686 if (hint == NOTE_REVOKE) 3687 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3688 kn->kn_data = 0; 3689 return (1); 3690 } 3691 3692 static int 3693 filt_hammervnode(struct knote *kn, long hint) 3694 { 3695 if (kn->kn_sfflags & hint) 3696 kn->kn_fflags |= hint; 3697 if (hint == NOTE_REVOKE) { 3698 kn->kn_flags |= (EV_EOF | EV_NODATA); 3699 return (1); 3700 } 3701 return (kn->kn_fflags != 0); 3702 } 3703 3704